class AIRL(SingleTimestepIRL): """ Fits advantage function based reward functions """ def __init__(self, env, expert_trajs=None, discrim_arch=relu_net, discrim_arch_args={}, normalize_reward=False, score_dtau=False, init_itrs=None, discount=1.0, l2_reg=0, state_only=False, shaping_with_actions=False, max_itrs=100, fusion=False, fusion_subsample=0.5, action_penalty=0.0, name='trajprior'): super(AIRL, self).__init__() env_spec = env.spec if fusion: self.fusion = RamFusionDistr(100, subsample_ratio=fusion_subsample) else: self.fusion = None self.dO = env_spec.observation_space.flat_dim self.dU = env_spec.action_space.flat_dim if isinstance(env.action_space, Box): self.continuous = True else: self.continuous = False self.normalize_reward = normalize_reward self.score_dtau = score_dtau self.init_itrs = init_itrs self.gamma = discount #assert fitted_value_fn_arch is not None self.set_demos(expert_trajs) self.state_only = state_only self.max_itrs = max_itrs # build energy model with tf.variable_scope(name) as _vs: # Should be batch_size x T x dO/dU self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs') self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs') self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act') self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact') self.labels = tf.placeholder(tf.float32, [None, 1], name='labels') self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs') self.lr = tf.placeholder(tf.float32, (), name='lr') #obs_act = tf.concat([self.obs_t, self.act_t], axis=1) with tf.variable_scope('discrim') as dvs: if self.state_only: with tf.variable_scope('energy') as vs: # reward function (or q-function) self.energy = discrim_arch(self.obs_t, dout=1, **discrim_arch_args) energy_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) else: if self.continuous: obs_act = tf.concat([self.obs_t, self.act_t], axis=1) with tf.variable_scope('energy') as vs: # reward function (or q-function) self.energy = discrim_arch(obs_act, dout=1, **discrim_arch_args) energy_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) else: raise ValueError() if shaping_with_actions: nobs_act = tf.concat([self.nobs_t, self.nact_t], axis=1) obs_act = tf.concat([self.obs_t, self.act_t], axis=1) else: nobs_act = self.nobs_t obs_act = self.obs_t # with tf.variable_scope('vfn'): # fitted_value_fn_n = fitted_value_fn_arch(nobs_act, dout=1) # with tf.variable_scope('vfn', reuse=True): # self.value_fn = fitted_value_fn = fitted_value_fn_arch(obs_act, dout=1) self.value_fn = tf.zeros(shape=[]) # Define log p_tau(a|s) = r + gamma * V(s') - V(s) if action_penalty > 0: self.r = r = -self.energy + action_penalty * tf.reduce_sum( tf.square(self.act_t), axis=1, keepdims=True) else: self.r = r = -self.energy self.qfn = r #+self.gamma*fitted_value_fn_n log_p_tau = r # + self.gamma*fitted_value_fn_n - fitted_value_fn discrim_vars = tf.get_collection('reg_vars', scope=dvs.name) log_q_tau = self.lprobs if l2_reg > 0: reg_loss = l2_reg * tf.reduce_sum( [tf.reduce_sum(tf.square(var)) for var in discrim_vars]) else: reg_loss = 0 log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0) self.d_tau = tf.exp(log_p_tau - log_pq) cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) + (1 - self.labels) * (log_q_tau - log_pq)) self.loss = cent_loss tot_loss = self.loss + reg_loss self.step = tf.train.AdamOptimizer( learning_rate=self.lr).minimize(tot_loss) self._make_param_ops(_vs) def fit(self, paths, 
policy=None, batch_size=32, logger=None, lr=1e-3, last_timestep_only=False, **kwargs): if self.fusion is not None: old_paths = self.fusion.sample_paths(n=len(paths)) self.fusion.add_paths(paths) paths = paths + old_paths self._compute_path_probs(paths, insert=True) #self.eval_expert_probs(paths, policy, insert=True) for traj in self.expert_trajs: if 'agent_infos' in traj: #print('deleting agent_infos') del traj['agent_infos'] del traj['a_logprobs'] self.eval_expert_probs(self.expert_trajs, policy, insert=True) self._insert_next_state(paths) self._insert_next_state(self.expert_trajs) obs, obs_next, acts, acts_next, path_probs = self.extract_paths( paths, keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs'), last_timestep_only=last_timestep_only) expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = self.extract_paths( self.expert_trajs, keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs'), last_timestep_only=last_timestep_only) # Train discriminator for it in TrainingIterator(self.max_itrs, heartbeat=5): nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \ self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size) nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \ self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs, batch_size=batch_size) labels = np.zeros((batch_size * 2, 1)) labels[batch_size:] = 1.0 obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0) nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0) act_batch = np.concatenate([act_batch, expert_act_batch], axis=0) nact_batch = np.concatenate([nact_batch, nexpert_act_batch], axis=0) lprobs_batch = np.expand_dims(np.concatenate( [lprobs_batch, expert_lprobs_batch], axis=0), axis=1).astype(np.float32) feed_dict = { self.act_t: act_batch, self.obs_t: obs_batch, self.nobs_t: nobs_batch, self.nact_t: nact_batch, self.labels: labels, self.lprobs: lprobs_batch, self.lr: lr } loss, _ = tf.get_default_session().run([self.loss, self.step], feed_dict=feed_dict) it.record('loss', loss) if it.heartbeat: print(it.itr_message()) mean_loss = it.pop_mean('loss') print('\tLoss:%f' % mean_loss) if logger: logger.record_tabular('GCLDiscrimLoss', mean_loss) #obs_next = np.r_[obs_next, np.expand_dims(obs_next[-1], axis=0)] #logZ, energy, logZ, dtau = tf.get_default_session().run( [self.energy, self.value_fn, self.d_tau], feed_dict={ self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.nact_t: acts_next, self.lprobs: np.expand_dims(path_probs, axis=1) }) logger.record_tabular('GCLLogZ', np.mean(logZ)) logger.record_tabular('GCLAverageEnergy', np.mean(energy)) logger.record_tabular('GCLAverageLogPtau', np.mean(-energy - logZ)) logger.record_tabular('GCLAverageLogQtau', np.mean(path_probs)) logger.record_tabular('GCLMedianLogQtau', np.median(path_probs)) logger.record_tabular('GCLAverageDtau', np.mean(dtau)) #expert_obs_next = np.r_[expert_obs_next, np.expand_dims(expert_obs_next[-1], axis=0)] energy, logZ, dtau = tf.get_default_session().run( [self.energy, self.value_fn, self.d_tau], feed_dict={ self.act_t: expert_acts, self.obs_t: expert_obs, self.nobs_t: expert_obs_next, self.nact_t: expert_acts_next, self.lprobs: np.expand_dims(expert_probs, axis=1) }) logger.record_tabular('GCLAverageExpertEnergy', np.mean(energy)) logger.record_tabular('GCLAverageExpertLogPtau', np.mean(-energy - logZ)) 
            logger.record_tabular('GCLAverageExpertLogQtau',
                                  np.mean(expert_probs))
            logger.record_tabular('GCLMedianExpertLogQtau',
                                  np.median(expert_probs))
            logger.record_tabular('GCLAverageExpertDtau', np.mean(dtau))
        return mean_loss

    def eval(self, paths, gamma=1.0, **kwargs):
        """
        Return bonus
        """
        if self.score_dtau:
            self._compute_path_probs(paths, insert=True)
            obs, obs_next, acts, path_probs = self.extract_paths(
                paths,
                keys=('observations', 'observations_next', 'actions',
                      'a_logprobs'))
            path_probs = np.expand_dims(path_probs, axis=1)
            scores = tf.get_default_session().run(self.d_tau,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs,
                                                      self.nobs_t: obs_next,
                                                      self.lprobs: path_probs
                                                  })
            score = np.log(scores) - np.log(1 - scores)
            score = score[:, 0]
        else:
            obs, acts = self.extract_paths(paths)
            energy = tf.get_default_session().run(self.energy,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs
                                                  })
            score = (-energy)[:, 0]
        return self.unpack(score, paths)

    def eval_discrim(self, paths):
        self._compute_path_probs(paths, insert=True)
        obs, obs_next, acts, path_probs = self.extract_paths(
            paths,
            keys=('observations', 'observations_next', 'actions',
                  'a_logprobs'))
        path_probs = np.expand_dims(path_probs, axis=1)
        scores = tf.get_default_session().run(self.d_tau,
                                              feed_dict={
                                                  self.act_t: acts,
                                                  self.obs_t: obs,
                                                  self.nobs_t: obs_next,
                                                  self.lprobs: path_probs
                                              })
        return self.unpack(scores[:, 0], paths)

    def eval_single(self, obs):
        energy = tf.get_default_session().run(self.energy,
                                              feed_dict={self.obs_t: obs})
        score = (-energy)[:, 0]
        return score

    def debug_eval(self, paths, **kwargs):
        obs, acts = self.extract_paths(paths)
        energy, v, qfn = tf.get_default_session().run(
            [self.energy, self.value_fn, self.qfn],
            feed_dict={
                self.act_t: acts,
                self.obs_t: obs
            })
        return {
            'reward': -energy,
            'value': v,
            'qfn': qfn,
        }
class AIRL(SingleTimestepIRL): """ Args: fusion (bool): Use trajectories from old iterations to train. state_only (bool): Fix the learned reward to only depend on state. score_discrim (bool): Use log D - log 1-D as reward (if true you should not need to use an entropy bonus) max_itrs (int): Number of training iterations to run per fit step. """ def __init__(self, env, expert_trajs=None, reward_arch=relu_net, reward_arch_args=None, value_fn_arch=relu_net, score_discrim=False, discount=1.0, state_only=False, max_itrs=100, fusion=False, name='airl'): super(AIRL, self).__init__() env_spec = env.spec if reward_arch_args is None: reward_arch_args = {} if fusion: self.fusion = RamFusionDistr(100, subsample_ratio=0.5) else: self.fusion = None self.dO = env_spec.observation_space.flat_dim self.dU = env_spec.action_space.flat_dim assert isinstance(env.action_space, Box) self.score_discrim = score_discrim self.gamma = discount assert value_fn_arch is not None self.set_demos(expert_trajs) self.state_only=state_only self.max_itrs=max_itrs # build energy model with tf.variable_scope(name) as _vs: # Should be batch_size x T x dO/dU self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs') self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs') self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act') self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact') self.labels = tf.placeholder(tf.float32, [None, 1], name='labels') self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs') self.lr = tf.placeholder(tf.float32, (), name='lr') with tf.variable_scope('discrim') as dvs: rew_input = self.obs_t if not self.state_only: rew_input = tf.concat([self.obs_t, self.act_t], axis=1) with tf.variable_scope('reward'): self.reward = reward_arch(rew_input, dout=1, **reward_arch_args) #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) # value function shaping with tf.variable_scope('vfn'): fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1) with tf.variable_scope('vfn', reuse=True): self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t, dout=1) # Define log p_tau(a|s) = r + gamma * V(s') - V(s) self.qfn = self.reward + self.gamma*fitted_value_fn_n log_p_tau = self.reward + self.gamma*fitted_value_fn_n - fitted_value_fn log_q_tau = self.lprobs log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0) self.discrim_output = tf.exp(log_p_tau-log_pq) cent_loss = -tf.reduce_mean(self.labels*(log_p_tau-log_pq) + (1-self.labels)*(log_q_tau-log_pq)) self.loss = cent_loss tot_loss = self.loss self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(tot_loss) self._make_param_ops(_vs) def fit(self, paths, policy=None, batch_size=32, logger=None, lr=1e-3,**kwargs): if self.fusion is not None: old_paths = self.fusion.sample_paths(n=len(paths)) self.fusion.add_paths(paths) paths = paths+old_paths # eval samples under current policy self._compute_path_probs(paths, insert=True) # eval expert log probs under current policy self.eval_expert_probs(self.expert_trajs, policy, insert=True) self._insert_next_state(paths) self._insert_next_state(self.expert_trajs) obs, obs_next, acts, acts_next, path_probs = \ self.extract_paths(paths, keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs')) expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = \ self.extract_paths(self.expert_trajs, keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs')) # Train 
discriminator for it in TrainingIterator(self.max_itrs, heartbeat=5): nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \ self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size) nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \ self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs, batch_size=batch_size) # Build feed dict labels = np.zeros((batch_size*2, 1)) labels[batch_size:] = 1.0 obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0) nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0) act_batch = np.concatenate([act_batch, expert_act_batch], axis=0) nact_batch = np.concatenate([nact_batch, nexpert_act_batch], axis=0) lprobs_batch = np.expand_dims(np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0), axis=1).astype(np.float32) feed_dict = { self.act_t: act_batch, self.obs_t: obs_batch, self.nobs_t: nobs_batch, self.nact_t: nact_batch, self.labels: labels, self.lprobs: lprobs_batch, self.lr: lr } loss, _ = tf.get_default_session().run([self.loss, self.step], feed_dict=feed_dict) it.record('loss', loss) if it.heartbeat: print(it.itr_message()) mean_loss = it.pop_mean('loss') print('\tLoss:%f' % mean_loss) if logger: logger.record_tabular('GCLDiscrimLoss', mean_loss) #obs_next = np.r_[obs_next, np.expand_dims(obs_next[-1], axis=0)] energy, logZ, dtau = tf.get_default_session().run([self.reward, self.value_fn, self.discrim_output], feed_dict={self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.nact_t: acts_next, self.lprobs: np.expand_dims(path_probs, axis=1)}) energy = -energy logger.record_tabular('GCLLogZ', np.mean(logZ)) logger.record_tabular('GCLAverageEnergy', np.mean(energy)) logger.record_tabular('GCLAverageLogPtau', np.mean(-energy-logZ)) logger.record_tabular('GCLAverageLogQtau', np.mean(path_probs)) logger.record_tabular('GCLMedianLogQtau', np.median(path_probs)) logger.record_tabular('GCLAverageDtau', np.mean(dtau)) #expert_obs_next = np.r_[expert_obs_next, np.expand_dims(expert_obs_next[-1], axis=0)] energy, logZ, dtau = tf.get_default_session().run([self.reward, self.value_fn, self.discrim_output], feed_dict={self.act_t: expert_acts, self.obs_t: expert_obs, self.nobs_t: expert_obs_next, self.nact_t: expert_acts_next, self.lprobs: np.expand_dims(expert_probs, axis=1)}) energy = -energy logger.record_tabular('GCLAverageExpertEnergy', np.mean(energy)) logger.record_tabular('GCLAverageExpertLogPtau', np.mean(-energy-logZ)) logger.record_tabular('GCLAverageExpertLogQtau', np.mean(expert_probs)) logger.record_tabular('GCLMedianExpertLogQtau', np.median(expert_probs)) logger.record_tabular('GCLAverageExpertDtau', np.mean(dtau)) return mean_loss def eval(self, paths, **kwargs): """ Return bonus """ if self.score_discrim: self._compute_path_probs(paths, insert=True) obs, obs_next, acts, path_probs = self.extract_paths(paths, keys=('observations', 'observations_next', 'actions', 'a_logprobs')) path_probs = np.expand_dims(path_probs, axis=1) scores = tf.get_default_session().run(self.discrim_output, feed_dict={self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.lprobs: path_probs}) score = np.log(np.maximum(scores,1e-8)) - np.log(np.maximum(1 - scores,1e-8)) score = score[:,0] else: obs, acts = self.extract_paths(paths) reward = tf.get_default_session().run(self.reward, feed_dict={self.act_t: acts, self.obs_t: obs}) score = reward[:,0] return self.unpack(score, paths) def eval_single(self, obs): reward = 
tf.get_default_session().run(self.reward, feed_dict={self.obs_t: obs}) score = reward[:, 0] return score def debug_eval(self, paths, **kwargs): obs, acts = self.extract_paths(paths) reward, v, qfn = tf.get_default_session().run([self.reward, self.value_fn, self.qfn], feed_dict={self.act_t: acts, self.obs_t: obs}) return { 'reward': reward, 'value': v, 'qfn': qfn, }
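

# Usage sketch (an assumption for illustration, not part of the original training code):
# how the AIRL discriminator above would typically be driven from an outer IRL loop.
# Assumes an rllab-style `env` exposing `env.spec`, an initialized default TF session,
# expert trajectories given as dicts with 'observations'/'actions', and sampled `paths`
# that carry 'agent_infos' so action log-probabilities can be computed.
def _example_airl_update(env, expert_trajs, policy, paths):
    irl_model = AIRL(env, expert_trajs=expert_trajs, state_only=True, fusion=True)
    mean_loss = irl_model.fit(paths, policy=policy, batch_size=32, lr=1e-3)
    # eval() returns the learned per-timestep rewards, unpacked per path.
    rewards_per_path = irl_model.eval(paths)
    return mean_loss, rewards_per_path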


class Empowerment(SingleTimestepIRL):
    """
    State-dependent empowerment estimate, trained so that
    empowerment(s) + log pi(a|s) regresses onto log q(a|s, s').

    Args:
        emp_fn_arch: Network architecture for the empowerment function.
        fusion (bool): Use trajectories from old iterations to train.
        max_itrs (int): Number of training iterations to run per fit step.
    """
    def __init__(self,
                 env,
                 emp_fn_arch=relu_net,  # _dropout
                 scope='efn',
                 max_itrs=100,
                 fusion=False,
                 name='empowerment'):
        super(Empowerment, self).__init__()
        env_spec = env.spec
        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        assert emp_fn_arch is not None
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
            self.act_qvar = tf.placeholder(tf.float32, [None, 1], name='act_qvar')
            self.act_policy = tf.placeholder(tf.float32, [None, 1], name='act_policy')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            # empowerment function
            with tf.variable_scope(scope):
                self.empowerment = emp_fn_arch(self.obs_t, dout=1)

            cent_loss = tf.losses.mean_squared_error(
                predictions=(self.empowerment + self.act_policy),
                labels=self.act_qvar)

            self.loss_emp = cent_loss
            tot_loss_emp = self.loss_emp
            self.step_emp = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss_emp)

            self._make_param_ops(_vs)

    def fit(self,
            paths,
            irl_model=None,
            tempw=None,
            policy=None,
            qvar_model=None,
            batch_size=32,
            logger=None,
            lr=1e-3,
            **kwargs):
        if self.fusion is not None:
            old_paths = self.fusion.sample_paths(n=len(paths))
            self.fusion.add_paths(paths)
            paths = paths + old_paths

        #self._insert_next_state(paths)
        obs, ac, obs_next = self.extract_paths(
            paths, keys=('observations', 'actions', 'observations_next'))

        for it in TrainingIterator(self.max_itrs, heartbeat=1):
            nobs_batch, obs_batch, ac_batch = self.sample_batch(obs_next, obs, ac)

            # log pi(a|s) for actions sampled from the current policy
            dist_info_vars = policy.dist_info_sym(obs_batch, None)
            dist_s = policy.distribution.log_likelihood(
                policy.distribution.sample(dist_info_vars), dist_info_vars)

            # log q(a|s, s') for actions sampled from the inverse model
            q_input = tf.concat([obs_batch, nobs_batch], axis=1)
            q_dist_info_vars = qvar_model.dist_info_sym(q_input, None)
            q_dist_s = policy.distribution.log_likelihood(
                policy.distribution.sample(q_dist_info_vars), q_dist_info_vars)

            # Build feed dict
            feed_dict = {
                self.obs_t: obs_batch,
                self.act_qvar: q_dist_s.eval(),
                self.act_policy: dist_s.eval(),
                self.lr: lr
            }

            loss_emp, _ = tf.get_default_session().run(
                [self.loss_emp, self.step_emp], feed_dict=feed_dict)
            it.record('loss_emp', loss_emp)
            if it.heartbeat:
                print(it.itr_message())
                mean_loss_emp = it.pop_mean('loss_emp')
                print('\tLoss_emp:%f' % mean_loss_emp)

        return mean_loss_emp

    def eval(self, obs, **kwargs):
        """
        Return bonus
        """
        empw = tf.get_default_session().run(self.empowerment,
                                            feed_dict={self.obs_t: obs})
        return empw
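

# Usage sketch (assumption, hypothetical helper): a single Empowerment regression step.
# Requires an initialized default TF session, an rllab-style Gaussian `policy`, and a
# trained inverse model `qvar_model` exposing dist_info_sym, exactly as Empowerment.fit
# above expects. The fit regresses empowerment(s) + log pi(a|s) onto log q(a|s, s').
def _example_empowerment_update(env, paths, policy, qvar_model):
    empw_model = Empowerment(env, max_itrs=100, fusion=True)
    return empw_model.fit(paths, policy=policy, qvar_model=qvar_model, lr=1e-3)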


class Qvar(SingleTimestepIRL):
    """
    Inverse action model q(a|s, s'), trained by regressing the predicted
    action mean onto the action actually taken.

    Args:
        qvar: Policy-style model providing dist_info_sym for q(a|s, s').
        fusion (bool): Use trajectories from old iterations to train.
        max_itrs (int): Number of training iterations to run per fit step.
    """
    def __init__(self,
                 env,
                 expert_trajs=None,
                 qvar=None,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='qvar'):
        super(Qvar, self).__init__()
        env_spec = env.spec
        if qvar is not None:
            self.qvar = qvar
        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        self.set_demos(expert_trajs)
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('q_var') as dvs:
                q_input = tf.concat([self.obs_t, self.nobs_t], axis=1)
                self.act_predicted = self.qvar.dist_info_sym(q_input, None)
                self.loss_q = tf.losses.mean_squared_error(
                    predictions=self.act_predicted["mean"], labels=self.act_t)
                tot_loss_q = self.loss_q
                self.step_q = tf.train.AdamOptimizer(
                    learning_rate=self.lr).minimize(tot_loss_q)

            self._make_param_ops(_vs)

    def fit(self, paths, batch_size=32, logger=None, lr=1e-3, **kwargs):
        if self.fusion is not None:
            old_paths = self.fusion.sample_paths(n=len(paths))
            self.fusion.add_paths(paths)
            paths = paths + old_paths
        obs, obs_next, acts = \
            self.extract_paths(paths,
                               keys=('observations', 'observations_next', 'actions'))
        '''expert_obs, expert_obs_next, expert_acts = \
            self.extract_paths(self.expert_trajs,
                               keys=('observations', 'observations_next', 'actions'))'''

        # Train discriminator
        for it in TrainingIterator(self.max_itrs, heartbeat=5):
            nobs_batch, obs_batch, act_batch = \
                self.sample_batch(obs_next, obs, acts, batch_size=batch_size)

            feed_dict = {
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.nobs_t: nobs_batch,
                self.lr: lr
            }

            loss_q, _ = tf.get_default_session().run([self.loss_q, self.step_q],
                                                     feed_dict=feed_dict)
            it.record('loss_q', loss_q)
            if it.heartbeat:
                mean_loss_q = it.pop_mean('loss_q')
                print('\tLoss_q:%f' % mean_loss_q)

        return mean_loss_q

    def dist_info_sym(self, q_input, state_info_vars):
        return self.qvar.dist_info_sym(q_input, None)

    # def set_params(self, params):
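

# Usage sketch (assumption, hypothetical helper): training the inverse model q(a|s, s').
# `qvar_policy` is assumed to be a Gaussian policy-like object whose dist_info_sym accepts
# the concatenated (s, s') input, which is how Qvar builds its regression loss above.
def _example_qvar_update(env, paths, qvar_policy):
    qvar_model = Qvar(env, qvar=qvar_policy, max_itrs=100)
    return qvar_model.fit(paths, batch_size=32, lr=1e-3)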
class AIRL_Nstep(SingleTimestepIRL): """ Args: fusion (bool): Use trajectories from old iterations to train. state_only (bool): Fix the learned reward to only depend on state. score_discrim (bool): Use log D - log 1-D as reward (if true you should not need to use an entropy bonus) max_itrs (int): Number of training iterations to run per fit step. """ def __init__(self, env, expert_trajs=None, reward_arch=relu_net, reward_arch_args=None, value_fn_arch=relu_net, score_discrim=False, discount=1.0, number_obs = 3, state_only=False, max_itrs=100, fusion=False, name='airl'): super(AIRL_Nstep, self).__init__() env_spec = env.spec if reward_arch_args is None: reward_arch_args = {} if fusion: self.fusion = RamFusionDistr(100, subsample_ratio=0.5) else: self.fusion = None self.dO = env_spec.observation_space.flat_dim self.dU = env_spec.action_space.flat_dim assert isinstance(env.action_space, Box) self.score_discrim = score_discrim self.gamma = discount assert value_fn_arch is not None self.set_demos(expert_trajs) self.state_only=state_only self.max_itrs=max_itrs self.number_obs = number_obs # build energy model with tf.variable_scope(name) as _vs: # Should be batch_size x T x dO/dU self.obs_t = tf.placeholder(tf.float32, [None, self.number_obs, self.dO], name='obs') self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs') self.act_t = tf.placeholder(tf.float32, [None, self.number_obs,self.dU], name='act') self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact') self.labels = tf.placeholder(tf.float32, [None, 1], name='labels') self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs') self.lr = tf.placeholder(tf.float32, (), name='lr') with tf.variable_scope('discrim') as dvs: rew_input = self.obs_t if not self.state_only: rew_input = tf.concat([self.obs_t, self.act_t], axis=2) with tf.variable_scope('reward'): self.reward = reward_arch(tf.reshape(rew_input, [-1, rew_input.shape[2] ]), dout=1, **reward_arch_args) self.reward = tf.reshape(self.reward, [-1, self.number_obs, 1]) #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) # value function shaping with tf.variable_scope('vfn'): fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1) with tf.variable_scope('vfn', reuse=True): self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t[:,0,:], dout=1) # Define log p_tau(a|s) = r + gamma * V(s') - V(s) gamma_coefs = np.ones([self.number_obs], dtype = np.float32) gamma_coefs[1:] *= self.gamma gamma_coefs = np.cumprod(gamma_coefs) gamma_coefs = np.expand_dims(gamma_coefs, axis=1) self.qfn = tf.reduce_sum(self.reward*gamma_coefs, axis=1) + (self.gamma**self.number_obs)*fitted_value_fn_n log_p_tau = self.qfn - fitted_value_fn log_q_tau = self.lprobs log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0) self.discrim_output = tf.exp(log_p_tau-log_pq) cent_loss = -tf.reduce_mean(self.labels*(log_p_tau-log_pq) + (1-self.labels)*(log_q_tau-log_pq)) self.loss = cent_loss tot_loss = self.loss self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(tot_loss) self._make_param_ops(_vs) def _reorganize_states(self, paths, pad_val=0.0): for path in paths: if 'observations_next' in path: continue nobs = path['observations'][self.number_obs:] nact = path['actions'][self.number_obs:] nobs = np.r_[nobs, pad_val*np.ones(shape=[self.number_obs, self.dO ], dtype=np.float32 )] nact = np.r_[nact, pad_val*np.ones(shape=[self.number_obs, self.dU ], dtype=np.float32 )] path['observations_next'] = nobs path['actions_next'] = nact multiple_obs 
= np.ones( (path['observations'].shape[0], self.number_obs, self.dO), dtype=np.float32 ) multiple_act = np.ones( (path['actions'].shape[0], self.number_obs, self.dU), dtype=np.float32 ) for idx in range(path['observations'].shape[0]): if idx+self.number_obs < path['observations'].shape[0]: final_idx = idx+self.number_obs obs = path['observations'][idx:final_idx] act = path['actions'][idx:final_idx] else: final_idx = path['observations'].shape[0] delta_idx = self.number_obs - (final_idx - idx) obs = np.r_[ path['observations'][idx:final_idx] , np.ones(shape=[delta_idx, self.dO ], dtype=np.float32) ] act = np.r_[ path['actions'][idx:final_idx] , np.ones(shape=[delta_idx, self.dU ], dtype=np.float32) ] multiple_obs[idx,:,:] = obs multiple_act[idx,:,:] = act path['multi_observations'] = multiple_obs path['multi_actions'] = multiple_act return paths def fit(self, paths, policy=None, batch_size=32, logger=None, lr=1e-3,**kwargs): if self.fusion is not None: old_paths = self.fusion.sample_paths(n=len(paths)) self.fusion.add_paths(paths) paths = paths+old_paths # eval samples under current policy self._compute_path_probs(paths, insert=True) # eval expert log probs under current policy self.eval_expert_probs(self.expert_trajs, policy, insert=True) self._reorganize_states(paths) self._reorganize_states(self.expert_trajs) obs, obs_next, acts, acts_next, path_probs = \ self.extract_paths(paths, keys=('multi_observations', 'observations_next', 'multi_actions', 'actions_next', 'a_logprobs')) expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = \ self.extract_paths(self.expert_trajs, keys=('multi_observations', 'observations_next', 'multi_actions', 'actions_next', 'a_logprobs')) # Train discriminator for it in TrainingIterator(self.max_itrs, heartbeat=5): nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \ self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size) nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \ self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs, batch_size=batch_size) # Build feed dict labels = np.zeros((batch_size*2, 1)) labels[batch_size:] = 1.0 obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0) nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0) act_batch = np.concatenate([act_batch, expert_act_batch], axis=0) nact_batch = np.concatenate([nact_batch, nexpert_act_batch], axis=0) lprobs_batch = np.expand_dims(np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0), axis=1).astype(np.float32) feed_dict = { self.act_t: act_batch, self.obs_t: obs_batch, self.nobs_t: nobs_batch, self.nact_t: nact_batch, self.labels: labels, self.lprobs: lprobs_batch, self.lr: lr } loss, _ = tf.get_default_session().run([self.loss, self.step], feed_dict=feed_dict) it.record('loss', loss) if it.heartbeat: print(it.itr_message()) mean_loss = it.pop_mean('loss') print('\tLoss:%f' % mean_loss) if logger: logger.record_tabular('GCLDiscrimLoss', mean_loss) #obs_next = np.r_[obs_next, np.expand_dims(obs_next[-1], axis=0)] energy, logZ, dtau = tf.get_default_session().run([self.reward, self.value_fn, self.discrim_output], feed_dict={self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.nact_t: acts_next, self.lprobs: np.expand_dims(path_probs, axis=1)}) energy = -energy logger.record_tabular('GCLLogZ', np.mean(logZ)) logger.record_tabular('GCLAverageEnergy', np.mean(energy)) logger.record_tabular('GCLAverageExpertLogPtau', 
np.mean(-np.mean(energy, axis=1) - logZ)) logger.record_tabular('GCLAverageLogQtau', np.mean(path_probs)) logger.record_tabular('GCLMedianLogQtau', np.median(path_probs)) logger.record_tabular('GCLAverageDtau', np.mean(dtau)) #expert_obs_next = np.r_[expert_obs_next, np.expand_dims(expert_obs_next[-1], axis=0)] energy, logZ, dtau = tf.get_default_session().run([self.reward, self.value_fn, self.discrim_output], feed_dict={self.act_t: expert_acts, self.obs_t: expert_obs, self.nobs_t: expert_obs_next, self.nact_t: expert_acts_next, self.lprobs: np.expand_dims(expert_probs, axis=1)}) energy = -energy logger.record_tabular('GCLAverageExpertEnergy', np.mean(energy)) logger.record_tabular('GCLAverageExpertLogPtau', np.mean(-np.mean(energy, axis=1) - logZ)) logger.record_tabular('GCLAverageExpertLogQtau', np.mean(expert_probs)) logger.record_tabular('GCLMedianExpertLogQtau', np.median(expert_probs)) logger.record_tabular('GCLAverageExpertDtau', np.mean(dtau)) return mean_loss def _compute_path_probs_modified(self, paths, pol_dist_type=None, insert=True, insert_key='a_logprobs'): """ Returns a N x T matrix of action probabilities """ if insert_key in paths[0]: return np.array([path[insert_key] for path in paths]) if pol_dist_type is None: # try to infer distribution type path0 = paths[0] if 'log_std' in path0['agent_infos']: pol_dist_type = DIST_GAUSSIAN elif 'prob' in path0['agent_infos']: pol_dist_type = DIST_CATEGORICAL else: raise NotImplementedError() # compute path probs Npath = len(paths) actions = [path['actions'][0] for path in paths] if pol_dist_type == DIST_GAUSSIAN: params = [(path['agent_infos']['mean'], path['agent_infos']['log_std']) for path in paths] path_probs = [gauss_log_pdf(params[i], actions[i]) for i in range(Npath)] elif pol_dist_type == DIST_CATEGORICAL: params = [(path['agent_infos']['prob'],) for path in paths] path_probs = [categorical_log_pdf(params[i], actions[i]) for i in range(Npath)] else: raise NotImplementedError("Unknown distribution type") if insert: for i, path in enumerate(paths): path[insert_key] = path_probs[i] return np.array(path_probs) def eval(self, paths, **kwargs): """ Return bonus """ if self.score_discrim: self._compute_path_probs_modified(paths, insert=True) obs, obs_next, acts, path_probs = self.extract_paths(paths, keys=('multi_observations', 'observations_next', 'multi_actions', 'a_logprobs')) path_probs = np.expand_dims(path_probs, axis=1) scores = tf.get_default_session().run(self.discrim_output, feed_dict={self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.lprobs: path_probs}) score = np.log(scores) - np.log(1-scores) score = score[:,0] else: obs, acts = self.extract_paths(paths, keys=('multi_observations', 'multi_actions')) reward = tf.get_default_session().run(self.reward, feed_dict={self.act_t: acts, self.obs_t: obs}) score = reward[:,0,0] return self.unpack(score, paths) def eval_single(self, obs): reward = tf.get_default_session().run(self.reward, feed_dict={self.obs_t: obs}) ## DOUBLE CHECK score = reward[:, 0, 0] return score def debug_eval(self, paths, **kwargs): obs, acts = self.extract_paths(paths, keys=('multi_observations', 'multi_actions')) reward, v, qfn = tf.get_default_session().run([self.reward, self.value_fn, self.qfn], feed_dict={self.act_t: acts, self.obs_t: obs}) return { 'reward': reward, 'value': v, 'qfn': qfn, }
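

# Illustration (assumption, hypothetical helper; not used by the classes above): the
# discounted n-step return that AIRL_Nstep.qfn computes with gamma_coefs, written in
# plain NumPy for a single window of rewards r_0..r_{N-1} and a bootstrap value V(s_N):
#   qfn = sum_t gamma^t * r_t + gamma^N * V(s_N)
def _example_nstep_qfn(rewards, v_next, gamma):
    rewards = np.asarray(rewards, dtype=np.float32)
    gamma_coefs = np.cumprod(np.r_[1.0, gamma * np.ones(len(rewards) - 1)])
    return np.sum(gamma_coefs * rewards) + (gamma ** len(rewards)) * v_next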
class EAIRL(SingleTimestepIRL): """ Args: fusion (bool): Use trajectories from old iterations to train. state_only (bool): Fix the learned reward to only depend on state. score_discrim (bool): Use log D - log 1-D as reward (if true you should not need to use an entropy bonus) max_itrs (int): Number of training iterations to run per fit step. """ def __init__(self, env, expert_trajs=None, reward_arch=relu_net, reward_arch_args=None, score_discrim=False, discount=1.0, state_only=False, max_itrs=100, fusion=False, name='eairl'): super(EAIRL, self).__init__() env_spec = env.spec if reward_arch_args is None: reward_arch_args = {} if fusion: self.fusion = RamFusionDistr(100, subsample_ratio=0.5) else: self.fusion = None self.dO = env_spec.observation_space.flat_dim self.dU = env_spec.action_space.flat_dim assert isinstance(env.action_space, Box) self.score_discrim = score_discrim self.gamma = discount self.set_demos(expert_trajs) self.state_only=state_only self.max_itrs=max_itrs # build energy model with tf.variable_scope(name) as _vs: # Should be batch_size x T x dO/dU self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs') self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs') self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act') self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact') self.labels = tf.placeholder(tf.float32, [None, 1], name='labels') self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs') self.lr = tf.placeholder(tf.float32, (), name='lr') self.vs = tf.placeholder(tf.float32, [None, 1], name='vs') self.vsp = tf.placeholder(tf.float32, [None, 1], name='vsp') with tf.variable_scope('discrim') as dvs: rew_input = self.obs_t if not self.state_only: rew_input = tf.concat([self.obs_t, self.act_t], axis=1) with tf.variable_scope('reward'): self.reward = reward_arch(rew_input, dout=1, **reward_arch_args) # Define log p_tau(a|s) = r + gamma * V(s') - V(s) self.qfn = self.reward + self.gamma*self.vsp log_p_tau = self.reward + self.gamma*self.vsp-self.vs log_q_tau = self.lprobs log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0) self.discrim_output = tf.exp(log_p_tau-log_pq) cent_loss = -tf.reduce_mean(self.labels*(log_p_tau-log_pq) + (1-self.labels)*(log_q_tau-log_pq)) self.loss_irl = cent_loss tot_loss_irl = self.loss_irl self.step_irl = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(tot_loss_irl) self._make_param_ops(_vs) def fit(self, paths, policy=None,empw_model=None,t_empw_model=None, batch_size=32, logger=None, lr=1e-3,**kwargs): if self.fusion is not None: old_paths = self.fusion.sample_paths(n=len(paths)) self.fusion.add_paths(paths) paths = paths+old_paths # eval samples under current policy self._compute_path_probs(paths, insert=True) # eval expert log probs under current policy self.eval_expert_probs(self.expert_trajs, policy, insert=True) self._insert_next_state(paths) self._insert_next_state(self.expert_trajs) obs, obs_next, acts, acts_next, path_probs = \ self.extract_paths(paths, keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs')) expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = \ self.extract_paths(self.expert_trajs, keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs')) # Train discriminator for it in TrainingIterator(self.max_itrs, heartbeat=5): nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \ self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size) 
            nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \
                self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs,
                                  batch_size=batch_size)

            # Build feed dict
            labels = np.zeros((batch_size * 2, 1))
            labels[batch_size:] = 1.0
            obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
            nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0)
            act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
            nact_batch = np.concatenate([nact_batch, nexpert_act_batch], axis=0)
            lprobs_batch = np.expand_dims(np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0),
                                          axis=1).astype(np.float32)
            # The empowerment estimates play the role of the shaping value function:
            # V(s) from the current empowerment model, V(s') from the target model.
            vs = empw_model.eval(obs_batch)
            vsp = t_empw_model.eval(nobs_batch)
            feed_dict = {
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.nobs_t: nobs_batch,
                self.nact_t: nact_batch,
                self.labels: labels,
                self.lprobs: lprobs_batch,
                self.lr: lr,
                self.vs: vs,
                self.vsp: vsp
            }

            loss_irl, _ = tf.get_default_session().run([self.loss_irl, self.step_irl],
                                                       feed_dict=feed_dict)
            it.record('loss_irl', loss_irl)
            if it.heartbeat:
                print(it.itr_message())
                mean_loss_irl = it.pop_mean('loss_irl')
                print('\tLoss_irl:%f' % mean_loss_irl)

        return mean_loss_irl

    def next_state(self, paths, **kwargs):
        self._insert_next_state(paths)

    def eval(self, paths, empw_model=None, t_empw_model=None, **kwargs):
        """
        Return bonus
        """
        if self.score_discrim:
            self._compute_path_probs(paths, insert=True)
            obs, obs_next, acts, path_probs = self.extract_paths(
                paths,
                keys=('observations', 'observations_next', 'actions', 'a_logprobs'))
            path_probs = np.expand_dims(path_probs, axis=1)
            vs = empw_model.eval(obs).reshape(-1, 1)
            vsp = empw_model.eval(obs_next).reshape(-1, 1)
            scores = tf.get_default_session().run(self.discrim_output,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs,
                                                      self.nobs_t: obs_next,
                                                      self.lprobs: path_probs,
                                                      self.vs: vs,
                                                      self.vsp: vsp
                                                  })
            score = np.log(scores) - np.log(1 - scores)
            score = score[:, 0]
        else:
            obs, acts = self.extract_paths(paths)
            reward = tf.get_default_session().run(self.reward,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs
                                                  })
            score = reward[:, 0]
        return self.unpack(score, paths)

    def eval_single(self, obs, acts):
        reward = tf.get_default_session().run(self.reward,
                                              feed_dict={
                                                  self.act_t: acts,
                                                  self.obs_t: obs
                                              })
        score = reward[:, 0]
        return score

    def debug_eval(self, paths, **kwargs):
        # EAIRL has no separate value-function network (the empowerment models supply the
        # shaping term through the vs/vsp placeholders), so only the reward is fetched here.
        obs, acts = self.extract_paths(paths)
        reward = tf.get_default_session().run(self.reward,
                                              feed_dict={
                                                  self.act_t: acts,
                                                  self.obs_t: obs
                                              })
        return {
            'reward': reward,
        }
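

# Usage sketch (assumption, hypothetical helper): one possible ordering of the EAIRL
# component updates within a single outer iteration, inferred from the fit() signatures
# above; the original training schedule may differ. `target_empw_model` is assumed to be
# a separate, periodically synced Empowerment instance used for the vsp term (the sync
# mechanism is not shown here).
def _example_eairl_update(paths, policy, irl_model, empw_model, target_empw_model, qvar_model):
    qvar_model.fit(paths, lr=1e-3)                                         # inverse model q(a|s, s')
    empw_model.fit(paths, policy=policy, qvar_model=qvar_model, lr=1e-3)   # empowerment estimate
    return irl_model.fit(paths, policy=policy, empw_model=empw_model,
                         t_empw_model=target_empw_model, lr=1e-3)          # EAIRL reward/discriminator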
class AIRL_Bootstrap(SingleTimestepIRL): """ Args: fusion (bool): Use trajectories from old iterations to train. state_only (bool): Fix the learned reward to only depend on state. score_discrim (bool): Use log D - log 1-D as reward (if true you should not need to use an entropy bonus) max_itrs (int): Number of training iterations to run per fit step. """ def __init__(self, env, expert_trajs=None, reward_arch=relu_net, reward_arch_args=None, value_fn_arch=relu_net, score_discrim=False, sess=None, discount=1.0, max_nstep=10, n_value_funct=1, n_rew_funct=1, state_only=False, max_itrs=100, fusion=False, debug=False, score_method=None, name='airl'): super(AIRL_Bootstrap, self).__init__() env_spec = env.spec if reward_arch_args is None: reward_arch_args = {} if fusion: self.fusion = RamFusionDistr(100, subsample_ratio=0.5) else: self.fusion = None self.dO = env_spec.observation_space.flat_dim self.dU = env_spec.action_space.flat_dim assert isinstance(env.action_space, Box) self.score_discrim = score_discrim self.gamma = discount assert value_fn_arch is not None self.set_demos(expert_trajs) self.state_only = state_only self.max_itrs = max_itrs self.max_nstep = max_nstep self.n_value_funct = n_value_funct self.n_rew_funct = n_rew_funct self.reward_arch = reward_arch self.reward_arch_args = reward_arch_args self.value_fn_arch = value_fn_arch self.score_method = score_method self.debug = debug # build energy model with tf.variable_scope(name) as _vs: # Should be batch_size x T x dO/dU self.obs_t = tf.placeholder(tf.float32, [None, None, self.dO], name='obs') self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs') self.act_t = tf.placeholder(tf.float32, [None, None, self.dU], name='act') self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact') self.labels = tf.placeholder(tf.float32, [None, 1], name='labels') self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs') self.lr = tf.placeholder(tf.float32, (), name='lr') number_obs = tf.shape(self.obs_t)[1] with tf.variable_scope('discrim') as dvs: rew_input = self.obs_t if not self.state_only: rew_input = tf.concat([self.obs_t, self.act_t], axis=2) self.reward = [None for i in range(self.n_rew_funct)] self.value_fn = [None for i in range(self.n_value_funct)] fitted_value_fn_n = [None for i in range(self.n_value_funct)] self.qfn = [[None for i in range(self.n_value_funct)] for j in range(self.n_rew_funct)] self.discrim_output = [[ None for i in range(self.n_value_funct) ] for j in range(self.n_rew_funct)] self.loss = [[None for i in range(self.n_value_funct)] for j in range(self.n_rew_funct)] self.step = [[None for i in range(self.n_value_funct)] for j in range(self.n_rew_funct)] log_q_tau = self.lprobs for i in range(self.n_rew_funct): with tf.variable_scope('reward_%d' % (i), reuse=tf.AUTO_REUSE): self.reward[i] = self.reward_arch( tf.reshape(rew_input, [-1, rew_input.shape[2]]), dout=1, **self.reward_arch_args) self.reward[i] = tf.reshape(self.reward[i], [-1, number_obs, 1]) #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) for j in range(self.n_value_funct): # value function shaping with tf.variable_scope('vfn_%d' % (j), reuse=tf.AUTO_REUSE): fitted_value_fn_n[j] = self.value_fn_arch(self.nobs_t, dout=1) with tf.variable_scope('vfn_%d' % (j), reuse=tf.AUTO_REUSE): self.value_fn[j] = self.value_fn_arch(self.obs_t[:, 0, :], dout=1) self.avg_reward = tf.reduce_mean(tf.stack(self.reward), axis=0) gamma_coefs = tf.concat([ tf.ones([1], dtype=tf.float32), self.gamma * 
tf.ones([number_obs - 1], dtype=tf.float32) ], axis=0) gamma_coefs = tf.cumprod(gamma_coefs) gamma_coefs = tf.expand_dims(gamma_coefs, axis=1) for i in range(self.n_rew_funct): for j in range(self.n_value_funct): # Define log p_tau(a|s) = r + gamma * V(s') - V(s) self.qfn[i][j] = tf.reduce_sum( self.reward[i] * gamma_coefs, axis=1) + tf.math.pow( tf.constant(self.gamma), tf.to_float(number_obs)) * fitted_value_fn_n[j] log_p_tau = self.qfn[i][j] - self.value_fn[j] log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0) self.discrim_output[i][j] = tf.exp(log_p_tau - log_pq) cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) + (1 - self.labels) * (log_q_tau - log_pq)) self.loss[i][j] = cent_loss self.step[i][j] = tf.train.AdamOptimizer( learning_rate=self.lr).minimize(self.loss[i][j]) self._combine_predictions() self._make_param_ops(_vs) def _combine_predictions(self): t0_reward = [None for i in range(self.n_rew_funct)] t0_value = [None for i in range(self.n_value_funct)] reward = [None for i in range(self.n_rew_funct)] fitted_value_fn_n = [None for i in range(self.n_value_funct)] log_p_tau = [[ None for i in range(self.n_rew_funct * self.n_value_funct) ] for j in range(self.max_nstep)] rew_input = self.obs_t if not self.state_only: rew_input = tf.concat([self.obs_t, self.act_t], axis=2) for i in range(self.n_rew_funct): with tf.variable_scope('reward_%d' % (i), reuse=tf.AUTO_REUSE): reward[i] = self.reward_arch(tf.reshape( rew_input, [-1, rew_input.shape[2]]), dout=1, **self.reward_arch_args) reward[i] = tf.reshape(reward[i], [-1, self.max_nstep, 1]) t0_reward[i] = reward[i][:, 0] #Master-student score method with tf.variable_scope('student_reward'): self.student_reward = self.reward_arch(rew_input[:, 0], dout=1, **self.reward_arch_args) v_input = tf.concat( [self.obs_t[:, 1:, :], tf.expand_dims(self.nobs_t, axis=1)], axis=1) for j in range(self.n_value_funct): # value function shaping with tf.variable_scope('vfn_%d' % (j), reuse=tf.AUTO_REUSE): fitted_value_fn_n[j] = self.value_fn_arch(tf.reshape( v_input, [-1, v_input.shape[2]]), dout=1) fitted_value_fn_n[j] = tf.reshape(fitted_value_fn_n[j], [-1, self.max_nstep, 1]) with tf.variable_scope('vfn_%d' % (j), reuse=tf.AUTO_REUSE): t0_value[j] = self.value_fn_arch(self.obs_t[:, 0, :], dout=1) #Master-student score method with tf.variable_scope('student_value', reuse=tf.AUTO_REUSE): self.student_value_n = self.value_fn_arch(v_input[:, 0], dout=1) with tf.variable_scope('student_value', reuse=tf.AUTO_REUSE): self.student_value = self.value_fn_arch(self.obs_t[:, 0, :], dout=1) gamma_coefs = np.ones([self.max_nstep], dtype=np.float32) gamma_coefs[1:] *= self.gamma gamma_coefs = np.cumprod(gamma_coefs) gamma_coefs = np.expand_dims(gamma_coefs, axis=1) log_q_tau = self.lprobs for i in range(self.n_rew_funct): for j in range(self.n_value_funct): for single_nsteps in range(1, self.max_nstep + 1): # Define log p_tau(a|s) = r + gamma * V(s') - V(s) qfn = tf.reduce_sum( reward[i][:, :single_nsteps] * gamma_coefs[:single_nsteps], axis=1) + (self.gamma**single_nsteps) * ( fitted_value_fn_n[j][:, (single_nsteps - 1)]) log_p_tau[single_nsteps - 1][i * self.n_value_funct + j] = qfn - t0_value[j] #Master-student score method student_qfn = self.student_reward + self.gamma * self.student_value_n student_log_p_tau = student_qfn - self.student_value mean_list = [None for i in range(self.max_nstep)] variance_list = [None for i in range(self.max_nstep)] for i in range(self.max_nstep): mean_list[i] = tf.reduce_mean(tf.stack(log_p_tau[i]), axis=0) 
variance_list[i] = tf.math.reduce_variance(tf.stack(log_p_tau[i]), axis=0) self.weights = tf.concat(variance_list, axis=1) self.weights = tf.nn.softmax(1. / (self.weights + 1e-8), axis=1) # self.weights = tf.constant( ([0.0]*(self.max_nstep-1)) + [1.0], dtype=tf.float32 ) log_p_tau = tf.reduce_sum(self.weights * tf.concat(mean_list, axis=1), axis=1, keepdims=True) log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0) self.ensemble_discrim_output = tf.exp(log_p_tau - log_pq) #Master-student score method student_log_pq = tf.reduce_logsumexp([student_log_p_tau, log_q_tau], axis=0) self.student_discrim_output = tf.exp(student_log_p_tau - student_log_pq) self.student_loss = -tf.reduce_mean(tf.stop_gradient(self.ensemble_discrim_output)*(student_log_p_tau-student_log_pq) + \ (1-tf.stop_gradient(self.ensemble_discrim_output))*(log_q_tau-student_log_pq)) self.student_step = tf.train.AdamOptimizer( learning_rate=self.lr).minimize(self.student_loss) self.student_absolute_loss = -tf.reduce_mean( self.labels * (student_log_p_tau - student_log_pq) + (1 - self.labels) * (log_q_tau - student_log_pq)) self.ensemble_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) + (1 - self.labels) * (log_q_tau - log_pq)) self.t0_reward = tf.reduce_mean(tf.concat(t0_reward, axis=1), axis=1, keepdims=True) self.t0_value = tf.reduce_mean(tf.concat(t0_value, axis=1), axis=1, keepdims=True) def _reorganize_states(self, paths, number_obs=1, pad_val=0.0): for path in paths: if 'observations_next' in path: continue nobs = path['observations'][number_obs:] nact = path['actions'][number_obs:] nobs = np.r_[ nobs, pad_val * np.ones(shape=[number_obs, self.dO], dtype=np.float32)] nact = np.r_[ nact, pad_val * np.ones(shape=[number_obs, self.dU], dtype=np.float32)] path['observations_next'] = nobs path['actions_next'] = nact multiple_obs = np.ones( (path['observations'].shape[0], number_obs, self.dO), dtype=np.float32) multiple_act = np.ones( (path['actions'].shape[0], number_obs, self.dU), dtype=np.float32) for idx in range(path['observations'].shape[0]): if idx + number_obs < path['observations'].shape[0]: final_idx = idx + number_obs obs = path['observations'][idx:final_idx] act = path['actions'][idx:final_idx] else: final_idx = path['observations'].shape[0] delta_idx = number_obs - (final_idx - idx) obs = np.r_[ path['observations'][idx:final_idx], np.ones(shape=[delta_idx, self.dO], dtype=np.float32)] act = np.r_[ path['actions'][idx:final_idx], np.ones(shape=[delta_idx, self.dU], dtype=np.float32)] multiple_obs[idx, :, :] = obs multiple_act[idx, :, :] = act path['multi_observations'] = multiple_obs path['multi_actions'] = multiple_act return paths def fit(self, paths, policy=None, batch_size=32, logger=None, lr=1e-3, **kwargs): if self.fusion is not None: old_paths = self.fusion.sample_paths(n=len(paths)) self.fusion.add_paths(paths) paths = paths + old_paths # eval samples under current policy self._compute_path_probs(paths, insert=True) # eval expert log probs under current policy self.eval_expert_probs(self.expert_trajs, policy, insert=True) self._reorganize_states(paths, number_obs=self.max_nstep) self._reorganize_states(self.expert_trajs, number_obs=self.max_nstep) obs, obs_next, acts, acts_next, path_probs = \ self.extract_paths(paths, keys=('multi_observations', 'observations_next', 'multi_actions', 'actions_next', 'a_logprobs')) expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = \ self.extract_paths(self.expert_trajs, keys=('multi_observations', 'observations_next', 
'multi_actions', 'actions_next', 'a_logprobs')) all_obs = np.concatenate([obs, expert_obs], axis=0) all_nobs = np.concatenate([obs_next, expert_obs_next], axis=0) all_acts = np.concatenate([acts, expert_acts], axis=0) all_nacts = np.concatenate([acts_next, expert_acts_next], axis=0) all_probs = np.concatenate([path_probs, expert_probs], axis=0) all_labels = np.zeros((all_obs.shape[0], 1)) all_labels[obs.shape[0]:] = 1.0 # Train discriminator for it in TrainingIterator(self.max_itrs, heartbeat=5): if self.n_rew_funct < self.n_value_funct: delta = self.n_value_funct - self.n_rew_funct temp = np.arange(self.n_rew_funct) np.random.shuffle(temp) rew_idxs = np.r_[ temp, np.random.randint(self.n_rew_funct, size=delta)] val_idxs = np.arange(self.n_value_funct) else: delta = self.n_rew_funct - self.n_value_funct temp = np.arange(self.n_value_funct) np.random.shuffle(temp) val_idxs = np.r_[ temp, np.random.randint(self.n_value_funct, size=delta)] rew_idxs = np.arange(self.n_rew_funct) for idx in range(val_idxs.shape[0]): i = rew_idxs[idx] j = val_idxs[idx] for single_nstep in range(1, self.max_nstep + 1): nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \ self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size) nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \ self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs, batch_size=batch_size) # Build feed dict labels = np.zeros((batch_size * 2, 1)) labels[batch_size:] = 1.0 obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0) nobs_batch = np.concatenate( [nobs_batch, nexpert_obs_batch], axis=0) act_batch = np.concatenate([act_batch, expert_act_batch], axis=0) nact_batch = np.concatenate( [nact_batch, nexpert_act_batch], axis=0) lprobs_batch = np.expand_dims(np.concatenate( [lprobs_batch, expert_lprobs_batch], axis=0), axis=1).astype(np.float32) feed_dict = { self.act_t: act_batch[:, :single_nstep], self.obs_t: obs_batch[:, :single_nstep], self.nobs_t: nobs_batch if self.max_nstep == single_nstep else obs_batch[:, single_nstep], self.nact_t: nact_batch if self.max_nstep == single_nstep else act_batch[:, single_nstep], self.labels: labels, self.lprobs: lprobs_batch, self.lr: lr } loss, _ = tf.get_default_session().run( [self.loss[i][j], self.step[i][j]], feed_dict=feed_dict) it.record('loss', loss) if self.score_discrim is False and self.score_method == 'teacher_student': nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \ self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size) nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \ self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs, batch_size=batch_size) # Build feed dict labels = np.zeros((batch_size * 2, 1)) labels[batch_size:] = 1.0 obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0) nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0) act_batch = np.concatenate([act_batch, expert_act_batch], axis=0) nact_batch = np.concatenate([nact_batch, nexpert_act_batch], axis=0) lprobs_batch = np.expand_dims(np.concatenate( [lprobs_batch, expert_lprobs_batch], axis=0), axis=1).astype(np.float32) feed_dict = { self.act_t: act_batch, self.obs_t: obs_batch, self.nobs_t: nobs_batch, self.nact_t: nact_batch, self.labels: labels, self.lprobs: lprobs_batch, self.lr: lr } rel_loss, abs_loss, ens_loss, _ = tf.get_default_session().run( [ 
self.student_loss, self.student_absolute_loss, self.ensemble_loss, self.student_step ], feed_dict=feed_dict) if it.heartbeat: print(it.itr_message()) mean_loss = it.pop_mean('loss') print('\tLoss:%f' % mean_loss) if logger: logger.record_tabular('GCLMeanDiscrimLoss', mean_loss) sess = tf.get_default_session() if self.score_discrim is False and self.score_method == 'teacher_student': logger.record_tabular('GCLStudentRelativeLoss', rel_loss) logger.record_tabular('GCLStudentAbsoluteLoss', abs_loss) logger.record_tabular('GCLEnsembleLoss', ens_loss) else: e_loss, weights = sess.run( [self.ensemble_loss, self.weights], feed_dict={ self.act_t: all_acts, self.obs_t: all_obs, self.nobs_t: all_nobs, self.nact_t: all_nacts, self.labels: all_labels, self.lprobs: np.expand_dims(all_probs, axis=1) }) logger.record_tabular('GCLEnsembleDiscrimLoss', e_loss) # logger.record_tabular('TimeWeights', weights) if self.score_discrim is False and self.score_method == 'teacher_student': energy, logZ, dtau, s_rew, s_val, s_dtau = sess.run([self.t0_reward, self.t0_value, self.ensemble_discrim_output, \ self.student_reward, self.student_value, self.student_discrim_output], feed_dict={self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.nact_t: acts_next, self.lprobs: np.expand_dims(path_probs, axis=1)}) else: energy, logZ, dtau = sess.run( [ self.t0_reward, self.t0_value, self.ensemble_discrim_output ], feed_dict={ self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.nact_t: acts_next, self.lprobs: np.expand_dims(path_probs, axis=1) }) energy = -energy logger.record_tabular('GCLLogZ', np.mean(logZ)) logger.record_tabular('GCLAverageEnergy', np.mean(energy)) logger.record_tabular('GCLAverageLogPtau', np.mean(-energy - logZ)) logger.record_tabular('GCLAverageLogQtau', np.mean(path_probs)) logger.record_tabular('GCLMedianLogQtau', np.median(path_probs)) logger.record_tabular('GCLAverageDtau', np.mean(dtau)) if self.score_discrim is False and self.score_method == 'teacher_student': logger.record_tabular('GCLAverageStudentEnergy', np.mean(-s_rew)) logger.record_tabular('GCLAverageStudentLogPtau', np.mean(s_rew - s_val)) logger.record_tabular('GCLAverageStudentDtau', np.mean(s_dtau)) if self.score_discrim is False and self.score_method == 'teacher_student': energy, logZ, dtau, s_rew, s_val, s_dtau = sess.run([self.t0_reward, self.t0_value, self.ensemble_discrim_output, \ self.student_reward, self.student_value, self.student_discrim_output], feed_dict={self.act_t: expert_acts, self.obs_t: expert_obs, self.nobs_t: expert_obs_next, self.nact_t: expert_acts_next, self.lprobs: np.expand_dims(expert_probs, axis=1)}) else: energy, logZ, dtau = sess.run( [ self.t0_reward, self.t0_value, self.ensemble_discrim_output ], feed_dict={ self.act_t: expert_acts, self.obs_t: expert_obs, self.nobs_t: expert_obs_next, self.nact_t: expert_acts_next, self.lprobs: np.expand_dims(expert_probs, axis=1) }) energy = -energy logger.record_tabular('GCLAverageExpertEnergy', np.mean(energy)) logger.record_tabular('GCLAverageExpertLogPtau', np.mean(-energy - logZ)) logger.record_tabular('GCLAverageExpertLogQtau', np.mean(expert_probs)) logger.record_tabular('GCLMedianExpertLogQtau', np.median(expert_probs)) logger.record_tabular('GCLAverageExpertDtau', np.mean(dtau)) if self.score_discrim is False and self.score_method == 'teacher_student': logger.record_tabular('GCLAverageStudentExpertEnergy', np.mean(-s_rew)) logger.record_tabular('GCLAverageStudentExpertLogPtau', np.mean(s_rew - s_val)) 
logger.record_tabular('GCLAverageStudentExpertDtau', np.mean(s_dtau)) return mean_loss def _compute_path_probs_modified(self, paths, pol_dist_type=None, insert=True, insert_key='a_logprobs'): """ Returns a N x T matrix of action probabilities """ if insert_key in paths[0]: return np.array([path[insert_key] for path in paths]) if pol_dist_type is None: # try to infer distribution type path0 = paths[0] if 'log_std' in path0['agent_infos']: pol_dist_type = DIST_GAUSSIAN elif 'prob' in path0['agent_infos']: pol_dist_type = DIST_CATEGORICAL else: raise NotImplementedError() # compute path probs Npath = len(paths) actions = [path['actions'][0] for path in paths] if pol_dist_type == DIST_GAUSSIAN: params = [(path['agent_infos']['mean'], path['agent_infos']['log_std']) for path in paths] path_probs = [ gauss_log_pdf(params[i], actions[i]) for i in range(Npath) ] elif pol_dist_type == DIST_CATEGORICAL: params = [(path['agent_infos']['prob'], ) for path in paths] path_probs = [ categorical_log_pdf(params[i], actions[i]) for i in range(Npath) ] else: raise NotImplementedError("Unknown distribution type") if insert: for i, path in enumerate(paths): path[insert_key] = path_probs[i] return np.array(path_probs) def eval(self, paths, **kwargs): """ Return bonus """ if self.score_discrim: self._compute_path_probs_modified(paths, insert=True) obs, obs_next, acts, path_probs = self.extract_paths( paths, keys=('multi_observations', 'observations_next', 'multi_actions', 'a_logprobs')) path_probs = np.expand_dims(path_probs, axis=1) scores = tf.get_default_session().run(self.ensemble_discrim_output, feed_dict={ self.act_t: acts, self.obs_t: obs, self.nobs_t: obs_next, self.lprobs: path_probs }) score = np.log(np.maximum(scores, 1e-8)) - np.log( np.maximum(1 - scores, 1e-8)) score = score[:, 0] else: if self.score_method == 'sample_rewards': obs, acts = self.extract_paths(paths, keys=('multi_observations', 'multi_actions')) sampled_idx = np.random.randint(self.n_rew_funct) reward = tf.get_default_session().run(self.reward[sampled_idx], feed_dict={ self.act_t: acts[:, :1], self.obs_t: obs[:, :1] }) score = reward[:, 0, 0] elif self.score_method == 'average_rewards': obs, acts = self.extract_paths(paths, keys=('multi_observations', 'multi_actions')) reward = tf.get_default_session().run(self.avg_reward, feed_dict={ self.act_t: acts[:, :1], self.obs_t: obs[:, :1] }) score = reward[:, 0, 0] elif self.score_method == 'teacher_student': obs, acts = self.extract_paths(paths, keys=('multi_observations', 'multi_actions')) reward = tf.get_default_session().run(self.student_reward, feed_dict={ self.act_t: acts[:, :1], self.obs_t: obs[:, :1] }) score = reward[:, 0] return self.unpack(score, paths)
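
# Note on AIRL_Bootstrap._combine_predictions (descriptive comment added for clarity):
# for each horizon n in 1..max_nstep it collects log p_tau estimates from every
# (reward, value) pair in the ensemble, then combines the per-horizon means with
#   weights = softmax(1 / (Var[log p_tau] + 1e-8))
# so horizons on which the ensemble members agree (low variance) dominate the combined
# log p_tau fed to the logistic discriminator, while the commented-out constant-weight
# line in _combine_predictions would instead use only the longest horizon.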