Example #1
    def __init__(self,
                 env,
                 expert_trajs=None,
                 reward_arch=relu_net,
                 reward_arch_args=None,
                 value_fn_arch=relu_net,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='airl'):
        super(AIRL, self).__init__()
        if reward_arch_args is None:
            reward_arch_args = {}

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        env_spec = env.spec
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env_spec.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO],
                                         name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                if not self.state_only:
                    rew_input = tf.concat([self.obs_t, self.act_t], axis=1)
                with tf.variable_scope('reward'):
                    self.reward = reward_arch(rew_input,
                                              dout=1,
                                              **reward_arch_args)
                    #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

                # value function shaping
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t,
                                                                    dout=1)

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            log_q_tau = self.lprobs

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)
            cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))

            self.loss = cent_loss
            tot_loss = self.loss
            self.step = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss)
            self._make_param_ops(_vs)
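
The last few lines above compute the AIRL discriminator D(s, a, s') = exp(f) / (exp(f) + pi(a|s)), where f = r(s, a) + gamma * V(s') - V(s) is `log_p_tau` and `lprobs` holds log pi(a|s); the `reduce_logsumexp` keeps the division numerically stable. A minimal NumPy sketch of the same computation, for illustration only (`discriminator_output` is not part of the example code):

import numpy as np

def discriminator_output(log_p_tau, log_q_tau):
    # log_p_tau = f(s, a, s'),  log_q_tau = log pi(a|s)
    # log(exp(log_p_tau) + exp(log_q_tau)), computed stably
    log_pq = np.logaddexp(log_p_tau, log_q_tau)
    # D = exp(f) / (exp(f) + pi(a|s))
    return np.exp(log_p_tau - log_pq)

# e.g. f = 0.3 and log pi(a|s) = -1.2 gives D ~= 0.82
print(discriminator_output(0.3, -1.2))
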
Example #2
File: irl.py  Project: pidchen/atari-irl
    def __init__(
            self,
            *,
            env_spec,  # No good default, but we do need to have it
            expert_trajs=None,
            reward_arch=cnn_net,
            reward_arch_args={},
            value_fn_arch=cnn_net,
            score_discrim=False,
            discount=1.0,
            state_only=False,
            max_itrs=100,
            fusion=False,
            name='airl',
            drop_framestack=False,
            only_show_scores=False,
            rescore_expert_trajs=True,
            encoder_loc=None):
        super(AIRL, self).__init__()

        # Write down everything that we're going to need in order to restore
        # this. All of these arguments are serializable, so it's pretty easy
        self.init_args = dict(model=AtariAIRL,
                              env_spec=env_spec,
                              expert_trajs=expert_trajs,
                              reward_arch=reward_arch,
                              reward_arch_args=reward_arch_args,
                              value_fn_arch=value_fn_arch,
                              score_discrim=score_discrim,
                              discount=discount,
                              state_only=state_only,
                              max_itrs=max_itrs,
                              fusion=fusion,
                              name=name,
                              rescore_expert_trajs=rescore_expert_trajs,
                              drop_framestack=drop_framestack,
                              only_show_scores=only_show_scores,
                              encoder_loc=encoder_loc)

        self.encoder = None if not encoder_loc else encoding.VariationalAutoEncoder.load(
            encoder_loc)
        self.encode_fn = None
        if self.encoder:
            if state_only:
                self.encode_fn = self.encoder.base_vector
            else:
                self.encode_fn = self.encoder.encode

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None

        if self.encoder:
            self.dO = self.encoder.encoding_shape
            self.dOshape = self.encoder.encoding_shape
        else:
            self.dO = env_spec.observation_space.flat_dim
            self.dOshape = env_spec.observation_space.shape

        if drop_framestack:
            assert len(self.dOshape) == 3
            self.dOshape = (*self.dOshape[:-1], 1)

        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env_spec.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        #self.set_demos(expert_trajs)
        self.expert_trajs = expert_trajs
        self.state_only = state_only
        self.max_itrs = max_itrs
        self.drop_framestack = drop_framestack
        self.only_show_scores = only_show_scores

        self.expert_cache = None
        self.rescore_expert_trajs = rescore_expert_trajs
        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            obs_dtype = tf.int8 if reward_arch == cnn_net else tf.float32
            self.obs_t = tf.placeholder(obs_dtype,
                                        list((None, ) + self.dOshape),
                                        name='obs')
            self.nobs_t = tf.placeholder(obs_dtype,
                                         list((None, ) + self.dOshape),
                                         name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                with tf.variable_scope('reward'):
                    if self.state_only:
                        self.reward = reward_arch(rew_input,
                                                  dout=1,
                                                  **reward_arch_args)
                    else:
                        print("Not state only", self.act_t)
                        self.reward = reward_arch(rew_input,
                                                  actions=self.act_t,
                                                  dout=1,
                                                  **reward_arch_args)
                # value function shaping
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t,
                                                                    dout=1)

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            log_q_tau = self.lprobs

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)
            self.accuracy, self.update_accuracy = tf.metrics.accuracy(
                labels=self.labels, predictions=self.discrim_output > 0.5)
            self.loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))
            self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
                self.loss)
            self._make_param_ops(_vs)

            self.grad_reward = tf.gradients(self.reward,
                                            [self.obs_t, self.act_t])

            self.modify_obs = self.get_ablation_modifiers()

            self.score_mean = 0
            self.score_std = 1
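
Unlike Example #1, which concatenates observations and actions before calling `reward_arch`, this Atari variant passes the actions separately via an `actions` keyword. A minimal TF1-style sketch of a reward architecture that is call-compatible with that usage; this is a hypothetical stand-in, and the real `cnn_net` in atari-irl may differ:

import tensorflow as tf

def simple_conv_reward(obs, actions=None, dout=1, hidden_units=64):
    # Cast integer frames to float for the conv stack.
    x = tf.cast(obs, tf.float32)
    x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, activation=tf.nn.relu)
    x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, activation=tf.nn.relu)
    x = tf.layers.flatten(x)
    if actions is not None:
        # State-action reward: append the flat action vector to the conv features.
        x = tf.concat([x, actions], axis=1)
    x = tf.layers.dense(x, hidden_units, activation=tf.nn.relu)
    # Shape [batch, dout]; dout=1 gives one scalar reward per transition.
    return tf.layers.dense(x, dout)
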
Example #3
class AIRL(SingleTimestepIRL):
    """ 


    Args:
        fusion (bool): Use trajectories from old iterations to train.
        state_only (bool): Fix the learned reward to only depend on state.
        score_discrim (bool): Use log D - log 1-D as reward (if true you should not need to use an entropy bonus)
        max_itrs (int): Number of training iterations to run per fit step.
    """
    def __init__(self,
                 env,
                 expert_trajs=None,
                 reward_arch=relu_net,
                 reward_arch_args=None,
                 value_fn_arch=relu_net,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='airl'):
        super(AIRL, self).__init__()
        if reward_arch_args is None:
            reward_arch_args = {}

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        env_spec = env.spec
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env_spec.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO],
                                         name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                if not self.state_only:
                    rew_input = tf.concat([self.obs_t, self.act_t], axis=1)
                with tf.variable_scope('reward'):
                    self.reward = reward_arch(rew_input,
                                              dout=1,
                                              **reward_arch_args)
                    #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

                # value function shaping
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t,
                                                                    dout=1)

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            log_q_tau = self.lprobs

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)
            cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))

            self.loss = cent_loss
            tot_loss = self.loss
            self.step = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss)
            self._make_param_ops(_vs)

    def fit(self,
            paths,
            policy=None,
            batch_size=32,
            logger=None,
            lr=1e-3,
            **kwargs):

        if self.fusion is not None:
            old_paths = self.fusion.sample_paths(n=len(paths))
            self.fusion.add_paths(paths)
            paths = paths + old_paths

        # eval samples under current policy
        self._compute_path_probs(paths, insert=True)

        # eval expert log probs under current policy
        self.eval_expert_probs(self.expert_trajs, policy, insert=True)

        self._insert_next_state(paths)
        self._insert_next_state(self.expert_trajs)
        obs, obs_next, acts, acts_next, path_probs = \
            self.extract_paths(paths,
                               keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs'))
        expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = \
            self.extract_paths(self.expert_trajs,
                               keys=('observations', 'observations_next', 'actions', 'actions_next', 'a_logprobs'))

        # Train discriminator
        for it in TrainingIterator(self.max_itrs, heartbeat=5):
            nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \
                self.sample_batch(obs_next, obs, acts_next, acts, path_probs, batch_size=batch_size)

            nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \
                self.sample_batch(expert_obs_next, expert_obs, expert_acts_next, expert_acts, expert_probs, batch_size=batch_size)

            # Build feed dict
            labels = np.zeros((batch_size * 2, 1))
            labels[batch_size:] = 1.0
            obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
            nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch],
                                        axis=0)
            act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
            nact_batch = np.concatenate([nact_batch, nexpert_act_batch],
                                        axis=0)
            lprobs_batch = np.expand_dims(np.concatenate(
                [lprobs_batch, expert_lprobs_batch], axis=0),
                                          axis=1).astype(np.float32)
            feed_dict = {
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.nobs_t: nobs_batch,
                self.nact_t: nact_batch,
                self.labels: labels,
                self.lprobs: lprobs_batch,
                self.lr: lr
            }

            loss, _ = tf.get_default_session().run([self.loss, self.step],
                                                   feed_dict=feed_dict)
            it.record('loss', loss)
            if it.heartbeat:
                print(it.itr_message())
                mean_loss = it.pop_mean('loss')
                print('\tLoss:%f' % mean_loss)

        if logger:
            logger.record_tabular('GCLDiscrimLoss', mean_loss)
            #obs_next = np.r_[obs_next, np.expand_dims(obs_next[-1], axis=0)]
            energy, logZ, dtau = tf.get_default_session().run(
                [self.reward, self.value_fn, self.discrim_output],
                feed_dict={
                    self.act_t: acts,
                    self.obs_t: obs,
                    self.nobs_t: obs_next,
                    self.nact_t: acts_next,
                    self.lprobs: np.expand_dims(path_probs, axis=1)
                })
            energy = -energy
            logger.record_tabular('GCLLogZ', np.mean(logZ))
            logger.record_tabular('GCLAverageEnergy', np.mean(energy))
            logger.record_tabular('GCLAverageLogPtau', np.mean(-energy - logZ))
            logger.record_tabular('GCLAverageLogQtau', np.mean(path_probs))
            logger.record_tabular('GCLMedianLogQtau', np.median(path_probs))
            logger.record_tabular('GCLAverageDtau', np.mean(dtau))

            #expert_obs_next = np.r_[expert_obs_next, np.expand_dims(expert_obs_next[-1], axis=0)]
            energy, logZ, dtau = tf.get_default_session().run(
                [self.reward, self.value_fn, self.discrim_output],
                feed_dict={
                    self.act_t: expert_acts,
                    self.obs_t: expert_obs,
                    self.nobs_t: expert_obs_next,
                    self.nact_t: expert_acts_next,
                    self.lprobs: np.expand_dims(expert_probs, axis=1)
                })
            energy = -energy
            logger.record_tabular('GCLAverageExpertEnergy', np.mean(energy))
            logger.record_tabular('GCLAverageExpertLogPtau',
                                  np.mean(-energy - logZ))
            logger.record_tabular('GCLAverageExpertLogQtau',
                                  np.mean(expert_probs))
            logger.record_tabular('GCLMedianExpertLogQtau',
                                  np.median(expert_probs))
            logger.record_tabular('GCLAverageExpertDtau', np.mean(dtau))
        return mean_loss

    def eval(self, paths, **kwargs):
        """
        Return bonus
        """
        if self.score_discrim:
            self._compute_path_probs(paths, insert=True)
            obs, obs_next, acts, path_probs = self.extract_paths(
                paths,
                keys=('observations', 'observations_next', 'actions',
                      'a_logprobs'))
            path_probs = np.expand_dims(path_probs, axis=1)
            scores = tf.get_default_session().run(self.discrim_output,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs,
                                                      self.nobs_t: obs_next,
                                                      self.lprobs: path_probs
                                                  })
            score = np.log(scores) - np.log(1 - scores)
            score = score[:, 0]
        else:
            obs, acts = self.extract_paths(paths)
            reward = tf.get_default_session().run(self.reward,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs
                                                  })
            score = reward[:, 0]
        return self.unpack(score, paths)

    def eval_single(self, obs):
        reward = tf.get_default_session().run(self.reward,
                                              feed_dict={self.obs_t: obs})
        score = reward[:, 0]
        return score

    def debug_eval(self, paths, **kwargs):
        obs, acts = self.extract_paths(paths)
        reward, v, qfn = tf.get_default_session().run(
            [self.reward, self.value_fn, self.qfn],
            feed_dict={
                self.act_t: acts,
                self.obs_t: obs
            })
        return {
            'reward': reward,
            'value': v,
            'qfn': qfn,
        }
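
A hedged usage sketch of the full class: construct it against an environment, then alternate discriminator fitting and reward evaluation inside an outer policy-optimization loop. `collect_rollouts`, `policy`, `expert_trajs`, and `n_itrs` are placeholders and are not defined in the examples above.

irl = AIRL(env, expert_trajs=expert_trajs, state_only=True, fusion=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for itr in range(n_itrs):
        # Sample trajectories with the current policy (placeholder helper).
        paths = collect_rollouts(policy, env)
        # One discriminator fit step; returns the mean cross-entropy loss.
        disc_loss = irl.fit(paths, policy=policy, batch_size=32, lr=1e-3)
        # Learned per-path rewards (or log D - log(1 - D) if score_discrim=True).
        rewards = irl.eval(paths)
        # ...update `policy` with `rewards` using any RL algorithm...
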