Example #1
    def __init__(
            self,
            env,
            emp_fn_arch=relu_net,  #_dropout,
            scope='efn',
            max_itrs=100,
            fusion=False,
            name='empowerment'):
        super(Empowerment, self).__init__()
        env_spec = env.spec

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        assert emp_fn_arch is not None
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='obs')
            self.act_qvar = tf.placeholder(tf.float32, [None, 1],
                                           name='act_qvar')
            self.act_policy = tf.placeholder(tf.float32, [None, 1],
                                             name='act_policy')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            # empowerment function
            with tf.variable_scope(scope):
                self.empowerment = emp_fn_arch(self.obs_t, dout=1)

            cent_loss = tf.losses.mean_squared_error(
                predictions=(self.empowerment + self.act_policy),
                labels=self.act_qvar)

            self.loss_emp = cent_loss
            tot_loss_emp = self.loss_emp
            self.step_emp = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss_emp)

            self._make_param_ops(_vs)
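
Example #1 (and most of the examples below) defaults its network argument to a relu_net callable that is not shown here. As a point of reference only, a minimal sketch of the kind of helper these constructors appear to expect, with assumed layer counts and widths (TF 1.x style, matching the placeholders above):

import tensorflow as tf

def relu_net(x, dout=1, layers=2, d_hidden=32):
    # Assumed sketch: a plain fully connected ReLU network whose layer count
    # and hidden width are illustrative defaults, not taken from the snippets.
    out = x
    for i in range(layers):
        out = tf.layers.dense(out, d_hidden, activation=tf.nn.relu,
                              name='relu_layer_%d' % i)
    return tf.layers.dense(out, dout, activation=None, name='output_layer')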
Example #2
    def __init__(self, env,
                 expert_trajs=None,
                 qvar=None,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='qvar'):
        super(Qvar, self).__init__()
        env_spec = env.spec
        if qvar is not None:
            self.qvar = qvar

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        self.set_demos(expert_trajs)
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs')
            self.lr = tf.placeholder(tf.float32, (), name='lr') 

            with tf.variable_scope('q_var') as dvs:
                q_input = tf.concat([self.obs_t, self.nobs_t], axis=1)
                self.act_predicted = self.qvar.dist_info_sym(q_input, None)

            self.loss_q = tf.losses.mean_squared_error(
                predictions=self.act_predicted["mean"], labels=self.act_t)
            tot_loss_q = self.loss_q

            self.step_q = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss_q)
            self._make_param_ops(_vs)
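
As a usage sketch only (the session handling and batch layout are assumptions, not part of the snippet), the Qvar graph above could be driven like this, feeding the placeholders it defines and running its step_q op:

def qvar_train_step(sess, model, obs, next_obs, acts, lr=1e-3):
    # Regress the predicted action mean q_var(a | s, s') onto the actions
    # actually taken; `model` is an instance built by the constructor above.
    loss, _ = sess.run(
        [model.loss_q, model.step_q],
        feed_dict={model.obs_t: obs,
                   model.nobs_t: next_obs,
                   model.act_t: acts,
                   model.lr: lr})
    return loss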
Example #3
    def __init__(self, env,
                 expert_trajs=None,
                 reward_arch=relu_net,
                 reward_arch_args=None,
                 value_fn_arch=relu_net,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='airl'):
        super(AIRL, self).__init__()
        env_spec = env.spec
        if reward_arch_args is None:
            reward_arch_args = {}

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                if not self.state_only:
                    rew_input = tf.concat([self.obs_t, self.act_t], axis=1)
                with tf.variable_scope('reward'):
                    self.reward = reward_arch(rew_input, dout=1, **reward_arch_args)
                    #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

                # value function shaping
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t, dout=1)

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            log_q_tau = self.lprobs

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau-log_pq)
            cent_loss = -tf.reduce_mean(self.labels*(log_p_tau-log_pq) + (1-self.labels)*(log_q_tau-log_pq))

            self.loss = cent_loss
            tot_loss = self.loss
            self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(tot_loss)
            self._make_param_ops(_vs)
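
The graph in Example #3 implements the AIRL discriminator D = exp(f) / (exp(f) + pi(a|s)) and its binary cross-entropy loss via reduce_logsumexp. The same arithmetic in plain numpy, with made-up numbers purely for illustration:

import numpy as np

log_p_tau = np.array([[2.0], [-1.0]])   # f(s, a, s') for two transitions
log_q_tau = np.array([[0.5], [0.3]])    # log pi(a|s) under the current policy
labels = np.array([[1.0], [0.0]])       # 1 = expert transition, 0 = policy

log_pq = np.logaddexp(log_p_tau, log_q_tau)        # log(exp(f) + pi(a|s))
discrim_output = np.exp(log_p_tau - log_pq)        # D, always in (0, 1)
cent_loss = -np.mean(labels * (log_p_tau - log_pq) +
                     (1 - labels) * (log_q_tau - log_pq))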
Example #4
    def __init__(self,
                 env,
                 expert_trajs=None,
                 discrim_arch=relu_net,
                 discrim_arch_args={},
                 normalize_reward=False,
                 score_dtau=False,
                 init_itrs=None,
                 discount=1.0,
                 l2_reg=0,
                 state_only=False,
                 shaping_with_actions=False,
                 max_itrs=100,
                 fusion=False,
                 fusion_subsample=0.5,
                 action_penalty=0.0,
                 name='trajprior'):
        super(AIRL, self).__init__()
        env_spec = env.spec
        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=fusion_subsample)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        if isinstance(env.action_space, Box):
            self.continuous = True
        else:
            self.continuous = False
        self.normalize_reward = normalize_reward
        self.score_dtau = score_dtau
        self.init_itrs = init_itrs
        self.gamma = discount
        #assert fitted_value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO],
                                         name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            #obs_act = tf.concat([self.obs_t, self.act_t], axis=1)
            with tf.variable_scope('discrim') as dvs:
                if self.state_only:
                    with tf.variable_scope('energy') as vs:
                        # reward function (or q-function)
                        self.energy = discrim_arch(self.obs_t,
                                                   dout=1,
                                                   **discrim_arch_args)
                        energy_vars = tf.get_collection(
                            tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                else:
                    if self.continuous:
                        obs_act = tf.concat([self.obs_t, self.act_t], axis=1)
                        with tf.variable_scope('energy') as vs:
                            # reward function (or q-function)
                            self.energy = discrim_arch(obs_act,
                                                       dout=1,
                                                       **discrim_arch_args)
                            energy_vars = tf.get_collection(
                                tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=vs.name)
                    else:
                        raise ValueError(
                            'Discrete action spaces are not supported here.')

                if shaping_with_actions:
                    nobs_act = tf.concat([self.nobs_t, self.nact_t], axis=1)
                    obs_act = tf.concat([self.obs_t, self.act_t], axis=1)
                else:
                    nobs_act = self.nobs_t
                    obs_act = self.obs_t

                # with tf.variable_scope('vfn'):
                #     fitted_value_fn_n = fitted_value_fn_arch(nobs_act, dout=1)
                # with tf.variable_scope('vfn', reuse=True):
                #     self.value_fn = fitted_value_fn = fitted_value_fn_arch(obs_act, dout=1)

                self.value_fn = tf.zeros(shape=[])

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)

                if action_penalty > 0:
                    self.r = r = -self.energy + action_penalty * tf.reduce_sum(
                        tf.square(self.act_t), axis=1, keepdims=True)
                else:
                    self.r = r = -self.energy

                self.qfn = r  #+self.gamma*fitted_value_fn_n
                log_p_tau = r  #  + self.gamma*fitted_value_fn_n - fitted_value_fn
                discrim_vars = tf.get_collection('reg_vars', scope=dvs.name)

            log_q_tau = self.lprobs

            if l2_reg > 0:
                reg_loss = l2_reg * tf.reduce_sum(
                    [tf.reduce_sum(tf.square(var)) for var in discrim_vars])
            else:
                reg_loss = 0

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.d_tau = tf.exp(log_p_tau - log_pq)
            cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))

            self.loss = cent_loss
            tot_loss = self.loss + reg_loss
            self.step = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss)
            self._make_param_ops(_vs)
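
The l2_reg branch in Example #4 reads weights from a custom 'reg_vars' collection, so whatever discrim_arch is passed in is expected to register its variables there. A hedged sketch of such a registration, where the helper name and layer sizes are assumptions:

import tensorflow as tf

def relu_net_with_reg(x, dout=1, d_hidden=32):
    h = tf.layers.dense(x, d_hidden, activation=tf.nn.relu, name='h0')
    out = tf.layers.dense(h, dout, activation=None, name='out')
    # Register the kernels so tf.get_collection('reg_vars', scope=dvs.name)
    # in the constructor above can find them for the L2 penalty.
    scope_name = tf.get_variable_scope().name
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=scope_name):
        if 'kernel' in var.name:
            tf.add_to_collection('reg_vars', var)
    return out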
Example #5
    def __init__(self,
                 env,
                 expert_trajs=None,
                 reward_arch=relu_net,
                 reward_arch_args=None,
                 value_fn_arch=relu_net,
                 score_discrim=False,
                 sess=None,
                 discount=1.0,
                 max_nstep=10,
                 n_value_funct=1,
                 n_rew_funct=1,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 debug=False,
                 score_method=None,
                 name='airl'):
        super(AIRL_Bootstrap, self).__init__()
        env_spec = env.spec
        if reward_arch_args is None:
            reward_arch_args = {}

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs
        self.max_nstep = max_nstep
        self.n_value_funct = n_value_funct
        self.n_rew_funct = n_rew_funct

        self.reward_arch = reward_arch
        self.reward_arch_args = reward_arch_args
        self.value_fn_arch = value_fn_arch

        self.score_method = score_method

        self.debug = debug

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, None, self.dO],
                                        name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO],
                                         name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, None, self.dU],
                                        name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            number_obs = tf.shape(self.obs_t)[1]

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                if not self.state_only:
                    rew_input = tf.concat([self.obs_t, self.act_t], axis=2)

                self.reward = [None for i in range(self.n_rew_funct)]
                self.value_fn = [None for i in range(self.n_value_funct)]
                fitted_value_fn_n = [None for i in range(self.n_value_funct)]
                self.qfn = [[None for i in range(self.n_value_funct)]
                            for j in range(self.n_rew_funct)]
                self.discrim_output = [[
                    None for i in range(self.n_value_funct)
                ] for j in range(self.n_rew_funct)]
                self.loss = [[None for i in range(self.n_value_funct)]
                             for j in range(self.n_rew_funct)]
                self.step = [[None for i in range(self.n_value_funct)]
                             for j in range(self.n_rew_funct)]

                log_q_tau = self.lprobs

                for i in range(self.n_rew_funct):
                    with tf.variable_scope('reward_%d' % (i),
                                           reuse=tf.AUTO_REUSE):
                        self.reward[i] = self.reward_arch(
                            tf.reshape(rew_input, [-1, rew_input.shape[2]]),
                            dout=1,
                            **self.reward_arch_args)
                        self.reward[i] = tf.reshape(self.reward[i],
                                                    [-1, number_obs, 1])
                        #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                for j in range(self.n_value_funct):
                    # value function shaping
                    with tf.variable_scope('vfn_%d' % (j),
                                           reuse=tf.AUTO_REUSE):
                        fitted_value_fn_n[j] = self.value_fn_arch(self.nobs_t,
                                                                  dout=1)
                    with tf.variable_scope('vfn_%d' % (j),
                                           reuse=tf.AUTO_REUSE):
                        self.value_fn[j] = self.value_fn_arch(
                            self.obs_t[:, 0, :], dout=1)

                self.avg_reward = tf.reduce_mean(tf.stack(self.reward), axis=0)

                gamma_coefs = tf.concat([
                    tf.ones([1], dtype=tf.float32),
                    self.gamma * tf.ones([number_obs - 1], dtype=tf.float32)
                ], axis=0)
                gamma_coefs = tf.cumprod(gamma_coefs)
                gamma_coefs = tf.expand_dims(gamma_coefs, axis=1)

                for i in range(self.n_rew_funct):
                    for j in range(self.n_value_funct):
                        # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                        self.qfn[i][j] = tf.reduce_sum(
                            self.reward[i] * gamma_coefs,
                            axis=1) + tf.math.pow(
                                tf.constant(self.gamma),
                                tf.to_float(number_obs)) * fitted_value_fn_n[j]
                        log_p_tau = self.qfn[i][j] - self.value_fn[j]

                        log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau],
                                                     axis=0)
                        self.discrim_output[i][j] = tf.exp(log_p_tau - log_pq)
                        cent_loss = -tf.reduce_mean(self.labels *
                                                    (log_p_tau - log_pq) +
                                                    (1 - self.labels) *
                                                    (log_q_tau - log_pq))

                        self.loss[i][j] = cent_loss

                        self.step[i][j] = tf.train.AdamOptimizer(
                            learning_rate=self.lr).minimize(self.loss[i][j])

                self._combine_predictions()
            self._make_param_ops(_vs)
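
The gamma_coefs block in Example #5 builds the vector [1, gamma, gamma^2, ...] with tf.cumprod and uses it to form an n-step return that bootstraps from the value network. The same computation for a single trajectory segment in plain numpy, with made-up numbers:

import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.5], [0.25]])    # r_t ... r_{t+n-1}, shape (n, 1)
bootstrap_value = 2.0                         # stands in for V(s_{t+n})

gamma_coefs = np.cumprod(
    np.concatenate([[1.0], gamma * np.ones(len(rewards) - 1)]))
gamma_coefs = gamma_coefs[:, None]            # shape (n, 1), like expand_dims
n_step_q = (np.sum(rewards * gamma_coefs) +
            gamma ** len(rewards) * bootstrap_value)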
Example #6
    def __init__(
            self,
            *,
            env_spec,  # No good default, but we do need to have it
            expert_trajs=None,
            reward_arch=cnn_net,
            reward_arch_args={},
            value_fn_arch=cnn_net,
            score_discrim=False,
            discount=1.0,
            state_only=False,
            max_itrs=100,
            fusion=False,
            name='airl',
            drop_framestack=False,
            only_show_scores=False,
            rescore_expert_trajs=True,
            encoder_loc=None):
        super(AIRL, self).__init__()

        # Write down everything that we're going to need in order to restore
        # this. All of these arguments are serializable, so it's pretty easy
        self.init_args = dict(model=AtariAIRL,
                              env_spec=env_spec,
                              expert_trajs=expert_trajs,
                              reward_arch=reward_arch,
                              reward_arch_args=reward_arch_args,
                              value_fn_arch=value_fn_arch,
                              score_discrim=score_discrim,
                              discount=discount,
                              state_only=state_only,
                              max_itrs=max_itrs,
                              fusion=fusion,
                              name=name,
                              rescore_expert_trajs=rescore_expert_trajs,
                              drop_framestack=drop_framestack,
                              only_show_scores=only_show_scores,
                              encoder_loc=encoder_loc)

        if encoder_loc:
            self.encoder = encoding.VariationalAutoEncoder.load(encoder_loc)
        else:
            self.encoder = None
        self.encode_fn = None
        if self.encoder:
            if state_only:
                self.encode_fn = self.encoder.base_vector
            else:
                self.encode_fn = self.encoder.encode

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None

        if self.encoder:
            self.dO = self.encoder.encoding_shape
            self.dOshape = self.encoder.encoding_shape
        else:
            self.dO = env_spec.observation_space.flat_dim
            self.dOshape = env_spec.observation_space.shape

        if drop_framestack:
            assert len(self.dOshape) == 3
            self.dOshape = (*self.dOshape[:-1], 1)

        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env_spec.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        #self.set_demos(expert_trajs)
        self.expert_trajs = expert_trajs
        self.state_only = state_only
        self.max_itrs = max_itrs
        self.drop_framestack = drop_framestack
        self.only_show_scores = only_show_scores

        self.expert_cache = None
        self.rescore_expert_trajs = rescore_expert_trajs
        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            obs_dtype = tf.int8 if reward_arch == cnn_net else tf.float32
            self.obs_t = tf.placeholder(obs_dtype,
                                        list((None, ) + self.dOshape),
                                        name='obs')
            self.nobs_t = tf.placeholder(obs_dtype,
                                         list((None, ) + self.dOshape),
                                         name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                with tf.variable_scope('reward'):
                    if self.state_only:
                        self.reward = reward_arch(rew_input,
                                                  dout=1,
                                                  **reward_arch_args)
                    else:
                        print("Not state only", self.act_t)
                        self.reward = reward_arch(rew_input,
                                                  actions=self.act_t,
                                                  dout=1,
                                                  **reward_arch_args)
                # value function shaping
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t,
                                                                    dout=1)

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            log_q_tau = self.lprobs

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)
            self.accuracy, self.update_accuracy = tf.metrics.accuracy(
                labels=self.labels, predictions=self.discrim_output > 0.5)
            self.loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))
            self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
                self.loss)
            self._make_param_ops(_vs)

            self.grad_reward = tf.gradients(self.reward,
                                            [self.obs_t, self.act_t])

            self.modify_obs = self.get_ablation_modifiers()

            self.score_mean = 0
            self.score_std = 1
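
Example #6 assumes a cnn_net helper that can optionally condition on actions (see the actions=self.act_t call). The original architecture is not shown; a rough, assumed sketch of such a network in the same TF 1.x style, with illustrative layer sizes:

import tensorflow as tf

def cnn_net(x, actions=None, dout=1, d_hidden=256):
    # Assumed sketch, not the original implementation.
    x = tf.cast(x, tf.float32) / 255.0   # the obs placeholders may be int8
    h = tf.layers.conv2d(x, 32, 8, strides=4, activation=tf.nn.relu, name='c1')
    h = tf.layers.conv2d(h, 64, 4, strides=2, activation=tf.nn.relu, name='c2')
    h = tf.layers.flatten(h)
    if actions is not None:
        h = tf.concat([h, actions], axis=1)   # reward conditioned on the action
    h = tf.layers.dense(h, d_hidden, activation=tf.nn.relu, name='fc')
    return tf.layers.dense(h, dout, activation=None, name='out')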
Example #7
    def __init__(self,
                 env,
                 expert_trajs=None,
                 reward_arch=relu_net,
                 reward_arch_args=None,
                 value_fn_arch=relu_net,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='airl'):
        super(AIRL, self).__init__()
        env_spec = env.spec
        if reward_arch_args is None:
            reward_arch_args = {}

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='state/obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO],
                                         name='next_state/nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='action/act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU],
                                         name='next_action/nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                # if r(s)
                rew_input = self.obs_t
                # if r(s,a)
                if not self.state_only:
                    rew_input = tf.concat([self.obs_t, self.act_t], axis=1)
                ######################
                # (1) compute r(s)/r(s,a):
                ######################
                with tf.variable_scope('reward'):
                    self.reward = reward_arch(rew_input,
                                              dout=1,
                                              **reward_arch_args)
                    #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

                ########################
                # (2) value function shaping
                ########################
                # V(s')
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                # V(s)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t,
                                                                    dout=1)

                ######################################################
                # (3) compute f(s,a,s')
                #
                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                ######################################################

                # log p_tau(a|s) likelihood of action given state
                # self.qfn = Q(s,a) = r + \gamma * V(s')
                # log p_tau = Q(s,a) - V(s) = A(s,a) = f(s,a,s')

                # computes Q(s,a)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                # computes f(s,a,s') = log p_tau
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            # log pi(a|s)
            log_q_tau = self.lprobs

            # tf.reduce_logsumexp computes log(sum(exp(x))) along an axis, e.g.
            #   x = tf.constant([[0., 0., 0.], [0., 0., 0.]])
            #   tf.reduce_logsumexp(x)           # log(6)
            #   tf.reduce_logsumexp(x, axis=0)   # [log(2), log(2), log(2)]
            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)

            # computes D = exp(f) / (exp(f) + pi(a|s))
            self.discrim_output = tf.exp(log_p_tau - log_pq)

            # binary cross-entropy loss: -E[ labels*log(D) + (1-labels)*log(1-D) ]
            cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))

            self.loss = cent_loss
            tot_loss = self.loss
            self.step = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(tot_loss)
            self._make_param_ops(_vs)
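
For completeness, a usage sketch of one discriminator update for the AIRL model in Example #7. The batch dictionaries, their keys, and the helper name are assumptions; the convention that expert transitions are labelled 1 and policy transitions 0, with lprobs holding log pi(a|s) for every row, follows the loss defined above.

import numpy as np

def airl_discrim_step(sess, model, expert_batch, policy_batch, logprobs, lr=1e-3):
    # `expert_batch` / `policy_batch` are assumed dicts of arrays with keys
    # 'obs', 'nobs', 'act'; `logprobs` holds log pi(a|s) for all rows below.
    n_exp = len(expert_batch['obs'])
    n_pol = len(policy_batch['obs'])
    feed = {
        model.obs_t: np.concatenate([expert_batch['obs'], policy_batch['obs']]),
        model.nobs_t: np.concatenate([expert_batch['nobs'], policy_batch['nobs']]),
        model.act_t: np.concatenate([expert_batch['act'], policy_batch['act']]),
        model.labels: np.concatenate([np.ones((n_exp, 1)), np.zeros((n_pol, 1))]),
        model.lprobs: np.asarray(logprobs).reshape(-1, 1),
        model.lr: lr,
    }
    loss, _ = sess.run([model.loss, model.step], feed_dict=feed)
    return loss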