    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size="normal",
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.0,
        dynamics_bonus=False,
        meta_rl=False,
    ):
        StochasticPolicy.__init__(self,
                                  scope,
                                  ob_space,
                                  ac_space,
                                  meta_rl=meta_rl)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu,
        )
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name="state")
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # Inputs to the policy and value function have different shapes at rollout
        # time and at optimization time, so we build the two graphs separately.
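        # Concretely (as visible below): the optimization graph drops the final
        # observation of each segment (ph_ob['obs'][:, :-1] with sy_nsteps - 1),
        # while the rollout graph consumes the full placeholder, and reuse=True
        # on the second apply_policy call makes the two graphs share weights.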
        (
            self.pdparam_opt,
            self.vpred_int_opt,
            self.vpred_ext_opt,
            self.snext_opt,
        ) = self.apply_policy(
            self.ph_ob['obs'][:, :-1],
            reuse=False,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps - 1,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        (
            self.pdparam_rollout,
            self.vpred_int_rollout,
            self.vpred_ext_rollout,
            self.snext_rollout,
        ) = self.apply_policy(
            self.ph_ob['obs'],
            reuse=True,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
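
# A minimal, hedged sketch (not from this codebase) of the RND-style bonus that
# define_self_prediction_rew is responsible for: a fixed, randomly initialized
# target network and a trained predictor embed the same observation, and the
# per-sample squared error between the two embeddings serves both as the
# intrinsic reward and as the predictor's training loss. Names and layer sizes
# here are illustrative assumptions, not the original implementation.
import tensorflow as tf

def rnd_bonus_sketch(obs, rep_size=512):
    # obs: float32 tensor of shape (batch, height, width, channels)
    flat = tf.layers.flatten(obs)
    with tf.variable_scope("target"):
        # Frozen random features: never trained.
        target = tf.layers.dense(flat, rep_size, trainable=False)
    with tf.variable_scope("predictor"):
        # Trained to imitate the frozen target network.
        hidden = tf.layers.dense(flat, rep_size, activation=tf.nn.relu)
        pred = tf.layers.dense(hidden, rep_size)
    # Per-sample prediction error = intrinsic reward; its mean is the auxiliary
    # loss minimized with respect to the predictor's variables only.
    int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(target) - pred), axis=-1)
    aux_loss = tf.reduce_mean(int_rew)
    return int_rew, aux_loss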
Example #2
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size='normal',
        maxpool=False,
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.,
        dynamics_bonus=False,
    ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement  #256
        hidsize *= enlargement  #256
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)
            self.step_prediction(convfeat=convfeat,
                                 rep_size=rep_size,
                                 enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
Example #3
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 num_agents=1,
                 rnd_type='rnd',
                 div_type='oracle',
                 indep_rnd=False,
                 indep_policy=False,
                 sd_type='oracle',
                 rnd_mask_prob=1.):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512

        self.rnd_mask = tf.placeholder(dtype=tf.float32,
                                       shape=(None, None, num_agents),
                                       name="rnd_mask")
        self.new_rnd_mask = tf.placeholder(dtype=tf.float32,
                                           shape=(None, None),
                                           name="new_rnd_mask")
        self.div_train_mask = tf.placeholder(dtype=tf.float32,
                                             shape=(None, None),
                                             name="div_train_mask")
        self.sample_agent_prob = tf.placeholder(dtype=tf.float32,
                                                shape=(
                                                    None,
                                                    None,
                                                ),
                                                name="sample_agent_prob")
        self.stage_label = tf.placeholder(dtype=tf.int32,
                                          shape=(None, None),
                                          name="stage_label")

        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        self.ph_count = tf.placeholder(dtype=tf.float32,
                                       shape=(),
                                       name="obcount")

        self.sep_ph_mean = tf.placeholder(dtype=tf.float32,
                                          shape=(
                                              None,
                                              None,
                                          ) + ob_space.shape[:2] + (1, ),
                                          name="sep_obmean")
        self.sep_ph_std = tf.placeholder(dtype=tf.float32,
                                         shape=(
                                             None,
                                             None,
                                         ) + ob_space.shape[:2] + (1, ),
                                         name="sep_obstd")
        self.sep_ph_count = tf.placeholder(dtype=tf.float32,
                                           shape=(),
                                           name="sep_obcount")

        self.game_score = tf.placeholder(dtype=tf.float32,
                                         shape=(None, None),
                                         name="game_score")
        self.last_rew_ob = tf.placeholder(dtype=ob_space.dtype,
                                          shape=(None, None) +
                                          tuple(ob_space.shape),
                                          name="last_rew_ob")

        self.div_ph_mean = tf.placeholder(dtype=tf.float32,
                                          shape=list(ob_space.shape[:2]) + [1],
                                          name="div_obmean")
        self.div_ph_std = tf.placeholder(dtype=tf.float32,
                                         shape=list(ob_space.shape[:2]) + [1],
                                         name="div_obstd")

        self.idle_agent_label = tf.placeholder(dtype=tf.int32,
                                               shape=(
                                                   None,
                                                   None,
                                               ),
                                               name="idle_agent_label")
        self.rew_agent_label = tf.placeholder(dtype=tf.int32,
                                              shape=(
                                                  None,
                                                  None,
                                              ),
                                              name="rew_agent_label")

        #self.var_ph_mean = tf.get_variable("var_ph_mean", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
        #self.var_ph_std = tf.get_variable("var_ph_std", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
        #self.var_ph_count = tf.get_variable("var_ph_count", (), initializer=tf.constant_initializer(0.0))

        self.sd_ph_mean = tf.placeholder(dtype=tf.float32,
                                         shape=list(ob_space.shape[:2]) + [1],
                                         name="sd_obmean")
        self.sd_ph_std = tf.placeholder(dtype=tf.float32,
                                        shape=list(ob_space.shape[:2]) + [1],
                                        name="sd_obstd")

        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement

        self.ob_rms_list = [
            RunningMeanStd(shape=list(ob_space.shape[:2]) + [1],
                           use_mpi=not update_ob_stats_independently_per_gpu)
            for _ in range(num_agents)
        ]
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)

        self.diversity_ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)

        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]

        self.memsize = memsize
        self.num_agents = num_agents
        self.indep_rnd = indep_rnd
        self.indep_policy = indep_policy

        if num_agents <= 0:

            self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
                self.apply_policy(self.ph_ob[None][:,:-1],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=False,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps - 1,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
            self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
                self.apply_policy(self.ph_ob[None],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=True,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
        else:

            self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
                self.apply_multi_head_policy(self.ph_ob[None][:,:-1],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=False,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps - 1,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
            self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
                self.apply_multi_head_policy(self.ph_ob[None],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=True,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )

        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            #self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
            self.aux_loss, self.int_rew, self.feat_var, self.max_feat = self.define_multi_head_self_prediction_rew(
                convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)

        self.stage_rnd = tf.constant(1.)
        self.stage_prob = tf.constant(1.)

        if div_type == 'cls':
            with tf.variable_scope("div", reuse=False):
                #self.define_rew_discriminator(convfeat=convfeat, rep_size=256)
                with tf.variable_scope("int", reuse=False):
                    self.disc_logits, self.all_div_prob, self.sp_prob, self.div_rew, self.disc_pd, self.disc_nlp = self.define_rew_discriminator_v2(
                        convfeat=convfeat, rep_size=512, use_rew=True)
        else:
            self.div_rew = tf.constant(0.)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
Example #4
    def __init__(self, scope, ob_space, ac_space,
                 policy_size='normal', maxpool=False, extrahid=True, hidsize=128, memsize=128, rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 exploration_type='bottleneck', beta=0.001, rew_counter=None
                 ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)

        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {
            'small': 1,
            'normal': 2,
            'large': 4
        }[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obmean")  # (84, 84, 1)
        self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obstd")    # (84, 84, 1)
        memsize *= enlargement          # memsize = 256
        hidsize *= enlargement          # hidsize = 256
        convfeat = 16*enlargement       # convfeat = 32
        self.ob_rms = RunningMeanStd(shape=list(ob_space.shape[:2])+[1], use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')  # (None, 256)
        pdparamsize = self.pdtype.param_shape()[0]     # 18, i.e. the number of discrete actions
        self.memsize = memsize

        # Inputs to the policy and value function have different shapes at rollout time and at optimization time, so we build the two graphs separately.
        
        # pdparam_opt.shape=(None, None, 18), vpred_int_opt.shape=(None, None), vpred_ext_opt.shape=(None, None), snext_opt.shape=(None, 256)
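        # The two leading None dimensions are (sy_nenvs, sy_nsteps - 1) for the
        # optimization graph and (sy_nenvs, sy_nsteps) for the rollout graph.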
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,                  # 256
                              memsize=memsize,                  # 256
                              extrahid=extrahid,                # True
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize)           # 18
                              
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize)

        self.exploration_type = exploration_type
        self.max_table = 0

        self.define_bottleneck_rew(convfeat=convfeat,
                                   rep_size=rep_size // 8,  # integer division keeps the representation size an int under Python 3
                                   enlargement=enlargement,
                                   beta=beta,
                                   rew_counter=rew_counter)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)    # action distribution (softmax over the policy logits)

        self.a_samp = pd.sample()                 # sample an action
        self.nlp_samp = pd.neglogp(self.a_samp)   # negative log-probability of the sampled action
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
        self.a_samp_opt = self.pd_opt.sample()

        self.ph_istate = ph_istate

        self.scope = scope

        
        #####################################################
        ########## The code below is not actually used ######
        #####################################################
        # for gradcam policy
        a_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)   # (None,None) -> (None,None,18)
        # equivalent to selecting pdparam_opt at the positions of the executed (one-hot) actions
        loss_cam_pol = tf.reduce_mean(tf.multiply(self.pdparam_opt, a_one_hot))  # scalar
        
        self.conv_out = tf.get_default_graph().get_tensor_by_name('ppo/pol/Relu_2:0')
        self.grads = tf.gradients(loss_cam_pol, self.conv_out)[0]
        # for gradcam aux
        loss_cam_aux = self.kl
        if int(str(tf.__version__).split('.')[1]) < 10:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2/Maximum:0')
        else:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2:0')
        self.grads_aux = tf.abs(tf.gradients(loss_cam_aux, self.conv_aux_out)[0])

        # self.cams is not actually used
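        # Grad-CAM recipe implemented below: average the gradients over the two
        # spatial dimensions to get one weight per channel, broadcast the weights
        # back over the 6x6 feature map, take the channel-weighted sum, and clamp
        # at zero so only positively contributing regions remain.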
        weights = tf.reduce_mean(tf.reduce_mean(self.grads, 2), 1)
        weights = tf.expand_dims(tf.expand_dims(weights, axis=1), axis=1)
        weights = tf.tile(weights, [1, 6, 6, 1])
        cams = tf.reduce_sum((weights * self.conv_out), axis=3)
        self.cams = tf.maximum(cams, tf.zeros_like(cams))

        # self.cams_aux is not actually used
        weights_aux = tf.reduce_mean(tf.reduce_mean(self.grads_aux, 2), 1)
        weights_aux = tf.expand_dims(tf.expand_dims(weights_aux, axis=1), axis=1)
        weights_aux = tf.tile(weights_aux, [1, 7, 7, 1])
        cams_aux = tf.nn.relu(tf.reduce_sum((weights_aux * self.conv_aux_out), axis=3))
        self.cams_aux = tf.maximum(cams_aux, tf.zeros_like(cams_aux))
Example #5
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 action_balance_coef=1.,
                 array_action=True):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        self.action_balance_coef = action_balance_coef
        self.array_action = array_action

        self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        self.rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= self.enlargement
        hidsize *= self.enlargement
        self.convfeat = 16 * self.enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # self.int_rew_ab = None
        # self.int_rew_ab_opt = None
        if self.action_balance_coef is not None:
            # self.action_one_hot_list_rollout = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
            # with tf.device('/cpu:0'):
            self.action_one_hot_rollout = get_action_one_hot(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)

            if self.array_action:
                # with tf.device('/cpu:0'):
                self.action_encode_array_rollout = get_action_encode_array(
                    self.ac_space.n, self.sy_nenvs, self.sy_nsteps,
                    ob_space.shape[:2])
                # self.action_encode_array_rollout, self.split_lengths = get_action_encode_array(
                #     self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])

            self.feat_var_ab, self.max_feat_ab, self.int_rew_ab, self.int_rew_ab_rollout, self.aux_loss_ab = \
                self.define_action_balance_rew(ph_ob=self.ph_ob[None],
                                               action_one_hot=self.action_one_hot_rollout,
                                               convfeat=self.convfeat,
                                               rep_size=self.rep_size, enlargement=self.enlargement,
                                               sy_nenvs=self.sy_nenvs,
                                               sy_nsteps=self.sy_nsteps,
                                               )
            # self.feat_var_ab_opt, self.max_feat_ab_opt, self.int_rew_ab_opt, self.aux_loss_ab = \
            #     self.define_action_balance_rew(ph_ob=self.ph_ob[None][:, :-1],
            #                                    action_one_hot=self.action_one_hot_list_opt,
            #                                    convfeat=self.convfeat,
            #                                    rep_size=self.rep_size, enlargement=self.enlargement,
            #                                    sy_nenvs=self.sy_nenvs,
            #                                    sy_nsteps=self.sy_nsteps - 1,
            #                                    )

            self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

        # Inputs to the policy and value function have different shapes at rollout
        # time and at optimization time, so we build the two graphs separately.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt, self.logits_raw_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout, _ = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=self.convfeat,
                                                rep_size=self.rep_size,
                                                enlargement=self.enlargement)
        else:
            self.define_self_prediction_rew(convfeat=self.convfeat,
                                            rep_size=self.rep_size,
                                            enlargement=self.enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate