# Imports assumed by this example: TensorFlow 1.x, NumPy, and the OpenAI
# baselines helpers used below (conv, fc, conv_to_fc, CategoricalPdType).
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc
from baselines.common.distributions import CategoricalPdType


class CNNPolicy(object):
    def __init__(self, sess, p, train_phase=True, has_state=False):
        # Variables are created by the acting copy of the model and reused
        # when the training-phase copy is built (reuse=True for training).
        with tf.variable_scope("model", reuse=train_phase) as scope:
            # Placeholders: observations (X) and auxiliary state (S)
            X = tf.placeholder(tf.uint8, p.OBS_SHAPE)
            S = tf.placeholder(tf.float32, p.STATE_SHAPE)
            scaled_x = tf.cast(X, tf.float32) / 255.

            # Helper functions that may be needed
            relu_activ = tf.nn.relu  # ReLU activation
            # Batch-normalization helper (currently unused); `phase` toggles
            # training-mode batch statistics.
            normalize = lambda layer, phase: tf.layers.batch_normalization(
                layer, center=True, scale=True, training=phase)
            # Model Details
            #h1 = relu_activ(conv(scaled_x,scope = 'conv1', nf = 10, rf = 5, stride = 1,init_scale=np.sqrt(2)))
            #h2 = relu_activ(conv(h1,scope = 'conv2', nf = 10, rf = 3, stride = 1))
            flattened_x = conv_to_fc(scaled_x)
            h1 = relu_activ(fc(flattened_x, scope='fc1', nh=20, init_scale=np.sqrt(2)))
            h2 = relu_activ(fc(h1, scope='fc2', nh=15, init_scale=np.sqrt(2)))
            hconcat = tf.concat([h2, S], axis=1)
            h3 = relu_activ(fc(hconcat, scope='fc3', nh=10, init_scale=np.sqrt(2)))
            hcommon = relu_activ(fc(h3, scope='fcommon', nh=10, init_scale=np.sqrt(2)))
            # Policy logits sized to the action count used by the
            # distribution below
            pi = fc(hcommon, scope="policy", nh=p.NUM_ACTIONS, init_scale=0.01)
            vf = fc(hcommon, scope="value", nh=1)

        self.pd_type = CategoricalPdType(p.NUM_ACTIONS)
        self.pd = self.pd_type.pdfromflat(pi)  # Action distribution, as in baselines

        v0 = vf[:, 0]  # Drop the trailing unit dimension
        a0 = self.pd.sample()  # Sample an action from the distribution
        neglogp0 = self.pd.neglogp(a0)  # Negative log-probability of the sampled action
        self.initial_state = None  # Only needed for recurrent models

        # Interfaces to the outside world
        def step(ob, state, *_args, **_kwargs):
            # v0 has shape (batch,), matching baselines' step interface
            a, v, neglogp = sess.run([a0, v0, neglogp0], {X: ob, S: state})
            return a, v, neglogp

        def value(ob, state, *_args, **_kwargs):
            return sess.run(v0, {X: ob, S: state})

        def hidden_value(ob, state, *_args, **_kwargs):
            """Return the shared hidden layer's activations (for debugging)."""
            #amodel = np.argmax(np.array(sess.run([pi], {X:ob,S:state})).flatten())
            #a = sess.run([a0], {X:ob,S:state})
            #adict = {"amodel":amodel,"asampler":a}
            return sess.run([hcommon], {X: ob, S: state})


        self.pi = pi
        self.vf = vf
        self.X = X
        self.S = S
        self.step = step
        self.value = value
        self.hidden_value = hidden_value  # Exposed for debugging
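
# A minimal usage sketch, not from the original source: `Params` is a
# hypothetical stand-in for the `p` object, and the shapes are illustrative.
if __name__ == "__main__":
    class Params:
        OBS_SHAPE = (None, 84, 84, 1)   # batch, height, width, channels
        STATE_SHAPE = (None, 4)         # batch, auxiliary state size
        NUM_ACTIONS = 3

    with tf.Session() as sess:
        policy = CNNPolicy(sess, Params(), train_phase=False)
        sess.run(tf.global_variables_initializer())
        ob = np.zeros((1, 84, 84, 1), dtype=np.uint8)
        state = np.zeros((1, 4), dtype=np.float32)
        action, value, neglogp = policy.step(ob, state)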
Example #2

# Assumed imports for this example: TensorFlow 1.x, the baselines TF
# utilities (for U.function), and the categorical distribution helper.
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.common.distributions import CategoricalPdType
def build_act(make_obs_ph,
              q_func,
              hr_func,
              num_actions,
              scope="deepq",
              reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    hr_func: (tf.Variable, int, str) -> tf.Variable
        the model that predicts per-action feedback scores (hr presumably
        stands for human reinforcement); called like q_func and returning a
        tensor of shape (batch_size, num_actions) used as logits for
        sampling feedback-preferred actions.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_rl_importance_ph = tf.placeholder(tf.float32, (),
                                                 name="update_rl_importance")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)
        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        predicted_feedback = hr_func(observations_ph.get(),
                                     num_actions,
                                     scope="hr_func")
        fb_logit_constant = 10
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        chose_rl = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1,
            dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0,
                    lambda: update_rl_importance_ph, lambda: rl_importance))
        _act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph,
            update_rl_importance_ph
        ],
                          outputs=output_actions,
                          givens={
                              update_eps_ph: -1.0,
                              update_rl_importance_ph: -1.0,
                              stochastic_ph: True
                          },
                          updates=[update_eps_expr, update_rl_importance_expr])

        def act(ob,
                stochastic=True,
                update_eps=-1,
                update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, update_rl_importance)

        return act
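
# A usage sketch under stated assumptions, not from the original source:
# ObservationInput is the TfInput wrapper from baselines.deepq.utils, and
# _mlp below is a minimal stand-in for real Q-value / feedback networks.
def _mlp(inpt, num_actions, scope, reuse=False):
    # Tiny fully connected network; a placeholder for a real model builder.
    with tf.variable_scope(scope, reuse=reuse):
        h = tf.layers.dense(tf.layers.flatten(inpt), 64, activation=tf.nn.relu)
        return tf.layers.dense(h, num_actions)

def _demo_build_act(env):
    from baselines.deepq.utils import ObservationInput
    make_obs_ph = lambda name: ObservationInput(env.observation_space, name=name)
    act = build_act(make_obs_ph, _mlp, _mlp, env.action_space.n)
    U.initialize()
    # Anneal exploration to 0.1 and mix 50/50 between RL and feedback actions.
    return act(env.reset()[None], update_eps=0.1, update_rl_importance=0.5)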
Example #3

# Assumed imports for this example: the baselines TF utilities and the deepq
# helpers for scope handling and the default parameter-noise filter.
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.common.distributions import CategoricalPdType
from baselines.deepq.build_graph import (scope_vars, absolute_scope_name,
                                         default_param_noise_filter)
def build_act_with_param_noise(make_obs_ph,
                               q_func,
                               hr_func,
                               num_actions,
                               scope="deepq",
                               reuse=None,
                               param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    hr_func: (tf.Variable, int, str) -> tf.Variable
        the model that predicts per-action feedback scores; same call
        signature and output shape as q_func.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(
            tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(
            tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        update_rl_importance_ph = tf.placeholder(tf.float32, (),
                                                 name="update_rl_importance")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable(
            "param_noise_scale", (),
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        param_noise_threshold = tf.get_variable(
            "param_noise_threshold", (),
            initializer=tf.constant_initializer(0.05),
            trainable=False)

        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(),
                                    num_actions,
                                    scope="perturbed_q_func")

        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(
                absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(
                        perturbed_var,
                        var + tf.random_normal(shape=tf.shape(var),
                                               mean=0.,
                                               stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the perturbation
        # is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(),
                                   num_actions,
                                   scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func",
                                            perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(tf.nn.softmax(q_values) *
                           (tf.log(tf.nn.softmax(q_values)) -
                            tf.log(tf.nn.softmax(q_values_adaptive))),
                           axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0,
                    lambda: update_param_noise_threshold_ph,
                    lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        predicted_feedback = hr_func(observations_ph.get(),
                                     num_actions,
                                     scope="hr_func")
        fb_logit_constant = 10
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        chose_rl = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1,
            dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0,
                    lambda: update_rl_importance_ph, lambda: rl_importance))
        updates = [
            update_eps_expr,
            tf.cond(
                reset_ph,
                lambda: perturb_vars(original_scope="q_func",
                                     perturbed_scope="perturbed_q_func"),
                lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(),
                    lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
            update_rl_importance_expr,
        ]
        _act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph, reset_ph,
            update_param_noise_threshold_ph, update_param_noise_scale_ph,
            update_rl_importance_ph
        ],
                          outputs=output_actions,
                          givens={
                              update_eps_ph: -1.0,
                              stochastic_ph: True,
                              reset_ph: False,
                              update_param_noise_threshold_ph: False,
                              update_param_noise_scale_ph: False,
                              update_rl_importance_ph: -1.0
                          },
                          updates=updates)

        def act(ob,
                reset=False,
                update_param_noise_threshold=False,
                update_param_noise_scale=False,
                stochastic=True,
                update_eps=-1,
                update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, reset,
                        update_param_noise_threshold, update_param_noise_scale,
                        update_rl_importance)

        return act
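
# Usage mirrors build_act, with extra controls for parameter-space noise
# (an illustrative sketch, not from the original source):
#
#     act = build_act_with_param_noise(make_obs_ph, model, feedback_model,
#                                      env.action_space.n)
#     U.initialize()
#     # Re-perturb the rollout network and adapt the noise scale.
#     action = act(ob[None], reset=True, update_param_noise_scale=True,
#                  update_param_noise_threshold=0.05, update_eps=0.1,
#                  update_rl_importance=0.5)[0]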
Example #4

    # Method excerpted from a larger policy class: self.ph_ob, self.sy_nenvs,
    # self.sy_nsteps, etc. are attributes of that class, and conv, fc, to2d
    # are the surrounding project's layer helpers.
    def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False):

        output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)]

        sample_prob = tf.reshape(self.sample_agent_prob,
                                 tf.stack(output_shape))
        game_score = tf.reshape(
            self.game_score,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        rew_agent_label = tf.reshape(
            self.rew_agent_label,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        #rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1)
        #rew_agent_label = tf.reshape(rew_agent_label,(-1,self.num_agents ))

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C

                phi = ph[:, 1:]
                phi = tf.cast(phi, tf.float32)
                # Flatten batch/time and keep only the most recent frame.
                phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                phi = phi / 255.

                last_rew_ob = self.last_rew_ob
                last_rew_ob = tf.cast(last_rew_ob, tf.float32)
                last_rew_ob = tf.reshape(
                    last_rew_ob,
                    (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:]
                last_rew_ob = last_rew_ob / 255.

                if use_rew:
                    phi = tf.concat([phi, last_rew_ob], axis=-1)

                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                #[20,20] [8,8]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                #[9,9] [7,7]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                phi = to2d(phi)

                phi = tf.nn.relu(
                    fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2)))
                phi = tf.nn.relu(
                    fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2)))
                disc_logits = fc(phi,
                                 'fc3r',
                                 nh=self.num_agents,
                                 init_scale=np.sqrt(2))

        one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
        one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents))

        flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1)
        all_div_prob = tf.reshape(
            flatten_all_div_prob,
            (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents))

        sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1)
        sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1))

        # Diversity reward: log-probability of the true agent index under the
        # discriminator, corrected by the agent-sampling probability.
        div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=disc_logits, labels=one_hot_gidx)
        base_rew = tf.log(0.01)  # constant baseline (currently unused)
        div_rew = div_rew - tf.log(sample_prob)

        div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        disc_pdtype = CategoricalPdType(self.num_agents)
        disc_pd = disc_pdtype.pdfromflat(disc_logits)

        # Negative log-probability of the labeled agent under the
        # discriminator; presumably its classification loss.
        disc_nlp = disc_pd.neglogp(rew_agent_label)

        return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp