Example #1
    def __init__(self, sess, p, train_phase=True, has_state=False):
        with tf.variable_scope("model", reuse=train_phase) as scope:  # reuse variables during the training phase
            # Placeholders
            X = tf.placeholder(tf.uint8, p.OBS_SHAPE)  # observations
            S = tf.placeholder(tf.float32, p.STATE_SHAPE)  # auxiliary state input
            scaled_x = tf.cast(X, tf.float32) / 255.

            # Helper functions that may be needed
            relu_activ = tf.nn.relu  # ReLU activation
            normalize = lambda layer, phase: tf.layers.batch_normalization(
                layer, center=True, scale=True, training=train_phase)  # batch normalization (phase is unused; always follows train_phase)
            # Model details
            # h1 = relu_activ(conv(scaled_x, scope='conv1', nf=10, rf=5, stride=1, init_scale=np.sqrt(2)))
            # h2 = relu_activ(conv(h1, scope='conv2', nf=10, rf=3, stride=1))
            flattened_x = conv_to_fc(scaled_x)
            h1 = relu_activ(fc(flattened_x, scope='fc1', nh=20, init_scale=np.sqrt(2)))
            h2 = relu_activ(fc(h1, scope='fc2', nh=15, init_scale=np.sqrt(2)))
            hconcat = tf.concat([h2, S], axis=1)
            h3 = relu_activ(fc(hconcat, scope='fc3', nh=10, init_scale=np.sqrt(2)))
            hcommon = relu_activ(fc(h3, scope='fcommon', nh=10, init_scale=np.sqrt(2)))
            pi = fc(hcommon, scope='policy', nh=3, init_scale=0.01)
            vf = fc(hcommon, scope='value', nh=1)

        self.pd_type = CategoricalPdType(p.NUM_ACTIONS)
        self.pd = self.pd_type.pdfromflat(pi)  # build the action distribution from the policy logits, as in baselines

        # Sample from the distribution
        v0 = vf[:, 0]  # drop the extra value dimension
        a0 = self.pd.sample()  # sample an action
        neglogp0 = self.pd.neglogp(a0)  # negative log-probability of the sampled action
        self.initial_state = None  # only needed for recurrent models

        # Interfaces to the outer world
        def step(ob, state, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, v0, neglogp0], {X: ob, S: state})  # v0 (not vf) so the value has no trailing dim
            return a, v, neglogp

        def value(ob, state, *_args, **_kwargs):
            return sess.run(v0, {X: ob, S: state})

        def hidden_value(ob, state, *_args, **_kwargs):
            """Return the shared hidden-layer activations (for debugging)."""
            # amodel = np.argmax(np.array(sess.run([pi], {X: ob, S: state})).flatten())
            # a = sess.run([a0], {X: ob, S: state})
            # adict = {"amodel": amodel, "asampler": a}
            return sess.run([hcommon], {X: ob, S: state})


        self.pi = pi
        self.vf = vf
        self.X = X
        self.S = S
        self.step = step
        self.value = value
        self.hidden_value = hidden_value  # exposed for debugging
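A minimal driver sketch, assuming the class above is named Model and that a hypothetical params object supplies OBS_SHAPE and STATE_SHAPE (with a leading batch dim of None) plus NUM_ACTIONS; none of these names come from the example itself:

import numpy as np
import tensorflow as tf

sess = tf.Session()
model = Model(sess, params, train_phase=False)  # Model and params are assumptions
sess.run(tf.global_variables_initializer())

obs = np.zeros((1,) + tuple(params.OBS_SHAPE[1:]), dtype=np.uint8)
state = np.zeros((1,) + tuple(params.STATE_SHAPE[1:]), dtype=np.float32)
action, value, neglogp = model.step(obs, state)  # sampled action, value estimate, -log pi(a|s)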
Example #2
def make_pdtype(ac_space):
    from cadm import spaces as custom_spaces
    from gym import spaces
    if isinstance(ac_space, custom_spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError
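A quick dispatch sketch (assumes the baselines pd types and gym are importable; the cadm branch only fires for the custom Box type):

import numpy as np
from gym import spaces

pdtype = make_pdtype(spaces.Discrete(5))  # -> CategoricalPdType(5)
pdtype = make_pdtype(spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32))  # -> DiagGaussianPdType(3)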
Example #3
    def _build(self):
        num_primitives = self.num_primitives
        num_hid_layers = self._num_hid_layers
        hid_size = self._hid_size

        self._obs = {}
        for ob_name, ob_shape in self._ob_shape.items():
            self._obs[ob_name] = U.get_placeholder(
                name="ob_{}".format(ob_name),
                dtype=tf.float32,
                shape=[None] + self._ob_shape[ob_name])
        self._prev_primitive = prev_primitive = U.get_placeholder(
            name="prev_primitive", dtype=tf.int32, shape=[None])

        with tf.variable_scope(self.name):
            self._scope = tf.get_variable_scope().name

            self.ob_rms = {}
            for ob_name in self.ob_type:
                with tf.variable_scope("ob_rms_{}".format(ob_name)):
                    self.ob_rms[ob_name] = RunningMeanStd(
                        shape=self._ob_shape[ob_name])
            obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) /
                   self.ob_rms[ob_name].std for ob_name in self.ob_type]
            obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
            obz = tf.concat(obz, -1)

            prev_primitive_one_hot = tf.one_hot(prev_primitive,
                                                num_primitives,
                                                name="prev_primitive_one_hot")
            obz = tf.concat([obz, prev_primitive_one_hot], -1)

            # value function
            with tf.variable_scope("vf"):
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = self._activation(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name="fc%d" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name="vpred",
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

            # meta policy
            with tf.variable_scope("pol"):
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = self._activation(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name="fc%d" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.selector = tf.layers.dense(
                    last_out,
                    num_primitives,
                    name="action",
                    kernel_initializer=U.normc_initializer(0.01))
                self.pdtype = pdtype = CategoricalPdType(num_primitives)
                self.pd = pdtype.pdfromflat(self.selector)

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic, self._prev_primitive] + self.obs,
                               [ac, self.vpred])
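A hedged sketch of driving the resulting act function (assuming meta_pi is an instance of this class and ob_list holds one observation batch per type, ordered like meta_pi.obs; both names are illustrative):

import numpy as np

prev_primitive = np.array([0], dtype=np.int32)  # index of the previously selected primitive
primitive, vpred = meta_pi._act(True, prev_primitive, *ob_list)  # stochastic=True samples from the categorical pd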
Example #4
    def _init(self, ob_space, ac_space, kind, atom_type_num, args):
        self.pdtype = MultiCatCategoricalPdType
        ### 0 Get input
        ob = {
            'adj':
            U.get_placeholder(
                name="adj",
                dtype=tf.float32,
                shape=[None, ob_space['adj'].shape[0], None, None]),
            'node':
            U.get_placeholder(name="node",
                              dtype=tf.float32,
                              shape=[None, 1, None, ob_space['node'].shape[2]])
        }
        # only used when evaluating a given action at training time
        self.ac_real = U.get_placeholder(name='ac_real',
                                         dtype=tf.int64,
                                         shape=[None,
                                                4])  # feed ground-truth action
        ob_node = tf.compat.v1.layers.dense(ob['node'],
                                            8,
                                            activation=None,
                                            use_bias=False,
                                            name='emb')  # embedding layer
        if args.bn == 1:
            ob_node = tf.compat.v1.layers.batch_normalization(ob_node, axis=-1)
        if args.has_concat == 1:
            emb_node = tf.concat(
                (GCN_batch(ob['adj'],
                           ob_node,
                           args.emb_size,
                           name='gcn1',
                           aggregate=args.gcn_aggregate), ob_node),
                axis=-1)
        else:
            emb_node = GCN_batch(ob['adj'],
                                 ob_node,
                                 args.emb_size,
                                 name='gcn1',
                                 aggregate=args.gcn_aggregate)
        if args.bn == 1:
            emb_node = tf.compat.v1.layers.batch_normalization(emb_node,
                                                               axis=-1)
        for i in range(args.layer_num_g - 2):
            if args.has_residual == 1:
                emb_node = GCN_batch(
                    ob['adj'],
                    emb_node,
                    args.emb_size,
                    name='gcn1_' + str(i + 1),
                    aggregate=args.gcn_aggregate) + self.emb_node1
            elif args.has_concat == 1:
                emb_node = tf.concat(
                    (GCN_batch(ob['adj'],
                               emb_node,
                               args.emb_size,
                               name='gcn1_' + str(i + 1),
                               aggregate=args.gcn_aggregate), self.emb_node1),
                    axis=-1)
            else:
                emb_node = GCN_batch(ob['adj'],
                                     emb_node,
                                     args.emb_size,
                                     name='gcn1_' + str(i + 1),
                                     aggregate=args.gcn_aggregate)
            if args.bn == 1:
                emb_node = tf.compat.v1.layers.batch_normalization(emb_node,
                                                                   axis=-1)
        emb_node = GCN_batch(ob['adj'],
                             emb_node,
                             args.emb_size,
                             is_act=False,
                             is_normalize=(args.bn == 0),
                             name='gcn2',
                             aggregate=args.gcn_aggregate)
        emb_node = tf.squeeze(emb_node, axis=1)  # B*n*f

        ### 1 only keep effective nodes
        # ob_mask = tf.cast(tf.transpose(tf.reduce_sum(ob['node'],axis=-1),[0,2,1]),dtype=tf.bool) # B*n*1
        ob_len = tf.reduce_sum(tf.squeeze(tf.cast(tf.cast(tf.reduce_sum(
            ob['node'], axis=-1),
                                                          dtype=tf.bool),
                                                  dtype=tf.float32),
                                          axis=-2),
                               axis=-1)  # B
        ob_len_first = ob_len - atom_type_num
        logits_mask = tf.sequence_mask(ob_len, maxlen=tf.shape(
            ob['node'])[2])  # mask all valid entry
        logits_first_mask = tf.sequence_mask(
            ob_len_first, maxlen=tf.shape(
                ob['node'])[2])  # mask valid entries minus atom_type_num (remove isolated nodes)

        if args.mask_null == 1:
            emb_node_null = tf.zeros(tf.shape(emb_node))
            emb_node = tf.where(condition=tf.tile(
                tf.expand_dims(logits_mask, axis=-1),
                (1, 1, emb_node.get_shape()[-1])),
                                x=emb_node,
                                y=emb_node_null)

        ## get graph embedding
        emb_graph = tf.reduce_sum(emb_node, axis=1, keepdims=True)
        if args.graph_emb == 1:
            emb_graph = tf.tile(emb_graph, [1, tf.shape(emb_node)[1], 1])
            emb_node = tf.concat([emb_node, emb_graph], axis=2)

        ### 2 predict stop
        emb_stop = tf.compat.v1.layers.dense(emb_node,
                                             args.emb_size,
                                             activation=tf.nn.relu,
                                             use_bias=False,
                                             name='linear_stop1')
        if args.bn == 1:
            emb_stop = tf.compat.v1.layers.batch_normalization(emb_stop,
                                                               axis=-1)
        self.logits_stop = tf.reduce_sum(emb_stop, axis=1)
        self.logits_stop = tf.compat.v1.layers.dense(
            self.logits_stop, 2, activation=None, name='linear_stop2_1')  # B*2
        # explicitly show node num
        # self.logits_stop = tf.concat((tf.reduce_mean(tf.compat.v1.layers.dense(emb_node, 32, activation=tf.nn.relu, name='linear_stop1'),axis=1),tf.reshape(ob_len_first/5,[-1,1])),axis=1)
        # self.logits_stop = tf.compat.v1.layers.dense(self.logits_stop, 2, activation=None, name='linear_stop2')  # B*2

        stop_shift = tf.constant([[0, args.stop_shift]], dtype=tf.float32)
        pd_stop = CategoricalPdType(-1).pdfromflat(flat=self.logits_stop +
                                                   stop_shift)
        ac_stop = pd_stop.sample()

        ### 3.1: select first (active) node
        # rules: only select effective nodes
        self.logits_first = tf.compat.v1.layers.dense(emb_node,
                                                      args.emb_size,
                                                      activation=tf.nn.relu,
                                                      name='linear_select1')
        self.logits_first = tf.squeeze(tf.compat.v1.layers.dense(
            self.logits_first, 1, activation=None, name='linear_select2'),
                                       axis=-1)  # B*n
        logits_first_null = tf.ones(tf.shape(self.logits_first)) * -1000
        self.logits_first = tf.where(condition=logits_first_mask,
                                     x=self.logits_first,
                                     y=logits_first_null)
        # using own prediction
        pd_first = CategoricalPdType(-1).pdfromflat(flat=self.logits_first)
        ac_first = pd_first.sample()
        mask = tf.one_hot(ac_first,
                          depth=tf.shape(emb_node)[1],
                          dtype=tf.bool,
                          on_value=True,
                          off_value=False)
        emb_first = tf.boolean_mask(emb_node, mask)
        emb_first = tf.expand_dims(emb_first, axis=1)
        # using ground-truth action
        ac_first_real = self.ac_real[:, 0]
        mask_real = tf.one_hot(ac_first_real,
                               depth=tf.shape(emb_node)[1],
                               dtype=tf.bool,
                               on_value=True,
                               off_value=False)
        emb_first_real = tf.boolean_mask(emb_node, mask_real)
        emb_first_real = tf.expand_dims(emb_first_real, axis=1)

        ### 3.2: select second node
        # rules: do not select first node
        # using own prediction

        # mlp
        emb_cat = tf.concat(
            [tf.tile(emb_first, [1, tf.shape(emb_node)[1], 1]), emb_node],
            axis=2)
        self.logits_second = tf.compat.v1.layers.dense(emb_cat,
                                                       args.emb_size,
                                                       activation=tf.nn.relu,
                                                       name='logits_second1')
        self.logits_second = tf.compat.v1.layers.dense(self.logits_second,
                                                       1,
                                                       activation=None,
                                                       name='logits_second2')
        # # bilinear
        # self.logits_second = tf.transpose(bilinear(emb_first, emb_node, name='logits_second'), [0, 2, 1])

        self.logits_second = tf.squeeze(self.logits_second, axis=-1)
        ac_first_mask = tf.one_hot(ac_first,
                                   depth=tf.shape(emb_node)[1],
                                   dtype=tf.bool,
                                   on_value=False,
                                   off_value=True)
        logits_second_mask = tf.logical_and(logits_mask, ac_first_mask)
        logits_second_null = tf.ones(tf.shape(self.logits_second)) * -1000
        self.logits_second = tf.where(condition=logits_second_mask,
                                      x=self.logits_second,
                                      y=logits_second_null)

        pd_second = CategoricalPdType(-1).pdfromflat(flat=self.logits_second)
        ac_second = pd_second.sample()
        mask = tf.one_hot(ac_second,
                          depth=tf.shape(emb_node)[1],
                          dtype=tf.bool,
                          on_value=True,
                          off_value=False)
        emb_second = tf.boolean_mask(emb_node, mask)
        emb_second = tf.expand_dims(emb_second, axis=1)

        # using ground truth
        # mlp
        emb_cat = tf.concat(
            [tf.tile(emb_first_real, [1, tf.shape(emb_node)[1], 1]), emb_node],
            axis=2)
        self.logits_second_real = tf.compat.v1.layers.dense(
            emb_cat,
            args.emb_size,
            activation=tf.nn.relu,
            name='logits_second1',
            reuse=True)
        self.logits_second_real = tf.compat.v1.layers.dense(
            self.logits_second_real,
            1,
            activation=None,
            name='logits_second2',
            reuse=True)
        # # bilinear
        # self.logits_second_real = tf.transpose(bilinear(emb_first_real, emb_node, name='logits_second'), [0, 2, 1])

        self.logits_second_real = tf.squeeze(self.logits_second_real, axis=-1)
        ac_first_mask_real = tf.one_hot(ac_first_real,
                                        depth=tf.shape(emb_node)[1],
                                        dtype=tf.bool,
                                        on_value=False,
                                        off_value=True)
        logits_second_mask_real = tf.logical_and(logits_mask,
                                                 ac_first_mask_real)
        self.logits_second_real = tf.where(condition=logits_second_mask_real,
                                           x=self.logits_second_real,
                                           y=logits_second_null)

        ac_second_real = self.ac_real[:, 1]
        mask_real = tf.one_hot(ac_second_real,
                               depth=tf.shape(emb_node)[1],
                               dtype=tf.bool,
                               on_value=True,
                               off_value=False)
        emb_second_real = tf.boolean_mask(emb_node, mask_real)
        emb_second_real = tf.expand_dims(emb_second_real, axis=1)

        ### 3.3 predict edge type
        # using own prediction
        # MLP
        emb_cat = tf.concat([emb_first, emb_second], axis=-1)
        self.logits_edge = tf.compat.v1.layers.dense(emb_cat,
                                                     args.emb_size,
                                                     activation=tf.nn.relu,
                                                     name='logits_edge1')
        self.logits_edge = tf.compat.v1.layers.dense(self.logits_edge,
                                                     ob['adj'].get_shape()[1],
                                                     activation=None,
                                                     name='logits_edge2')
        self.logits_edge = tf.squeeze(self.logits_edge, axis=1)
        # # bilinear
        # self.logits_edge = tf.reshape(bilinear_multi(emb_first,emb_second,out_dim=ob['adj'].get_shape()[1]),[-1,ob['adj'].get_shape()[1]])
        pd_edge = CategoricalPdType(-1).pdfromflat(self.logits_edge)
        ac_edge = pd_edge.sample()

        # using ground truth
        # MLP
        emb_cat = tf.concat([emb_first_real, emb_second_real], axis=-1)
        self.logits_edge_real = tf.compat.v1.layers.dense(
            emb_cat,
            args.emb_size,
            activation=tf.nn.relu,
            name='logits_edge1',
            reuse=True)
        self.logits_edge_real = tf.compat.v1.layers.dense(
            self.logits_edge_real,
            ob['adj'].get_shape()[1],
            activation=None,
            name='logits_edge2',
            reuse=True)
        self.logits_edge_real = tf.squeeze(self.logits_edge_real, axis=1)
        # # bilinear
        # self.logits_edge_real = tf.reshape(bilinear_multi(emb_first_real, emb_second_real, out_dim=ob['adj'].get_shape()[1]),
        #                               [-1, ob['adj'].get_shape()[1]])

        # ncat_list = [tf.shape(logits_first),ob_space['adj'].shape[-1],ob_space['adj'].shape[0]]
        self.pd = self.pdtype(-1).pdfromflat([
            self.logits_first, self.logits_second_real, self.logits_edge_real,
            self.logits_stop
        ])
        self.vpred = tf.compat.v1.layers.dense(emb_node,
                                               args.emb_size,
                                               use_bias=False,
                                               activation=tf.nn.relu,
                                               name='value1')
        if args.bn == 1:
            self.vpred = tf.compat.v1.layers.batch_normalization(self.vpred,
                                                                 axis=-1)
        self.vpred = tf.reduce_max(self.vpred, axis=1)
        self.vpred = tf.compat.v1.layers.dense(self.vpred,
                                               1,
                                               activation=None,
                                               name='value2')

        self.state_in = []
        self.state_out = []

        self.ac = tf.concat(
            (tf.expand_dims(ac_first, axis=1), tf.expand_dims(
                ac_second, axis=1), tf.expand_dims(
                    ac_edge, axis=1), tf.expand_dims(ac_stop, axis=1)),
            axis=1)

        debug = {}
        debug['ob_node'] = tf.shape(ob['node'])
        debug['ob_adj'] = tf.shape(ob['adj'])
        debug['emb_node'] = emb_node
        debug['logits_stop'] = self.logits_stop
        debug['logits_second'] = self.logits_second
        debug['ob_len'] = ob_len
        debug['logits_first_mask'] = logits_first_mask
        debug['logits_second_mask'] = logits_second_mask
        # debug['pd'] = self.pd.logp(self.ac)
        debug['ac'] = self.ac

        stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
        self._act = U.function(
            [stochastic, ob['adj'], ob['node']],
            [self.ac, self.vpred, debug])  # add debug in second arg if needed
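A hedged single-step sketch (assuming policy is an instance of this class and adj_batch/node_batch are arrays matching the adj and node placeholder shapes above; all three names are illustrative):

ac, vpred, dbg = policy._act(True, adj_batch, node_batch)  # stochastic=True
first, second, edge, stop = ac[0]  # (first node, second node, edge type, stop flag)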
Example #5
def build_act_with_param_noise(make_obs_ph,
                               q_func,
                               hr_func,
                               num_actions,
                               scope="deepq",
                               reuse=None,
                               param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    hr_func: (tf.Variable, int, str) -> tf.Variable
        the model that predicts feedback logits over actions (same call
        convention as q_func)
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To reuse variables, the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(
            tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(
            tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        update_rl_importance_ph = tf.placeholder(tf.float32, (),
                                                 name="update_rl_importance")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable(
            "param_noise_scale", (),
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        param_noise_threshold = tf.get_variable(
            "param_noise_threshold", (),
            initializer=tf.constant_initializer(0.05),
            trainable=False)

        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(),
                                    num_actions,
                                    scope="perturbed_q_func")

        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(
                absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(
                        perturbed_var,
                        var + tf.random_normal(shape=tf.shape(var),
                                               mean=0.,
                                               stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the perturbation
        # is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(),
                                   num_actions,
                                   scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func",
                                            perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(tf.nn.softmax(q_values) *
                           (tf.log(tf.nn.softmax(q_values)) -
                            tf.log(tf.nn.softmax(q_values_adaptive))),
                           axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0,
                    lambda: update_param_noise_threshold_ph,
                    lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        predicted_feedback = hr_func(observations_ph.get(),
                                     num_actions,
                                     scope="hr_func")
        fb_logit_constant = 10
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        chose_rl = tf.random_uniform(
            tf.stack([batch_size
                      ]), minval=0, maxval=1, dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0,
                    lambda: update_rl_importance_ph, lambda: rl_importance))
        updates = [
            update_eps_expr,
            tf.cond(
                reset_ph,
                lambda: perturb_vars(original_scope="q_func",
                                     perturbed_scope="perturbed_q_func"),
                lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(),
                    lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
            update_rl_importance_expr,
        ]
        _act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph, reset_ph,
            update_param_noise_threshold_ph, update_param_noise_scale_ph,
            update_rl_importance_ph
        ],
                          outputs=output_actions,
                          givens={
                              update_eps_ph: -1.0,
                              stochastic_ph: True,
                              reset_ph: False,
                              update_param_noise_threshold_ph: False,
                              update_param_noise_scale_ph: False,
                              update_rl_importance_ph: -1.0
                          },
                          updates=updates)

        def act(ob,
                reset=False,
                update_param_noise_threshold=False,
                update_param_noise_scale=False,
                stochastic=True,
                update_eps=-1,
                update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, reset,
                        update_param_noise_threshold, update_param_noise_scale,
                        update_rl_importance)

        return act
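A hedged usage sketch of the returned act function (obs is assumed to be a single observation; the keyword flags mirror the placeholders wired into the givens above):

action = act(obs[None],
             reset=True,                      # re-perturb the rollout network
             update_param_noise_scale=True,   # adapt the noise scale via the KL check
             update_eps=0.1,                  # epsilon for the random-action mixture
             update_rl_importance=0.5)        # probability of taking the RL action over feedback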
Example #6
def build_act(make_obs_ph,
              q_func,
              hr_func,
              num_actions,
              scope="deepq",
              reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    hr_func: (tf.Variable, int, str) -> tf.Variable
        the model that predicts feedback logits over actions (same call
        convention as q_func)
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To reuse variables, the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_rl_importance_ph = tf.placeholder(tf.float32, (),
                                                 name="update_rl_importance")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)
        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        predicted_feedback = hr_func(observations_ph.get(),
                                     num_actions,
                                     scope="hr_func")
        fb_logit_constant = 10
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        chose_rl = tf.random_uniform(
            tf.stack([batch_size
                      ]), minval=0, maxval=1, dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0,
                    lambda: update_rl_importance_ph, lambda: rl_importance))
        _act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph,
            update_rl_importance_ph
        ],
                          outputs=output_actions,
                          givens={
                              update_eps_ph: -1.0,
                              update_rl_importance_ph: -1.0,
                              stochastic_ph: True
                          },
                          updates=[update_eps_expr, update_rl_importance_expr])

        def act(ob,
                stochastic=True,
                update_eps=-1,
                update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, update_rl_importance)

        return act
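The output action here takes the epsilon-greedy RL action with probability rl_importance and a sample from the feedback distribution otherwise. A minimal NumPy sketch of that mixing rule (illustrative only, not the TF graph):

import numpy as np

def mix_actions(rl_actions, hr_actions, rl_importance, rng=np.random):
    # Per-row coin flip, mirroring tf.where(chose_rl, rl_actions, hr_actions).
    chose_rl = rng.uniform(size=rl_actions.shape[0]) < rl_importance
    return np.where(chose_rl, rl_actions, hr_actions)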
Example #7
    def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False):

        output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)]

        sample_prob = tf.reshape(self.sample_agent_prob,
                                 tf.stack(output_shape))
        game_score = tf.reshape(
            self.game_score,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        rew_agent_label = tf.reshape(
            self.rew_agent_label,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        #rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1)
        #rew_agent_label = tf.reshape(rew_agent_label,(-1,self.num_agents ))

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C

                phi = ph[:, 1:]
                phi = tf.cast(phi, tf.float32)
                phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                phi = phi / 255.

                last_rew_ob = self.last_rew_ob
                last_rew_ob = tf.cast(last_rew_ob, tf.float32)
                last_rew_ob = tf.reshape(
                    last_rew_ob,
                    (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:]
                last_rew_ob = last_rew_ob / 255.

                if use_rew:
                    phi = tf.concat([phi, last_rew_ob], axis=-1)

                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                #[20,20] [8,8]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                #[9,9] [7,7]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                phi = to2d(phi)

                phi = tf.nn.relu(
                    fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2)))
                phi = tf.nn.relu(
                    fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2)))
                disc_logits = fc(phi,
                                 'fc3r',
                                 nh=self.num_agents,
                                 init_scale=np.sqrt(2))

        one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
        one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents))

        flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1)
        all_div_prob = tf.reshape(
            flatten_all_div_prob,
            (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents))

        sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1)
        sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1))

        div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=disc_logits, labels=one_hot_gidx)
        base_rew = tf.log(0.01)
        div_rew = div_rew - tf.log(sample_prob)

        div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        disc_pdtype = CategoricalPdType(self.num_agents)
        disc_pd = disc_pdtype.pdfromflat(disc_logits)

        disc_nlp = disc_pd.neglogp(rew_agent_label)

        return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp
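A hedged NumPy sketch of the diversity reward built above: the negative cross-entropy term reduces to the log-probability the discriminator assigns to the true agent, minus the log sampling probability (illustrative only):

import numpy as np

def diversity_reward(disc_logits, agent_idx, sample_prob):
    # log softmax prob of the true agent, mirroring
    # div_rew = -softmax_cross_entropy(logits, one_hot) - log(sample_prob)
    shifted = disc_logits - disc_logits.max(axis=-1, keepdims=True)
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
    logp = np.log(probs[np.arange(len(agent_idx)), agent_idx])
    return logp - np.log(sample_prob)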
Example #8
    def _init(self, ob_space, ac_space):
        with tf.variable_scope(self.scope):
            self.pdtype = pdtype = CategoricalPdType(ac_space)
            ob = U.get_placeholder(name="ob",
                                   dtype=tf.float32,
                                   shape=[None, ob_space])

            out = ob

            # Batch-normalize before entering each activation; four BN layers are added
            out = layers.fully_connected(
                out,
                num_outputs=256,
                activation_fn=None,
                weights_initializer=U.normc_initializer(1.0))
            axes1 = list(range(len(out.get_shape()) - 1))
            mean1, variance1 = tf.nn.moments(out, axes1)
            out = tf.nn.batch_normalization(out,
                                            mean1,
                                            variance1,
                                            offset=None,
                                            scale=None,
                                            variance_epsilon=0.001)
            out = tf.nn.relu(out)

            out = layers.fully_connected(
                out,
                num_outputs=128,
                activation_fn=None,
                weights_initializer=U.normc_initializer(1.0))
            axes2 = list(range(len(out.get_shape()) - 1))
            mean2, variance2 = tf.nn.moments(out, axes2)
            out = tf.nn.batch_normalization(out,
                                            mean2,
                                            variance2,
                                            offset=None,
                                            scale=None,
                                            variance_epsilon=0.001)
            out = tf.nn.relu(out)

            axes4 = list(range(len(out.get_shape()) - 1))
            mean4, variance4 = tf.nn.moments(out, axes4)
            out = tf.nn.batch_normalization(out,
                                            mean4,
                                            variance4,
                                            offset=None,
                                            scale=None,
                                            variance_epsilon=0.001)

            self.batch_size = 1
            self.time_steps = tf.shape(out)[0]
            self.cell_size = 128
            out = tf.reshape(out, [-1, self.time_steps, self.cell_size],
                             name='2_3D')
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.cell_size,
                                                     forget_bias=1.0,
                                                     state_is_tuple=True)
            state = lstm_cell.zero_state(self.batch_size, tf.float32)

            out, state = tf.nn.dynamic_rnn(lstm_cell,
                                           out,
                                           initial_state=state,
                                           time_major=False)
            out = tf.reshape(out, [-1, self.cell_size], name='2_2D')

            out = tf.nn.dropout(out, keep_prob=0.6)

            axes3 = list(range(len(out.get_shape()) - 1))
            mean3, variance3 = tf.nn.moments(out, axes3)
            out = tf.nn.batch_normalization(out,
                                            mean3,
                                            variance3,
                                            offset=None,
                                            scale=None,
                                            variance_epsilon=0.001)

            out = layers.fully_connected(
                out,
                num_outputs=128,
                activation_fn=tf.nn.relu,
                weights_initializer=U.normc_initializer(1.0))

            pdparam = U.dense(out, pdtype.param_shape()[0], "polfinal")
            self.vpred = U.dense(out, 1, "value")[:, 0]
            self.pd = pdtype.pdfromflat(pdparam)

            self.state_in = []
            self.state_out = []

            stochastic = tf.placeholder(dtype=tf.bool,
                                        shape=(),
                                        name="stochastic")

            update_eps = tf.placeholder(tf.float32, (), name="update_eps")
            deterministic_actions = self.pd.full_sample()  # cf. tf.argmax(q_values, axis=1)
            random_actions = tf.random_uniform(tf.shape(deterministic_actions),
                                               minval=-1,
                                               maxval=1,
                                               dtype=tf.float32)
            chose_random = tf.random_uniform(tf.shape(deterministic_actions),
                                             minval=0,
                                             maxval=1,
                                             dtype=tf.float32) < update_eps
            stochastic_actions = tf.where(chose_random, random_actions,
                                          deterministic_actions)

            ac = U.switch(stochastic, stochastic_actions, self.pd.flatparam())
            self._act = U.function(inputs=[stochastic, update_eps, ob],
                                   outputs=[ac, self.vpred, state],
                                   givens={
                                       update_eps: -1.0,
                                       stochastic: True
                                   })
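A hedged usage sketch (assuming pi is an instance of this policy, built with matching ob_space/ac_space; pi and ob_batch are illustrative names):

# Inputs follow the U.function signature: stochastic flag, exploration epsilon, observations.
ac, vpred, lstm_state = pi._act(True, 0.05, ob_batch)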
Example #9
    def _build(self):
        ac_space = self._ac_space
        num_hid_layers = self._num_hid_layers
        hid_size = self._hid_size
        gaussian_fixed_var = self._gaussian_fixed_var

        # obs
        self._obs = {}
        for ob_name, ob_shape in self._ob_shape.items():
            self._obs[ob_name] = U.get_placeholder(
                name="ob_{}".format(ob_name),
                dtype=tf.float32,
                shape=[None] + self._ob_shape[ob_name])
        self._cur_primitive = cur_primitive = \
            U.get_placeholder(name="cur_primitive", dtype=tf.int32, shape=[None])

        # obs normalization
        self.ob_rms = {}
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(
                    shape=self._ob_shape[ob_name])
        obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) /
               self.ob_rms[ob_name].std for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)

        cur_primitive_one_hot = tf.one_hot(cur_primitive,
                                           self._num_primitives,
                                           name="cur_primitive_one_hot")
        obz = tf.concat([obz, cur_primitive_one_hot], -1)

        # value function
        with tf.variable_scope("vf"):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name="final",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # primitive policy
        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("pol"):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name="final",
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name="final",
                    kernel_initializer=U.normc_initializer(0.01))

            if self.term_activation == 'sigmoid':
                self.term_pred = tf.sigmoid(
                    tf.layers.dense(
                        last_out,
                        1,
                        name="term_final",
                        kernel_initializer=U.normc_initializer(1.0))[:, 0])
                stochastic_act = tf.less_equal(
                    (1 / (2 * self._config.trans_term_prob)) *
                    tf.random_uniform(tf.shape(self.term_pred)),
                    self.term_pred)
                deterministic_act = tf.less_equal(
                    (1 - self._config.trans_term_prob) *
                    tf.ones_like(self.term_pred), self.term_pred)
            else:
                self.term_pred = tf.layers.dense(
                    last_out,
                    2,
                    name="term_final",
                    kernel_initializer=U.normc_initializer(0.01))
                self.term_pdtype = term_pdtype = CategoricalPdType(2)
                self.term_pd = term_pdtype.pdfromflat(self.term_pred)
                stochastic_act = self.term_pd.sample()
                deterministic_act = self.term_pd.mode()
        self.pd = pdtype.pdfromflat(pdparam)

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        term = U.switch(stochastic, stochastic_act, deterministic_act)
        self._act = U.function([stochastic, cur_primitive] + self.obs,
                               [ac, self.vpred, term])
        self._value = U.function([cur_primitive] + self.obs, self.vpred)
        self._term_pred = U.function([stochastic, cur_primitive] + self.obs,
                                     self.term_pred)
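A hedged usage sketch (assuming primitive_pi is an instance of this class and ob_list is ordered like self.obs; both names are illustrative):

import numpy as np

cur_primitive = np.array([2], dtype=np.int32)  # index of the active primitive
ac, vpred, term = primitive_pi._act(True, cur_primitive, *ob_list)  # action, value, termination flag
v = primitive_pi._value(cur_primitive, *ob_list)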