def img_encoder(self, x, kind):
    # variant of the standard encoders: stride-1 convs followed by max pooling
    # instead of the usual strided convs (cf. the other examples below)
    if kind == 'small':  # from A3C paper
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [1, 1], pad="VALID")), 4)
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [1, 1], pad="VALID")), 2)
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [1, 1], pad="VALID")), 4)
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [1, 1], pad="VALID")), 2)
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    return x
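The max_pool helper used above is not defined in this snippet; a minimal sketch of what it presumably does (k-by-k max pooling with stride k over NHWC tensors):

import tensorflow as tf

def max_pool(x, k):
    # k x k max pooling with stride k over the spatial dimensions (NHWC)
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding="VALID")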
Example #2
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX: always samples; the stochastic flag is fed but never used
        self._act = U.function([stochastic, ob], [ac, self.vpred])
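A hypothetical call pattern for the _act function built above (pi and ob are illustration names, not from this snippet):

ac, vpred = pi._act(True, ob[None])  # stochastic=True, batch dim added; returns length-1 arrays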
Example #3
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape)  #obs
        print(ob_shape)

        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            '''
            h = conv(X, 'c1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h2 = conv(h, 'c2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h3 = conv(h2, 'c3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

            hh = conv(X, 'xc1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh2 = conv(hh, 'xc2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh3 = conv(hh2, 'xc3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh3 = conv_to_fc(hh3)
            hh4 = fc(hh3, 'xfc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
            vf = fc(hh4, 'v', 1, act=lambda x:x)[:,0]

            '''
            x = tf.nn.relu(U.conv2d(X, 32, "l1", [3, 3], [1, 1], pad="SAME"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [3, 3], [1, 1], pad="SAME"))
            x = tf.nn.relu(U.conv2d(x, 128, "l3", [3, 3], [1, 1], pad="SAME"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))

            y = tf.nn.relu(U.conv2d(X, 32, "yl1", [3, 3], [1, 1], pad="SAME"))
            y = tf.nn.relu(U.conv2d(y, 64, "yl2", [3, 3], [1, 1], pad="SAME"))
            y = tf.nn.relu(U.conv2d(y, 128, "yl3", [3, 3], [1, 1], pad="SAME"))
            y = U.flattenallbut0(y)
            y = tf.nn.relu(U.dense(y, 512, 'ylin', U.normc_initializer(1.0)))

            pi = U.dense(x,
                         pdtype.param_shape()[0], "logits",
                         U.normc_initializer(0.01))
            vf = U.dense(y, 1, "value", U.normc_initializer(1.0))[:, 0]

        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
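Hypothetical rollout usage of this policy; sess is bound at construction, and policy, obs, next_obs are illustration names:

actions, values, _, neglogps = policy.step(obs)  # sample a batch of actions
next_values = policy.value(next_obs)             # value estimates only, for bootstrapping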
Example #4

    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX: always samples; the stochastic flag is fed but never used
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #5
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    
        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX: always samples; the stochastic flag is fed but never used
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #6
    def _init(self, ob_space, ac_space):
        self.pdtype = distributions.make_pdtype(ac_space)

        ob = U.get_placeholder(name='ob', dtype=tf.int32, shape=[None] + list(ob_space.shape))
        next_blocks, my_grid, opp_grid = tf.split(ob, [16, 12 * 6, 12 * 6], axis=1)

        with tf.variable_scope('next_blocks'):
            next_blocks = tf.one_hot(next_blocks, depth=5)
            next_blocks = U.flattenallbut0(next_blocks)
            next_blocks = tf.nn.leaky_relu(tf.layers.dense(next_blocks, 12, name='l1', kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
            next_blocks = tf.nn.leaky_relu(tf.layers.dense(next_blocks, 12, name='l2', kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)

        with tf.variable_scope('grids', reuse=False):
            my_grid = _grid_cnn(my_grid)

        with tf.variable_scope('grids', reuse=True):
            opp_grid = _grid_cnn(opp_grid)

        x = tf.concat([next_blocks, my_grid, opp_grid], axis=1)
        x = tf.nn.leaky_relu(tf.layers.dense(x, 64, name='lin', kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)

        logits = tf.layers.dense(x, self.pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = self.pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #7
def _grid_cnn(x):
    x = tf.reshape(x, shape=[-1, 12, 6])
    x = tf.one_hot(x, depth=7)
    x = tf.nn.leaky_relu(U.conv2d(x, 12, 'l1', [3, 3], [1, 1], pad='VALID'), alpha=0.1)
    x = tf.nn.leaky_relu(U.conv2d(x, 12, 'l2', [3, 3], [1, 1], pad='VALID'), alpha=0.1)
    x = U.flattenallbut0(x)
    return x
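A shape trace for _grid_cnn, assuming 12x6 grids with 7 cell types (a sketch; tf and U.conv2d must be in scope):

grid = tf.placeholder(tf.int32, [None, 72])  # 12 * 6 flattened cells
feats = _grid_cnn(grid)
# one_hot          -> [?, 12, 6, 7]
# l1 (3x3, VALID)  -> [?, 10, 4, 12]
# l2 (3x3, VALID)  -> [?,  8, 2, 12]
# flattenallbut0   -> [?, 192]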
Example #8

    def build_graph(self, obs_ph, acs_ph, reuse=False):
        """
        obs_ph: tf tensor, shape [None, 84, 84, 4]
        acs_ph: tf tensor, shape [None] -- integer action indices
                (one-hot encoded inside build_graph, not given pre-encoded)
        """
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()

            one_hot_ac = tf.one_hot(acs_ph, self.num_actions, dtype=tf.float32)
            x = tf.concat([
                obs_ph / 255.0,
                tf.tile(one_hot_ac[:, None, None, :], [1, 84, 84, 1])
            ],
                          axis=3)  #[None,84,84,4+ac_dim]
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.tanh(
                tf.layers.dense(x,
                                512,
                                name='lin1',
                                kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(
                x, 1, name='lin2', kernel_initializer=U.normc_initializer(1.0))
        return logits
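The concat above broadcasts the one-hot action over every pixel so it can be stacked channel-wise with the frames; the trick in isolation:

ac = tf.one_hot([2], depth=6)                           # [1, 6]
ac_map = tf.tile(ac[:, None, None, :], [1, 84, 84, 1])  # [1, 84, 84, 6]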
Example #9
    def _create_network(self):
        l = self.ob / 255.0
        if self.kind == 'small':  # from A3C paper
            l = tf.nn.relu(U.conv2d(l, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            l = tf.nn.relu(U.conv2d(l, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            l = U.flattenallbut0(l)
            l = tf.nn.relu(U.dense(l, 256, 'lin', U.normc_initializer(1.0)))
        elif self.kind == 'large':  # Nature DQN
            l = tf.nn.relu(U.conv2d(l, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            l = tf.nn.relu(U.conv2d(l, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            l = tf.nn.relu(U.conv2d(l, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            l = U.flattenallbut0(l)
            l = tf.nn.relu(U.dense(l, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        self._create_logit_value(l, l)
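_create_logit_value is defined elsewhere in the source repository; a plausible sketch inferred from the sibling examples in this file (here it is called with the same trunk for both heads):

def _create_logit_value(self, pol_feat, vf_feat):
    logits = U.dense(pol_feat, self.pdtype.param_shape()[0], "logits",
                     U.normc_initializer(0.01))
    self.pd = self.pdtype.pdfromflat(logits)
    self.vpred = U.dense(vf_feat, 1, "value", U.normc_initializer(1.0))[:, 0]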
Example #10

def _build(name, x):
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(
            U.conv2d(x, 16, "%s_l1" % name, [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d(x, 32, "%s_l2" % name, [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256,
                            name='%s_lin' % name,  # prefixed like the convs (was a bare 'lin', which would collide across builds)
                            kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(
            U.conv2d(x, 32, "%s_l1" % name, [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d(x, 64, "%s_l2" % name, [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d(x, 64, "%s_l3" % name, [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='%s_lin' % name,
                            kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    return x
Example #11
def vggm1234(x, TRAIN_COVN=True):

    net = slim.convolution(x,
                           96, [7, 7],
                           2,
                           padding='VALID',
                           scope='conv1',
                           activation_fn=tf.nn.relu,
                           reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4, beta=0.75)
    net = slim.pool(net, [3, 3],
                    'MAX',
                    stride=2,
                    padding='VALID',
                    scope='pool1')

    net = slim.convolution(net,
                           256, [5, 5],
                           2,
                           padding='VALID',
                           scope='conv2',
                           activation_fn=tf.nn.relu,
                           reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4, beta=0.75)
    net = slim.pool(net, [3, 3],
                    'MAX',
                    stride=2,
                    padding='VALID',
                    scope='pool2')

    net = slim.convolution(net,
                           512, [3, 3],
                           1,
                           padding='VALID',
                           scope='conv3',
                           activation_fn=tf.nn.relu,
                           reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)

    net = slim.convolution(net,
                           512, [3, 3],
                           1,
                           padding='VALID',
                           scope='conv4',
                           activation_fn=tf.nn.relu,
                           reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)

    return U.flattenallbut0(net)
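Hypothetical use of vggm1234 as a frozen feature backbone; the 107x107 input size is an assumption (a common VGG-M crop size in tracking), not something this snippet fixes:

imgs = tf.placeholder(tf.float32, [None, 107, 107, 3])
feats = vggm1234(imgs, TRAIN_COVN=False)  # frozen convs, flattened conv4 features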
Example #12

    def _init(self, ob_space, ac_space):
        """
        :param ob_space: (Gym Space) The observation space of the environment
        :param ac_space: (Gym Space) The action space of the environment
        """
        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

        obs_normalized = obs / 255.0

        with tf.variable_scope(self.name + "/pol", reuse=self.reuse):
            layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            layer_2 = tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            layer_2 = tf_utils.flattenallbut0(layer_2)
            layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin',
                                                 kernel_initializer=tf_utils.normc_initializer(1.0)))
            logits = tf.layers.dense(layer_3, pdtype.param_shape()[0], name='logits',
                                     kernel_initializer=tf_utils.normc_initializer(0.01))
            self.proba_distribution = pdtype.proba_distribution_from_flat(logits)
        with tf.variable_scope(self.name + "/vf", reuse=self.reuse):
            layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            layer_2 = tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            layer_2 = tf_utils.flattenallbut0(layer_2)
            layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin',
                                                 kernel_initializer=tf_utils.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(layer_3, 1, name='value',
                                         kernel_initializer=tf_utils.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        if self.stochastic_ph is None:
            self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = self.proba_distribution.sample()
        self._act = tf_utils.function([self.stochastic_ph, obs], [action, self.vpred])
 def img_encoder(self, img, kind, mode="input"):
     """mode denote where add the coord conv:
         "input" means add only after input tensor
         "all" means add after all-level tensors
     """
     _, num_rows, num_cols, _ = img.get_shape().as_list()
     addcoord = AddCoords(x_dim=num_cols,
                           y_dim=num_rows,
                           with_r=False,
                           skiptile=True)
     img_coord = addcoord(img)
     x = tf.nn.relu(U.conv2d(img_coord, 32, "l1", [8, 8], [4, 4], pad="VALID"))
     x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
     x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
     
     x = U.flattenallbut0(x)
     x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
     return x
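AddCoords comes from the CoordConv idea (Liu et al., 2018) and is not defined in this snippet; a minimal sketch under the same constructor signature, appending normalized x/y coordinate channels:

class AddCoords(object):
    def __init__(self, x_dim, y_dim, with_r=False, skiptile=True):
        self.x_dim, self.y_dim = x_dim, y_dim  # with_r / skiptile ignored in this sketch

    def __call__(self, img):  # img: [batch, y_dim, x_dim, channels]
        batch = tf.shape(img)[0]
        xs = tf.range(self.x_dim, dtype=tf.float32) / (self.x_dim - 1) * 2 - 1
        ys = tf.range(self.y_dim, dtype=tf.float32) / (self.y_dim - 1) * 2 - 1
        xx = tf.tile(xs[None, None, :, None], [batch, self.y_dim, 1, 1])
        yy = tf.tile(ys[None, :, None, None], [batch, 1, self.x_dim, 1])
        return tf.concat([img, xx, yy], axis=-1)  # two extra coordinate channels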
Example #14
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
        print("This is lstm policy for only sensors.")
        assert isinstance(ob_space, tuple)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        
        ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[sequence_length] + list(ob_space[0].shape))
        ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[sequence_length] + list(ob_space[1].shape))

        #process ob_p
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape = ob_space[0].shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            

        #process ob_f
        x = ob_f / 255.0

        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        # lstm layer for memory
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name="rnn")
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = (c_init, h_init)
        c_in = U.get_placeholder(name="state_c", dtype=tf.float32,shape=(None, lstm_cell.state_size.c))
        h_in = U.get_placeholder(name="state_h", dtype=tf.float32,shape=(None, lstm_cell.state_size.h))
        self.state_in = (c_in, h_in)

        state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
        lstm_outputs, lstm_states = lstm_cell(x, state_in)
        lstm_c, lstm_h = lstm_states
        self.state_out = (lstm_c, lstm_h)

        rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))
        
        # conjugate sensor and physics
        ob_last = tf.concat((rnn_out, obpz), axis = -1)

        # value network
        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
 
        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob_p, ob_f, c_in, h_in], [ac, self.vpred, lstm_c, lstm_h])
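A hypothetical recurrent rollout with this policy, threading the LSTM state between steps (pi, ob_p, ob_f are illustration names):

c, h = pi.state_init
for _ in range(128):  # horizon
    ac, vpred, c, h = pi._act(True, ob_p[None], ob_f[None], c, h)
    # ob_p, ob_f would be refreshed from the environment here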
Example #15

    def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers,
              kind, elm_mode):
        assert isinstance(ob_space, gym.spaces.Box)
        assert isinstance(sensor_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        ob_sensor = U.get_placeholder(name="ob_sensor",
                                      dtype=tf.float32,
                                      shape=[sequence_length] +
                                      list(sensor_space.shape))

        x = ob / 255.0
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))

        num_res_net_blocks = 3
        for i in range(num_res_net_blocks):
            input_data = x
            for j in range(2):
                x = tf.nn.relu(
                    U.conv2d(x,
                             32,
                             "l%i" % (2 * i + 3 + j),
                             filter_size=[3, 3],
                             pad="SAME"))
            x = tf.nn.relu(tf.math.add(x, input_data))

        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x,
                            256,
                            name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))

        ## Obfilter on sensor output
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=sensor_space.shape)
        obz_sensor = tf.clip_by_value(
            (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        last_out = obz_sensor
        if not elm_mode:
            ## Adapted from mlp_policy
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="vffc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            y = tf.layers.dense(last_out,
                                64,
                                name="vffinal",
                                kernel_initializer=U.normc_initializer(1.0))
        else:
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                name="vffc1",
                                kernel_initializer=U.normc_initializer(1.0),
                                trainable=False))
            y = tf.layers.dense(last_out,
                                64,
                                name="vffinal",
                                kernel_initializer=U.normc_initializer(1.0))

        x = tf.concat([x, y], 1)
        logits = tf.layers.dense(x,
                                 pdtype.param_shape()[0],
                                 name="logits",
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(
            x, 1, name="value",
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # self.session.run(logits.kernel)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX: always samples; the stochastic flag is fed but never used
        self._act = U.function([stochastic, ob, ob_sensor],
                               [ac, self.vpred, logits])
Example #16
    def _init(self,
              ob_space,
              ac_space,
              kind,
              num_options=2,
              dc=0,
              w_intfc=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            hidden = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            hidden = tf.nn.relu(
                tf.layers.dense(x,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = dense3D2(hidden,
                          pdtype.param_shape()[0],
                          "polfinal",
                          option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

        self.vpred = dense3D2(hidden,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(hidden),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX: always samples; the stochastic flag is fed but never used

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(hidden), num_options, "OP", weight_init=U.normc_initializer(1.0)))

        self.op_pi = tf.nn.softmax(
            U.dense(hidden,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        self.intfc = tf.sigmoid(
            U.dense(hidden,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])
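A sketch of how the two heads could combine into an interest-gated policy over options when w_intfc is set (an assumption based on the flag name; the gating itself is not part of this snippet):

import numpy as np
op_pi, intfc = pi._get_op_int(ob[None])            # each [1, num_options]
gated = np.asarray(op_pi) * np.asarray(intfc)      # weight options by interest
gated = gated / gated.sum(axis=-1, keepdims=True)  # renormalized option probabilities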
Example #17
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        assert isinstance(ob_space, tuple)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_p = U.get_placeholder(name="ob_physics",
                                 dtype=tf.float32,
                                 shape=[sequence_length] +
                                 list(ob_space[0].shape))
        ob_f = U.get_placeholder(name="ob_frames",
                                 dtype=tf.float32,
                                 shape=[sequence_length] +
                                 list(ob_space[1].shape))

        #process ob_p
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std,
                                -5.0, 5.0)

        #process ob_f
        x = ob_f / 255.0

        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        ob_last = tf.concat((obpz, x), axis=-1)

        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name='logits',
                kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX: always samples; the stochastic flag is fed but never used
        self._act = U.function([stochastic, ob_p, ob_f], [ac, self.vpred])
Example #18
    def __init__(self, input_shape, scope, args):
        assert len(input_shape) == 3
        self.input_shape = input_shape # (W, H, Channels)
        self.scope = scope
        self.MASKS = args.masks
        self.Z_SIZE = args.z_size
        self.EPSILON = 1e-8
        self.checkpoint_path = args.checkpoint_path
        if not os.path.exists(self.checkpoint_path):
            os.makedirs(self.checkpoint_path)

        self.trained_epochs = tf.Variable(0, dtype=tf.int32, name='trained_epochs', trainable=False)
        self.inc_trained_epochs = self.trained_epochs.assign_add(1)


        ## Build net
        with tf.variable_scope('input'):
            self.x_in = tf.placeholder(name="x_in", dtype="float", shape=(None, ) + self.input_shape) # Batch, W, H, Channels
            self.z_in = tf.placeholder(name="z_in", dtype="float", shape=(None, ) + (self.Z_SIZE,) ) # Batch, Z
            self.mask = tf.placeholder(name="mask", dtype="float", shape=(None, ) + self.input_shape[:-1] + (1,) ) # Batch, W, H, 1
        with tf.variable_scope('is_training'):
            self.is_training = tf.placeholder(tf.bool, name="is_training")
        with tf.variable_scope('kl_tolerance'):
            self.kl_tolerance = tf.placeholder(name="kl_tolerance", dtype=tf.float32)

#     def build_VAE(x_in, mask, is_training, kl_tolerance, Z_SIZE):
        """ 
            x_in (tf.placeholder): input (and target output) of the autoencoder network
            mask (tf.placeholder): is_person mask. Where this mask is True normal reconstruction_loss is computed.
                                where it is False, loss is set to 0.
            is_training (tf.placeholder): is training
            kl_tolerance (scalar, or tf.placeholder): 
            Z_SIZE (scalar): size of the latent z dimension
        """
        is_training = self.is_training
        x = self.x_in
        _7 = 7 if input_shape[0] > 64 else 1 # either 1 or 7 (whether input is lidar or image)
        _3 = 3 if input_shape[0] > 64 else 1 # either 1 or 3 
        _3_else_2 = 3 if input_shape[0] > 64 else 2
        with tf.variable_scope('encoder'):
            print("A0: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                    tf.nn.relu(U.conv2d(x, 64, "l1", [_7, 7], [_3, 3], pad="SAME", summary_tag="Conv/Layer1")), training=is_training)
            print("A1: {}".format(x.shape))
            x = tf.layers.max_pooling2d(x, (_3, 3), (_3, 3), padding="SAME", name="Conv/MaxPool")
            xres = x
            print("A2: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                    tf.nn.relu(U.conv2d(x, 64, "l2", [_3, 3], [1, 1], pad="SAME", summary_tag="Conv/Layer2")), training=is_training)
            print("A3: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                    U.conv2d(x, 64, "l3", [_3, 3], [1, 1], pad="SAME", summary_tag="Conv/Layer3"), training=is_training)
            print("A4: {}".format(x.shape))
            xres2 = x
            x = tf.nn.relu(x + xres)
            x = tf.layers.batch_normalization(
                    tf.nn.relu(U.conv2d(x, 64, "l4", [_3_else_2, 3], [1, 1], pad="SAME", summary_tag="Conv/Layer4")), training=is_training)
            print("A5: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                    U.conv2d(x, 64, "l5", [_3_else_2, 3], [1, 1], pad="SAME", summary_tag="Conv/Layer5"), training=is_training)
            print("A6: {}".format(x.shape))
            x = tf.nn.relu(x + xres2)
            x = tf.layers.average_pooling2d(x, (_3, 3), (_3, 3), padding="SAME", name="Conv/AvgPool")
            endconv_shape = x.shape
            print("A7: {}".format(x.shape))
            x = U.flattenallbut0(x)
            endconv_flat_shape = x.shape
            print("A8: {}".format(x.shape))
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
            print("A9: {}".format(x.shape))

            tf.summary.histogram("encoder/lin/output", x)

        with tf.variable_scope('latent_space'):
            z_mu = tf.nn.sigmoid(tf.layers.dense(x, self.Z_SIZE, name='z_mu', kernel_initializer=U.normc_initializer(1.0)))
            z_logvar = tf.nn.relu(tf.layers.dense(x, self.Z_SIZE, name='z_logvar', kernel_initializer=U.normc_initializer(1.0)))
            z_sigma = tf.exp(z_logvar/2.0)
            z = tf.contrib.distributions.Normal(loc=z_mu, scale=z_sigma)
            x = z.sample(1)[0]
            print("Z: {}".format(x.shape))
            self.z_mu = z_mu
            self.z_sigma = z_sigma
            self.z = z
            self.z_sample = x

        def build_decoder(z, is_training=self.is_training, output_shape=self.input_shape, scopename="decoder", reuse=False):
            with tf.variable_scope(scopename, reuse=reuse) as scope:
                x = z
                x = tf.nn.relu(tf.layers.dense(x, 512, name='z_inv', kernel_initializer=U.normc_initializer(1.0)))
                print("A9: {}".format(x.shape))
                x = tf.nn.relu(tf.layers.dense(x, endconv_flat_shape[1], name='lin_inv', kernel_initializer=U.normc_initializer(1.0)))
                print("A8: {}".format(x.shape))
                x = tf.reshape(x, (-1, endconv_shape[1], endconv_shape[2], endconv_shape[3]))
                print("A7: {}".format(x.shape))
                # 'opposite' of average_pooling2d with stride
        #         x = tf.image.resize_nearest_neighbor(x, (1*x.shape[1], 3*x.shape[2]), align_corners=True)
                x = tf.layers.conv2d_transpose(x, 64, (_3, 3), (_3, 3), activation=tf.nn.relu, padding="SAME", name="avgpool_inv")
                xres2 = x
                print("A6: {}".format(x.shape))
                x = tf.layers.batch_normalization(
                        tf.layers.conv2d_transpose(x, 64, (_3_else_2, 3), (1, 1), activation=tf.nn.relu, padding="SAME", name="l5_inv"), training=is_training)
                print("A5: {}".format(x.shape))
                x = tf.layers.batch_normalization(
                        tf.layers.conv2d_transpose(x, 64, (_3_else_2, 3), (1, 1), activation=tf.nn.relu, padding="SAME", name="l4_inv"), training=is_training)
                x = tf.nn.relu(x + xres2)
                xres = x
                print("A4: {}".format(x.shape))
                x = tf.layers.batch_normalization(
                        tf.layers.conv2d_transpose(x, 64, (_3, 3), (1, 1), activation=tf.nn.relu, padding="SAME", name="l3_inv"), training=is_training)
                print("A3: {}".format(x.shape))
                x = tf.layers.batch_normalization(
                        tf.layers.conv2d_transpose(x, 64, (_3, 3), (1, 1), activation=tf.nn.relu, padding="SAME", name="l2_inv"), training=is_training)
                print("A2: {}".format(x.shape))
                x = tf.nn.relu(x + xres)
                x = tf.layers.conv2d_transpose(x, 64, (_3, 3), (_3, 3), activation=tf.nn.relu, padding="SAME", name="maxpool_inv")
                print("A1: {}".format(x.shape))
                x = tf.layers.batch_normalization(
                        tf.layers.conv2d_transpose(x, output_shape[2], (_7, 7), (_3, 3), activation=tf.nn.relu, padding="SAME", name="l1_inv"), training=is_training)
                print("A0: {}".format(x.shape))
                y = x
            return y

        self.y = build_decoder(self.z_sample)
        # This must be done before creating the pure decoder, or tf will expect z_in to be fed
        self.batch_norm_update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # Create a separate decoder network with same variables fed by placeholder, not encoder
        # for off-training reconstruction
        self.reconstruction = build_decoder(self.z_in, reuse=True)

        # Losses
        with tf.variable_scope('reconstruction_loss'):
            self.avg_rec_abs_error = tf.reduce_mean(tf.abs(self.x_in - self.y),
                    reduction_indices=[0,1,2]) # per channel
#             reconstruction_s_e = tf.square((self.x_in - self.y) / 255) # reconstruction square of normalized error
            reconstruction_s_e = tf.log(tf.cosh((self.x_in - self.y) / 255))  # log-cosh of the normalized error
            if self.MASKS:  # apply the (Batch, W, H, 1) mask to the per-pixel error
                mask_bool = self.mask[:, :, :, 0] > 0.5  # tf.boolean_mask needs a bool tensor
                reconstruction_s_e = tf.boolean_mask(reconstruction_s_e, mask_bool)  # -> [kept_pixels, C]
                self.reconstruction_loss = tf.reduce_mean(reconstruction_s_e)  # average over kept pixels
            else:
                reconstruction_loss = tf.reduce_mean(reconstruction_s_e, reduction_indices=[1, 2, 3])  # per example
                self.reconstruction_loss = tf.reduce_mean(reconstruction_loss)  # average over batch

            # kl loss (reduce along z dimensions)
            kl_loss = - 0.5 * tf.reduce_mean(
                    (1 + z_logvar - tf.square(z_mu) - tf.exp(z_logvar)), 
                    reduction_indices = 1
                    ) 
            kl_loss = tf.maximum(kl_loss, self.kl_tolerance) # kl_loss per example
            self.kl_loss = tf.reduce_mean(kl_loss) # batch kl_loss

            self.loss = self.reconstruction_loss + self.kl_loss

        # add tensorboard summaries
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            variable_summaries(var)
        self.merged_summaries = tf.summary.merge_all()

        # A placeholder for adding arbitrary images to tensorboard
        self.image_tensor = tf.placeholder(name="image", dtype="float", shape=(None, 1000, 1000, 4)) # Batch, W, H, Channels
        self.image_summary = tf.summary.image("Reconstructions/val", self.image_tensor)
        self.image_tensor2 = tf.placeholder(name="image2", dtype="float", shape=(None, 1000, 1000, 4)) # Batch, W, H, Channels
        self.image_summary2 = tf.summary.image("Reconstructions/valtarget", self.image_tensor2)
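A hypothetical training step for the VAE above; sess, batch, batch_masks, the Adam optimizer and the 0.5 tolerance are all assumptions for illustration:

train_op = tf.train.AdamOptimizer(1e-4).minimize(model.loss)
sess.run([train_op] + model.batch_norm_update_op,
         feed_dict={model.x_in: batch,
                    model.mask: batch_masks,
                    model.is_training: True,
                    model.kl_tolerance: 0.5})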
Example #19
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2,dc=0, kind='small'):
        assert isinstance(ob_space, gym.spaces.Box)

        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        option =  U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError


        # Network to compute value function and termination probabilities
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = x  # note: ob_rms is created above but never actually applied here
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]

        self.vpred_ent = dense3D2(last_out, 1, "vffinal_ent", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]

        self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0]
        termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))


        # Network to compute policy over options and intra_option policies
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Discrete):
        #     mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        #     pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        # else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)


        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob, option], [ac, self.vpred, self.vpred_ent, last_out])
        self._get_logits = U.function([stochastic, ob, option], [self.pd.logits] )


        self._get_v = U.function([ob, option], [self.vpred])
        self._get_v_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self.get_vpred_ent = U.function([ob, option], [self.vpred_ent]) # Entropy value estimate
        self._get_op = U.function([ob], [self.op_pi])
Example #20
    def __init__(self, action_space, observation_space, scope, args):
        self.scope = scope
        self.EPSILON = 1e-8
        self.action_bound = [action_space.low, action_space.high]
        self.state_shape = observation_space[1].shape
        self.conv_state_shape = observation_space[0].shape
        DISCRETE = not args.continuous
        self.DIRECT_AGENT_OBS = len(observation_space) == 3
        if self.DIRECT_AGENT_OBS:
            self.relobst_state_shape = list(observation_space[2].shape)
            # set to fix sized of relative obstacles
            self.MAX_N_REL_OBSTACLES = args.max_n_relative_obstacles
            if self.relobst_state_shape[1] > self.MAX_N_REL_OBSTACLES:
                raise ValueError(
                    "Can only handle up to %d dynamic obstacle states" %
                    self.MAX_N_REL_OBSTACLES)
            self.relobst_state_shape[1] = self.MAX_N_REL_OBSTACLES
            self.relobst_state_shape = tuple(self.relobst_state_shape)
        if DISCRETE:
            assert len(action_space.shape) == 2
            self.num_action_values = action_space.shape[1]
        else:
            assert len(action_space.shape) == 1
        self.action_names = ['u', 'v', 'theta']
        self.num_action = action_space.shape[0]
        self.cliprange = args.cliprange
        self.checkpoint_path = args.checkpoint_path
        if not os.path.exists(self.checkpoint_path):
            os.makedirs(self.checkpoint_path)
        self.environment = args.environment

        self.global_steps = tf.Variable(0,
                                        dtype=tf.int32,
                                        name='global_steps',
                                        trainable=False)
        self.inc_global_steps = self.global_steps.assign_add(1)

        ## Build net
        with tf.variable_scope('input'):
            self.s_conv = tf.placeholder(name="s_conv",
                                         dtype="float",
                                         shape=(None, ) +
                                         self.conv_state_shape)
            self.s = tf.placeholder(name="s",
                                    dtype="float",
                                    shape=(None, ) + self.state_shape)
            if self.DIRECT_AGENT_OBS:
                self.s_relobst = tf.placeholder(name="s_relobst",
                                                dtype="float",
                                                shape=(None, ) +
                                                self.relobst_state_shape)
        with tf.variable_scope('action'):
            # the placeholder is identical in the discrete and continuous cases
            self.a = tf.placeholder(name="a",
                                    shape=[None, self.num_action],
                                    dtype=tf.float32)
        with tf.variable_scope('target_returns'):
            self.target_returns = tf.placeholder(name="target_returns",
                                                 shape=[None, 1],
                                                 dtype=tf.float32)
        with tf.variable_scope('advantages'):
            self.advantage = tf.placeholder(name="advantage",
                                            shape=[None, 1],
                                            dtype=tf.float32)
        with tf.variable_scope('is_training'):
            self.is_training = tf.placeholder(tf.bool, name="is_training")
        with tf.variable_scope('entropy_coeff'):
            self.entropy_coeff = tf.placeholder(name="entropy_coeff",
                                                dtype=tf.float32)
        with tf.variable_scope('old_predicted_values'):
            self.old_value = tf.placeholder(name="old_value",
                                            shape=[None, 1],
                                            dtype=tf.float32)

        assert len(self.state_shape) == 1
        assert self.state_shape[0] == 5
        state_relgoal, state_vel = tf.split(self.s, [2, 3], axis=1)
        if self.DIRECT_AGENT_OBS:
            state_relobst = U.flattenallbut0(self.s_relobst)
        features_relgoal = tf.nn.relu(
            tf.layers.dense(state_relgoal,
                            32,
                            name='s_relgoal_preproc',
                            kernel_initializer=U.normc_initializer(1.0)))
        features_vel = tf.nn.relu(
            tf.layers.dense(state_vel,
                            32,
                            name='s_vel_preproc',
                            kernel_initializer=U.normc_initializer(1.0)))
        if self.DIRECT_AGENT_OBS:
            features_relobst = tf.nn.relu(
                tf.layers.dense(state_relobst,
                                32,
                                name='s_relobst_preproc',
                                kernel_initializer=U.normc_initializer(1.0)))

        # batch normalization
        features_relgoal = tf.layers.batch_normalization(
            features_relgoal, training=self.is_training)
        features_vel = tf.layers.batch_normalization(features_vel,
                                                     training=self.is_training)
        if self.DIRECT_AGENT_OBS:
            features_relobst = tf.layers.batch_normalization(
                features_relobst, training=self.is_training)
        self.batch_norm_update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        all_features = [features_relgoal, features_vel]
        if self.DIRECT_AGENT_OBS:
            all_features.append(features_relobst)
        x = tf.concat(all_features, axis=-1)

        x = tf.check_numerics(x, message="after concat")

        # x = tf.nn.relu(tf.layers.dense(x, 256, name='merged_lin', kernel_initializer=U.normc_initializer(1.0)))

        def build_critic_net(inputs, scope):
            with tf.variable_scope(scope):
                dl1 = tf.contrib.layers.fully_connected(
                    inputs=inputs,
                    num_outputs=128,
                    activation_fn=tf.nn.relu,
                    scope='dl1')

                tf.summary.histogram("{}/dl1/output".format(scope), dl1)

                value = tf.contrib.layers.fully_connected(
                    inputs=dl1,
                    num_outputs=1,
                    activation_fn=None,
                    scope='value')  #[:, 0]  # initializer std 1.0

                tf.summary.histogram("{}/value/output".format(scope), value)
                tf.summary.scalar("{}/value/output_max".format(scope),
                                  tf.reduce_max(value))
                tf.summary.scalar("{}/value/output_min".format(scope),
                                  tf.reduce_min(value))
                tf.summary.scalar("{}/value/target_max".format(scope),
                                  tf.reduce_max(self.target_returns))
                tf.summary.scalar("{}/value/target_min".format(scope),
                                  tf.reduce_min(self.target_returns))

                return value

        self.value = build_critic_net(x, 'value_net')

        def build_actor_net(inputs, scope, trainable, CONTINUOUS):
            with tf.variable_scope(scope):
                # Hidden layer
                dl1 = tf.contrib.layers.fully_connected(
                    inputs=inputs,
                    num_outputs=256,
                    activation_fn=tf.nn.relu,
                    trainable=trainable,
                    scope='dl1')
                # Output layer and distribution
                if not CONTINUOUS:
                    action_logits = tf.contrib.layers.fully_connected(
                        inputs=dl1,
                        num_outputs=self.num_action * self.num_action_values,
                        activation_fn=tf.nn.relu,  # note: relu constrains these logits to be non-negative
                        trainable=trainable,
                        scope='action_logits')
                    action_logits = tf.reshape(
                        action_logits,
                        (-1, self.num_action, self.num_action_values))
                    # Multinomial distribution (draw one out of num_action_values classes)
                    # if 3 probs [0.4, 0.1, 0.5] and total_count = 1
                    # sample(1) -> [1, 0, 0], or [0, 1, 0], or [0, 0, 1]
                    # prob([1, 0, 0]) -> 0.4
                    # total_count is the amount of draws per iteration. in this case 1 (single action)
                    action_dist = tf.distributions.Categorical(
                        logits=action_logits)
                else:
                    mu = tf.contrib.layers.fully_connected(
                        inputs=dl1,
                        num_outputs=self.num_action,
                        activation_fn=tf.nn.tanh,
                        scope='mu')
                    # adding epsilon here to prevent inf in normal distribution when sigma -> 0
                    sigma = self.EPSILON + tf.contrib.layers.fully_connected(
                        inputs=dl1,
                        num_outputs=self.num_action,
                        activation_fn=tf.nn.softplus,
                        trainable=trainable,
                        scope='sigma')
                    action_dist = tf.contrib.distributions.Normal(loc=mu,
                                                                  scale=sigma)
                param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
                # tensorboard
                tf.summary.histogram("{}/dl1/output".format(scope), dl1)
                if not CONTINUOUS:
                    action_outputs = tf.split(action_logits,
                                              self.num_action,
                                              axis=1)
                    for action_name, out in zip(self.action_names,
                                                action_outputs):
                        tf.summary.histogram(
                            "{}/action_logits/output_{}".format(
                                scope, action_name), out)
                else:
                    mu_outputs = tf.split(mu, self.num_action, axis=-1)
                    for action_name, out in zip(self.action_names, mu_outputs):
                        tf.summary.histogram(
                            "{}/mu/output_{}".format(scope, action_name), out)
                    sigma_outputs = tf.split(sigma, self.num_action, axis=-1)
                    for action_name, out in zip(self.action_names,
                                                sigma_outputs):
                        tf.summary.histogram(
                            "{}/sigma/output_{}".format(scope, action_name),
                            out)
                # ---
                return action_dist, param

        pi, pi_param = build_actor_net(x,
                                       'actor_net',
                                       trainable=True,
                                       CONTINUOUS=args.continuous)
        old_pi, old_pi_param = build_actor_net(x,
                                               'old_actor_net',
                                               trainable=False,
                                               CONTINUOUS=args.continuous)
        self.syn_old_pi = [
            oldp.assign(p) for p, oldp in zip(pi_param, old_pi_param)
        ]

        single_sample = tf.squeeze(pi.sample(1), axis=0)
        if not args.continuous:
            self.sample_op = single_sample  # class indices, shape (batch, num_action)
            self.best_action_op = tf.one_hot(
                tf.argmax(tf.squeeze(pi.probs, axis=0), axis=-1),
                self.num_action_values)  # greedy action as one_hot
        else:
            self.sample_op = tf.clip_by_value(single_sample,
                                              self.action_bound[0][0],
                                              self.action_bound[1][0])
            self.best_action_op = tf.clip_by_value(pi.mean(),
                                                   self.action_bound[0][0],
                                                   self.action_bound[1][0])
        # tensorboard
        single_sample_outputs = tf.split(single_sample,
                                         self.num_action,
                                         axis=1)
        for action_name, out in zip(self.action_names, single_sample_outputs):
            tf.summary.histogram(
                "ActionDistribution/single_sample_{}".format(action_name), out)

        # Losses
        with tf.variable_scope('critic_loss'):
            diff_ypred_y = self.target_returns - self.value
            self.critic_loss_ = tf.square(diff_ypred_y)
            CLIP_VALUE_OPTIM = True  # PPO2-style clipped value objective
            if CLIP_VALUE_OPTIM:
                valueclipped = self.old_value + tf.clip_by_value(
                    self.value - self.old_value, -self.cliprange,
                    self.cliprange)
                self.clipped_critic_loss = tf.square(self.target_returns -
                                                     valueclipped)
                self.critic_loss_ = tf.maximum(self.critic_loss_,
                                               self.clipped_critic_loss)
            self.critic_loss = tf.reduce_mean(self.critic_loss_)

            self.critic_loss = tf.check_numerics(self.critic_loss,
                                                 message="after critic_loss")

        with tf.variable_scope('actor_loss'):
            self.entropy = pi.entropy()
            batch_entropy = tf.reduce_mean(self.entropy)
            # Probability ratio pi / old_pi; EPSILON guards against division
            # by zero when old_pi.prob(a) underflows.
            ratio = pi.prob(self.a) / (old_pi.prob(self.a) + self.EPSILON)
            # Numerically preferable alternative:
            # ratio = tf.exp(pi.log_prob(self.a) - old_pi.log_prob(self.a))
            pg_losses = -self.advantage * ratio
            pg_losses2 = -self.advantage * tf.clip_by_value(
                ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
            self.actor_loss = tf.reduce_mean(tf.maximum(
                pg_losses, pg_losses2)) - batch_entropy * self.entropy_coeff
            self.actor_loss = tf.check_numerics(self.actor_loss,
                                                message="after actor_loss")

        # Diagnostics. Previously gated on args.continuous because entropy/KL
        # were unavailable for the discrete case; both now work, hence `if True`.
        if True:
            self.kl = tf.distributions.kl_divergence(pi, old_pi)
            tf.summary.histogram("Diagnostics/KL", self.kl)
            tf.summary.scalar("Diagnostics/MinibatchAvgKL",
                              tf.reduce_mean(self.kl))
            tf.summary.histogram("Diagnostics/Entropy", self.entropy)
            tf.summary.scalar("Diagnostics/MinibatchAvgEntropy", batch_entropy)
        # Explained variance: 1 = perfect, (0, 1) = good, 0 = no better than
        # predicting zero, < 0 = worse than predicting zero.
        def reduce_variance(x):
            """Variance over the whole minibatch (all dims); returns a scalar."""
            means = tf.reduce_mean(x, keepdims=True)
            sqdev = tf.square(x - means)
            return tf.reduce_mean(sqdev)

        self.ev = 1 - reduce_variance(diff_ypred_y) / reduce_variance(
            self.target_returns)
        tf.summary.scalar("Diagnostics/MinibatchExplainedVariance", self.ev)

        # add tensorboard summaries
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            variable_summaries(var)
        self.merged_summaries = tf.summary.merge_all()
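# A minimal NumPy sketch (not part of the graph above) of the two PPO losses
# just constructed: the clipped surrogate actor loss and the PPO2-style
# clipped value loss. Values and cliprange=0.2 are illustrative assumptions.
import numpy as np

def ppo_actor_loss(ratio, advantage, cliprange=0.2):
    pg1 = -advantage * ratio
    pg2 = -advantage * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.maximum(pg1, pg2).mean()  # pessimistic (clipped) bound

def ppo_value_loss(vpred, old_vpred, returns, cliprange=0.2):
    vclipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    return np.maximum(np.square(returns - vpred),
                      np.square(returns - vclipped)).mean()

ratio = np.array([0.5, 1.0, 1.5])        # pi.prob(a) / old_pi.prob(a)
advantage = np.array([1.0, 1.0, -1.0])
print(ppo_actor_loss(ratio, advantage))  # clipping caps the update incentive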
Example #21
    def _init(self, ob_space, ac_space, architecture_size):
        """

        :param ob_space: (Gym Space) The observation space of the environment
        :param ac_space: (Gym Space) The action space of the environment
        :param architecture_size: (str) size of the policy's architecture
               (small as in A3C paper, large as in Nature DQN)
        """
        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

        with tf.variable_scope(self.name, reuse=self.reuse):
            normalized_obs = obs / 255.0
            if architecture_size == 'small':  # from A3C paper
                layer_1 = tf.nn.relu(
                    tf_util.conv2d(normalized_obs,
                                   16,
                                   "l1", [8, 8], [4, 4],
                                   pad="VALID"))
                layer_2 = tf.nn.relu(
                    tf_util.conv2d(layer_1,
                                   32,
                                   "l2", [4, 4], [2, 2],
                                   pad="VALID"))
                flattened_layer_2 = tf_util.flattenallbut0(layer_2)
                last_layer = tf.nn.relu(
                    tf.layers.dense(
                        flattened_layer_2,
                        256,
                        name='lin',
                        kernel_initializer=tf_util.normc_initializer(1.0)))
            elif architecture_size == 'large':  # Nature DQN
                layer_1 = tf.nn.relu(
                    tf_util.conv2d(normalized_obs,
                                   32,
                                   "l1", [8, 8], [4, 4],
                                   pad="VALID"))
                layer_2 = tf.nn.relu(
                    tf_util.conv2d(layer_1,
                                   64,
                                   "l2", [4, 4], [2, 2],
                                   pad="VALID"))
                layer_3 = tf.nn.relu(
                    tf_util.conv2d(layer_2,
                                   64,
                                   "l3", [3, 3], [1, 1],
                                   pad="VALID"))
                flattened_layer_3 = tf_util.flattenallbut0(layer_3)
                last_layer = tf.nn.relu(
                    tf.layers.dense(
                        flattened_layer_3,
                        512,
                        name='lin',
                        kernel_initializer=tf_util.normc_initializer(1.0)))
            else:
                raise NotImplementedError

            logits = tf.layers.dense(
                last_layer,
                pdtype.param_shape()[0],
                name='logits',
                kernel_initializer=tf_util.normc_initializer(0.01))

            self.proba_distribution = pdtype.proba_distribution_from_flat(
                logits)
            self.vpred = tf.layers.dense(
                last_layer,
                1,
                name='value',
                kernel_initializer=tf_util.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        if self.stochastic_ph is None:
            self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = self.proba_distribution.sample()
        self._act = tf_util.function([self.stochastic_ph, obs],
                                     [action, self.vpred])
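# Shape arithmetic for the VALID-padded conv stacks above (a standalone
# sketch; the 84x84 Atari input size is an assumption, the policy itself
# accepts any Box observation space):
def valid_out(size, kernel, stride):
    return (size - kernel) // stride + 1

s = 84
s = valid_out(s, 8, 4)  # l1 -> 20
s = valid_out(s, 4, 2)  # l2 -> 9
s = valid_out(s, 3, 1)  # l3 -> 7 (large/Nature-DQN stack only)
print(s * s * 64)       # 3136 features flattened into the dense(512) layer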
Example #22
    def __build_graph(self, ob_space, ac_space, gaussian_fixed_var=True):

        self.pdtype = pdtype = make_pdtype(ac_space)

        assert not isinstance(ob_space, gym.spaces.tuple.Tuple)
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))
        ob_g, ob_l = tf.split(ob, 2, axis=1)
        ob_g = tf.squeeze(ob_g, axis=1) - 128.0
        ob_l = tf.squeeze(ob_l, axis=1) - 128.0

        # Conv layer
        net = slim.convolution(ob_g,
                               96, [7, 7],
                               2,
                               padding='VALID',
                               scope='conv1',
                               activation_fn=tf.nn.relu,
                               reuse=tf.AUTO_REUSE)
        net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4, beta=0.75)
        net = slim.pool(net, [3, 3],
                        'MAX',
                        stride=2,
                        padding='VALID',
                        scope='pool1')

        net = slim.convolution(net,
                               256, [5, 5],
                               2,
                               padding='VALID',
                               scope='conv2',
                               activation_fn=tf.nn.relu,
                               reuse=tf.AUTO_REUSE)
        net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4, beta=0.75)
        net = slim.pool(net, [3, 3],
                        'MAX',
                        stride=2,
                        padding='VALID',
                        scope='pool2')

        net = slim.convolution(net,
                               512, [3, 3],
                               1,
                               padding='VALID',
                               scope='conv3',
                               activation_fn=tf.nn.relu,
                               reuse=tf.AUTO_REUSE)

        net_g = slim.convolution(net,
                                 512, [3, 3],
                                 1,
                                 padding='VALID',
                                 scope='conv4',
                                 activation_fn=tf.nn.relu,
                                 reuse=tf.AUTO_REUSE)

        net = slim.convolution(ob_l,
                               96, [7, 7],
                               2,
                               padding='VALID',
                               scope='conv1',
                               activation_fn=tf.nn.relu,
                               reuse=tf.AUTO_REUSE)
        net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4, beta=0.75)
        net = slim.pool(net, [3, 3],
                        'MAX',
                        stride=2,
                        padding='VALID',
                        scope='pool1')

        net = slim.convolution(net,
                               256, [5, 5],
                               2,
                               padding='VALID',
                               scope='conv2',
                               activation_fn=tf.nn.relu,
                               reuse=tf.AUTO_REUSE)
        net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4, beta=0.75)
        net = slim.pool(net, [3, 3],
                        'MAX',
                        stride=2,
                        padding='VALID',
                        scope='pool2')

        net = slim.convolution(net,
                               512, [3, 3],
                               1,
                               padding='VALID',
                               scope='conv3',
                               activation_fn=tf.nn.relu,
                               reuse=tf.AUTO_REUSE)

        net_l = slim.convolution(net,
                                 512, [3, 3],
                                 1,
                                 padding='VALID',
                                 scope='conv4',
                                 activation_fn=tf.nn.relu,
                                 reuse=tf.AUTO_REUSE)

        # Concat Features
        self.feat = feat = tf.concat(
            [U.flattenallbut0(net_g),
             U.flattenallbut0(net_l)], 1)

        # fcs_actor
        net = slim.fully_connected(feat,
                                   512,
                                   scope='polfc1',
                                   activation_fn=tf.nn.relu)
        # pdparam = slim.fully_connected(net, 4, scope='polfc2', activation_fn=None)
        mean = slim.fully_connected(net,
                                    pdtype.param_shape()[0] // 2,
                                    scope='polfc2',
                                    activation_fn=None)
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, logstd], axis=1)
        self.pd = pdtype.pdfromflat(pdparam)

        # fcs_value
        net = slim.fully_connected(feat,
                                   512,
                                   scope='vffc1',
                                   activation_fn=tf.nn.relu)
        self.vpred = slim.fully_connected(net,
                                          1,
                                          scope='vffc2',
                                          activation_fn=None)

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
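# How a flat pdparam like the one above is typically consumed: the first half
# is the mean, the second half the state-independent log std of a diagonal
# Gaussian. A NumPy sketch mirroring that convention (not the exact pd class):
import numpy as np

def diag_gaussian_sample(pdparam, rng):
    mean, logstd = np.split(pdparam, 2, axis=-1)
    return mean + np.exp(logstd) * rng.standard_normal(mean.shape)

pdparam = np.array([[0.1, -0.3, 0.0, 0.0]])  # 2 action dims: means, logstds
print(diag_gaussian_sample(pdparam, np.random.default_rng(0)))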
Example #23
    def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers,
              kind):
        assert isinstance(ob_space, gym.spaces.Box)
        assert isinstance(sensor_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        ob_sensor = U.get_placeholder(name="ob_sensor",
                                      dtype=tf.float32,
                                      shape=[sequence_length] +
                                      list(sensor_space.shape))

        ## Obfilter on sensor output
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=sensor_space.shape)

        obz_sensor = tf.clip_by_value(
            (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        #x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))

        ## Adapted from mlp_policy
        last_out = obz_sensor
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                name="vffc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        y = tf.layers.dense(last_out,
                            64,
                            name="vffinal",
                            kernel_initializer=U.normc_initializer(1.0))

        #y = ob_sensor
        #y = obz_sensor
        #y = tf.nn.relu(U.dense(y, 64, 'lin_ob', U.normc_initializer(1.0)))

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                64,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        print(x.shape, y.shape)  # debug: image-feature vs. sensor-feature shapes
        x = tf.concat([x, y], 1)

        ## Saver
        # self.saver = tf.train.Saver()

        logits = tf.layers.dense(x,
                                 pdtype.param_shape()[0],
                                 name="logits",
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(
            x, 1, name="value",
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob, ob_sensor],
                               [ac, self.vpred, logits])
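# Sketch of the obfilter idea above: keep running statistics of the sensor
# stream and normalize with clipping. A simplified stand-in for
# RunningMeanStd (the real class may also aggregate stats across workers):
import numpy as np

class SimpleRunningMeanStd:
    def __init__(self, shape, epsilon=1e-2):
        self.mean, self.var, self.count = np.zeros(shape), np.ones(shape), epsilon

    def update(self, batch):
        b_mean, b_var, b_n = batch.mean(0), batch.var(0), batch.shape[0]
        delta, tot = b_mean - self.mean, self.count + b_n
        self.mean = self.mean + delta * b_n / tot
        self.var = (self.var * self.count + b_var * b_n
                    + delta ** 2 * self.count * b_n / tot) / tot
        self.count = tot

rms = SimpleRunningMeanStd(shape=(3,))
batch = np.random.randn(32, 3) * 2.0 + 1.0
rms.update(batch)
obz_sensor = np.clip((batch - rms.mean) / np.sqrt(rms.var), -5.0, 5.0)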
Example #24
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        ob_2 = U.get_placeholder(
            name="ob_2", dtype=tf.float32, shape=[sequence_length] +
            [5])  # observations to feed in after convolutions
        ob_2_fc = tf.nn.relu(
            tf.layers.dense(ob_2,
                            64,
                            name='s2_preproc',
                            kernel_initializer=U.normc_initializer(1.0)))

        x = ob / 25.
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [2, 8], [1, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [2, 4], [1, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
            x = tf.concat([x, ob_2_fc], axis=-1)
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [2, 8], [1, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [2, 4], [1, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [2, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
            x = tf.concat([x, ob_2_fc], axis=-1)
        else:
            raise NotImplementedError

        x = tf.nn.relu(
            tf.layers.dense(x,
                            256,
                            name='merged_lin',
                            kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(x,
                                 pdtype.param_shape()[0],
                                 name='logits',
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(
            x, 1, name='value',
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(name="stochastic", dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob, ob_2], [ac, self.vpred])
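# The normc_initializer used throughout these examples draws Gaussian weights
# and rescales every output column to a fixed L2 norm. A NumPy sketch of that
# behavior (mirrors baselines' U.normc_initializer up to dtype handling):
import numpy as np

def normc_init(shape, std=1.0):
    out = np.random.randn(*shape)
    out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
    return out

w = normc_init((64, 256))
print(np.linalg.norm(w, axis=0)[:3])  # every column has norm == std == 1.0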
Example #25
    def _init(self,
              sensor_name,
              sensor_shape,
              ac_space,
              measure_name,
              measure_shape,
              init_std=1.0):
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        self.sensor = utils.get_placeholder(name=sensor_name,
                                            dtype=tf.float32,
                                            shape=[sequence_length] +
                                            list(sensor_shape))
        self.measure = utils.get_placeholder(name=measure_name,
                                             dtype=tf.float32,
                                             shape=[sequence_length] +
                                             list(measure_shape))
        with tf.variable_scope("measurefilter"):
            self.ms_rms = RunningMeanStd(shape=measure_shape)

        obscaled = self.sensor / 255.0
        m = tf.clip_by_value(
            (self.measure - self.ms_rms.mean) / self.ms_rms.std, -5.0, 5.0)

        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(
                utils.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(
                utils.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))

            m = tf.nn.tanh(
                tf.layers.dense(
                    m,
                    32,
                    name="fc1",
                    kernel_initializer=utils.normc_initializer(1.0)))

            x = utils.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(
                    x,
                    128,
                    name='lin',
                    kernel_initializer=utils.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                x,
                1,
                name='value',
                kernel_initializer=utils.normc_initializer(1.0))
            self.vpredz = self.vpred

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(
                utils.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(
                utils.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = utils.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(
                    x,
                    128,
                    name='lin',
                    kernel_initializer=utils.normc_initializer(1.0)))

            self.action_dim = ac_space.shape[0]

            self.dist_diagonal = True
            self.varphi = x
            self.varphi_dim = 128

            stddev_init = np.ones([1, self.action_dim]) * init_std
            prec_init = 1. / (np.multiply(stddev_init, stddev_init))  # 1 x |a|
            self.prec = tf.get_variable(
                name="prec",
                shape=[1, self.action_dim],
                initializer=tf.constant_initializer(prec_init))
            kt_init = np.ones([self.varphi_dim, self.action_dim
                               ]) * 0.5 / self.varphi_dim
            ktprec_init = kt_init * prec_init
            self.ktprec = tf.get_variable(
                name="ktprec",
                shape=[self.varphi_dim, self.action_dim],
                initializer=tf.constant_initializer(ktprec_init))
            kt = tf.divide(self.ktprec, self.prec)
            mean = tf.matmul(x, kt)

            logstd = tf.log(tf.sqrt(1. / self.prec))

            self.prec_get_flat = utils.GetFlat([self.prec])
            self.prec_set_from_flat = utils.SetFromFlat([self.prec])

            self.ktprec_get_flat = utils.GetFlat([self.ktprec])
            self.ktprec_set_from_flat = utils.SetFromFlat([self.ktprec])

            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.scope = tf.get_variable_scope().name

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = utils.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = utils.function([stochastic, self.sensor], [ac, self.vpred])

        # Get all trainable policy parameters
        pol_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     self.scope + '/pol')
        # Drop the log-linear parameters ktprec and prec (the last two
        # variables created in the "pol" scope) to keep only the non-linear
        # feature parameters.
        del pol_vars[-1]
        del pol_vars[-1]
        beta_params = pol_vars

        # Flat w_beta
        beta_len = np.sum(
            [np.prod(p.get_shape().as_list()) for p in beta_params])
        w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

        # Unflatten w_beta
        beta_shapes = list(map(tf.shape, beta_params))
        w_beta_unflat_var = self.unflatten_tensor_variables(
            w_beta_var, beta_shapes)

        # w_beta^T * \grad_beta \varphi(s)^T
        v = tf.placeholder(dtype=self.varphi.dtype,
                           shape=self.varphi.get_shape(),
                           name="v_in_Rop")
        features_beta = self.alternative_Rop(self.varphi, beta_params,
                                             w_beta_unflat_var, v)

        self.features_beta = utils.function([self.sensor, w_beta_var, v],
                                            features_beta)
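# alternative_Rop above computes a Jacobian-vector product with respect to the
# non-linear parameters. A sketch of the standard TF1 double-gradient trick
# such an R-op is commonly built from (an assumption about its internals):
import tensorflow as tf

def rop(y, xs, ws, dummy):
    """(dy/dxs) @ ws via two reverse-mode passes; dummy has y's shape."""
    g = tf.gradients(y, xs, grad_ys=dummy)        # J^T dummy, linear in dummy
    return tf.gradients(g, dummy, grad_ys=ws)[0]  # grad w.r.t. dummy recovers J @ ws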