Example #1
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
        #assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        # Apply RNN to reduce history
        with tf.variable_scope("vf"):
            last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                last_out = U.dense(last_out, hid_size, "vf_dense%i"%i, weight_init=U.normc_initializer(1.0))
            self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        # Apply RNN to reduce history
        with tf.variable_scope("pf"):
            last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                last_out = U.dense(last_out, hid_size, "pf_dense%i"%i, weight_init=U.normc_initializer(1.0))

            assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #2
    def _create_network(self):
        x = self.ob

        # create ob filter
        if self.ob_filter:
            self.ob_rms = RunningMeanStd(shape=self.ob_space.shape)
            x = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # actor
        l = x
        l = tf.nn.tanh(
            U.dense(l, 32, "a_1", weight_init=U.normc_initializer(1.0)))
        l = tf.nn.tanh(
            U.dense(l, 32, "a_2", weight_init=U.normc_initializer(1.0)))
        action_layer = l

        # critic
        l = x
        l = tf.nn.tanh(
            U.dense(l, 32, "c_1", weight_init=U.normc_initializer(1.0)))
        l = tf.nn.tanh(
            U.dense(l, 32, "c_2", weight_init=U.normc_initializer(1.0)))
        value_layer = l

        self._create_logit_value(action_layer, value_layer,
                                 self.gaussian_fixed_var)
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #4
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #5
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
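A minimal usage sketch for the `_act` function built above (not part of the original example; `pi` and `env` are hypothetical names for a policy instance and a Gym environment):

ob = env.reset()
ac, vpred = pi._act(True, ob[None])  # stochastic=True, add a batch axis of 1
ac, vpred = ac[0], vpred[0]          # strip the batch axis again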
Example #6
    def _create_logit_value(self,
                            action_layer,
                            value_layer,
                            gaussian_fixed_var=False):
        # actor
        if gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box):
            mean = U.dense(action_layer,
                           self.pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, self.pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(action_layer,
                              self.pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = self.pdtype.pdfromflat(pdparam)
        self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        # critic
        self.vpred = U.dense(value_layer,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]
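The `mean * 0.0 + logstd` idiom broadcasts the state-independent `logstd` (shape `[1, ac_dim]`) up to the batch shape of `mean`, so the two halves of `pdparam` line up. A NumPy sketch of the same broadcast (illustration only, hypothetical shapes):

import numpy as np
mean = np.zeros((32, 6), dtype=np.float32)   # batch of 32 action means
logstd = np.zeros((1, 6), dtype=np.float32)  # one shared log-std row
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
assert pdparam.shape == (32, 12)             # [mean | broadcast logstd]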
Example #7
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape)  #obs
        print(ob_shape)

        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            '''
            h = conv(X, 'c1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h2 = conv(h, 'c2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h3 = conv(h2, 'c3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

            hh = conv(X, 'xc1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh2 = conv(hh, 'xc2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh3 = conv(hh2, 'xc3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh3 = conv_to_fc(hh3)
            hh4 = fc(hh3, 'xfc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
            vf = fc(hh4, 'v', 1, act=lambda x:x)[:,0]

            '''
            x = tf.nn.relu(U.conv2d(X, 32, "l1", [3, 3], [1, 1], pad="SAME"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [3, 3], [1, 1], pad="SAME"))
            x = tf.nn.relu(U.conv2d(x, 128, "l3", [3, 3], [1, 1], pad="SAME"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))

            y = tf.nn.relu(U.conv2d(X, 32, "yl1", [3, 3], [1, 1], pad="SAME"))
            y = tf.nn.relu(U.conv2d(y, 64, "yl2", [3, 3], [1, 1], pad="SAME"))
            y = tf.nn.relu(U.conv2d(y, 128, "yl3", [3, 3], [1, 1], pad="SAME"))
            y = U.flattenallbut0(y)
            y = tf.nn.relu(U.dense(y, 512, 'ylin', U.normc_initializer(1.0)))

            pi = U.dense(x,
                         pdtype.param_shape()[0], "logits",
                         U.normc_initializer(0.01))
            vf = U.dense(y, 1, "value", U.normc_initializer(1.0))[:, 0]

        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
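A hypothetical rollout call against the `step` and `value` closures defined above (`policy` and `obs` are placeholder names; `obs` would be a batch of `nbatch` observations):

actions, values, _, neglogps = policy.step(obs)  # sample actions for the whole batch
values_only = policy.value(obs)                  # value estimates alone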
Example #8
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #9
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    
        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #10
def resnet(inputs, hid_size, name):
    x = U.dense(inputs, hid_size, "%s_dense1"%name, weight_init=U.normc_initializer(1.0))
    #x = tf.contrib.layers.batch_norm(x)
    x = tf.nn.relu(x)
    x = U.dense(x, hid_size, "%s_dense2"%name, weight_init=U.normc_initializer(1.0))
    #x = tf.contrib.layers.batch_norm(x)
    x = tf.nn.relu(x+inputs)
    return x
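A sketch of how the `resnet` block above might be chained (hypothetical helper; it assumes `inputs` already has `hid_size` units so the skip connection `x + inputs` is shape-compatible):

def resnet_stack(inputs, hid_size, num_blocks, name):
    # stack several residual blocks under distinct variable names
    x = inputs
    for i in range(num_blocks):
        x = resnet(x, hid_size, "%s_block%i" % (name, i))
    return x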
    def build_forward(self, state, reuse):
        # build noise samples
        batch_size = [state.get_shape().as_list()[0], self.input_dim]
        noise_dist = tfd.Normal(loc=0., scale=1.)
        noise_samples = noise_dist.sample(
            batch_size)  # size of [batchsize, action dim]
        # build forward
        last_out = state
        self.meandict = meandict = []
        self.logstddict = logstddict = []
        with tf.variable_scope('forward', reuse=reuse):
            for i in range(self.num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            self.hid_size,
                            "polfc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            for k in range(self.K):
                mean = U.dense(last_out, self.input_dim,
                               "polfinal_{}".format(k),
                               U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd_{}".format(k),
                                         shape=[1, self.input_dim],
                                         initializer=tf.zeros_initializer())
                meandict.append(mean)
                logstddict.append(logstd)
        meandicttf = tf.concat(meandict,
                               axis=1)  # size of [batchsize, action dim * K]
        logstddicttf = tf.concat(logstddict, axis=1)
        # generate masks
        logits = [0.0] * self.K
        num_samples = self.state.shape.as_list()[0]
        categorical_mask = tf.multinomial([logits], num_samples)
        #print('categoricalmask', categorical_mask)
        onehot_mask = tf.squeeze(tf.one_hot(categorical_mask, self.K), 0)
        #print('onehotmask', onehot_mask)
        onehot_mask_tiled = tf.squeeze(tf.reshape(
            tf.tile(tf.expand_dims(onehot_mask, axis=2),
                    [1, 1, self.input_dim]), [-1, self.input_dim * self.K, 1]),
                                       axis=2)
        # select
        mean_tiled = tf.multiply(
            onehot_mask_tiled,
            meandicttf)  # size of [batchsize, action dim * K]
        logstd_tiled = tf.multiply(onehot_mask_tiled, logstddicttf)
        # sample action mean and logstd
        mean = tf.reshape(
            mean_tiled,
            [-1, self.K, self.input_dim])  # size of [batchsize, K, action dim]
        logstd = tf.reshape(logstd_tiled, [-1, self.K, self.input_dim])
        mean_final = tf.reduce_sum(
            mean, axis=1, keepdims=True)  # size of [batchsize, action dim]
        logstd_final = tf.reduce_sum(logstd, axis=1, keepdims=True)
        # sample action
        action = tf.exp(logstd_final) * noise_samples + mean_final
        self.y_sample = action
Example #12
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, num_units=3, num_layers=4):
        assert isinstance(ob_space, gym.spaces.Box)

        nbatch_train = 1024
        nbatch_vf_train = 64
        nbatch_fvp_train = 205 # sub-sampled size
        self.ob_train = ob_train = U.get_placeholder(name="ob_train", dtype=tf.float32, shape=[nbatch_train] + list(ob_space.shape))
        self.action_train = action_train = U.get_placeholder(name='ac_train', dtype=tf.float32, shape=[nbatch_train] + list(ac_space.shape))
        ob_act = U.get_placeholder(name="ob_act", dtype=tf.float32, shape=[1] + list(ob_space.shape))
        action_act = U.get_placeholder(name='ac_act', dtype=tf.float32, shape=[1] + list(ac_space.shape))
        self.ob_vf_train = ob_vf_train = U.get_placeholder(name="ob_vf_train", dtype=tf.float32, shape=[nbatch_vf_train] + list(ob_space.shape))
        self.ob_fvp_train = ob_fvp_train = U.get_placeholder(name="ob_fvp_train", dtype=tf.float32, shape=[nbatch_fvp_train] + list(ob_space.shape))
        self.ac_fvp_train = action_fvp_train = U.get_placeholder(name="ac_fvp_act", dtype=tf.float32, shape=[nbatch_fvp_train] + list(ac_space.shape))
        
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz_train = tf.clip_by_value((ob_train - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz_act = tf.clip_by_value((ob_act - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz_vf_train = tf.clip_by_value((ob_vf_train - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz_fvp_train = tf.clip_by_value((ob_fvp_train - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # value function
        last_out = obz_vf_train
        with tf.variable_scope('value', reuse=False):
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            self.vpred_train = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
        last_out = obz_act
        with tf.variable_scope('value', reuse=True):
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            self.vpred_act = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]
        
        # policy
        policy_train = NormalizingFlowStateModel(obz_train, action_train, name='policy', reuse=False, num_units=num_units, num_layers=num_layers)
        policy_act = NormalizingFlowStateModel(obz_act, action_act, name='policy', reuse=True, num_units=num_units, num_layers=num_layers)
        policy_fvp_train = NormalizingFlowStateModel(obz_fvp_train, action_fvp_train, name='policy', reuse=True, num_units=num_units, num_layers=num_layers)
        self.pi_act = policy_act.y_sample  #act for forward sampling
        self.pi_train = policy_fvp_train.y_sample  #for fvp
        self.entropy_train = policy_train.entropy
        self.log_prob_act = policy_act.log_prob
        self.action_act = action_act
        self.log_prob_train = policy_train.log_prob  #logprob
        self.log_prob_fvp_train = policy_fvp_train.log_prob        
        
        self.state_in = []
        self.state_out = []

        #stochastic = tf.placeholder(dtype=tf.bool, shape=())
        #ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #self._act = U.function([stochastic, ob], [ac, self.vpred])
        self._act = U.function([ob_act], [self.pi_act, self.vpred_act])
        self.ob_act = ob_act
Example #13
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True, keep=1.0):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

        if activation == 'tanh':
            activ = tf.nn.tanh
        elif activation == 'elu':
            activ = tf.nn.elu
        elif activation == 'lrelu':
            activ = lambda x: tf.maximum(x, 0.01 * x)
        else:
            raise NotImplementedError("Not available activation: " + activation)

        if PREPROCESS:
            last_out = ob
        else:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz

        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = ob
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #14
    def build_graph(self, ob, ac, scope, hid_layer, hid_size, out_size):
        filters, strides, cnn_type = U.cnn(self.rnd_cnn_type)
        logger.log(f'critic cnn type: {cnn_type}')
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            cnn_layer = tf.nn.conv2d(ob, filters[0], strides=strides[0], padding="VALID")
            assert len(filters) > 1 and len(strides) == len(filters)
            for i in np.arange(1, len(filters)):
                cnn_layer = tf.nn.conv2d(cnn_layer, filters[i], strides[i], "VALID")
            ob = tf.reshape(cnn_layer, [-1, int(np.prod(cnn_layer.shape[1:]))])   # flatten cnn output, except the batch axis #1100+
            logger.log(f"critic cnn ob output shape: {ob.shape}")

            layer = ob
            list_of_output_shape = [500]  # 1000 -> 500 -> 100
            logger.log(f"critic cnn dense: {list_of_output_shape}")
            weights, biases = U.dense(layer, list_of_output_shape)
            for i in range(len(list_of_output_shape) - 1):
                layer = tf.add(tf.matmul(layer, weights[i]), biases[i])
                layer = tf.nn.relu(layer)
            layer = tf.add(tf.matmul(layer, weights[-1]), biases[-1])
            ob = layer

            layer = tf.concat([ob, ac], axis=1)
            for _ in range(hid_layer):
                layer = tf.layers.dense(layer, hid_size, activation=tf.nn.leaky_relu)
            layer = tf.layers.dense(layer, out_size, activation=None)
            logger.log(f"[ob, ac] dense hid_layer: {hid_layer}, hid_size: {hid_size}, out_size: {out_size}")
        return layer
Example #15
    def __init__(self, ob_dim, ac_dim, hid_size=128, num_hid_layers=2): #pylint: disable=W0613
        X = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}

        last_out = X
        for i in range(num_hid_layers):
            last_out = tf.nn.selu(U.dense(last_out, hid_size, "vffc%i"%(i + 1), weight_init=U.normc_initializer(1.0))) # bias_init=0, weight_loss_dict=wd_dict
        vpred_n = dense(last_out, 1, "hfinal", weight_init=None, bias_init=0, weight_loss_dict=wd_dict)[:,0]

        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1.0 - 0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                                    async=1, kfac_update=2, cold_iter=50, \
                                    weight_decay_dict=wd_dict, max_grad_norm=1.0)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables
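A minimal usage sketch, assuming `vf` is an instance of the class above and `feats` / `vtargs` are NumPy arrays of the expected shapes (placeholder names):

vpred = vf._predict(feats)    # feats: [batch, ob_dim*2 + ac_dim*2 + 2]
vf.do_update(feats, vtargs)   # one K-FAC fitting step on the value loss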
Example #16
    def _create_network(self):
        l = self.ob / 255.0
        if self.kind == 'small':  # from A3C paper
            l = tf.nn.relu(U.conv2d(l, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            l = tf.nn.relu(U.conv2d(l, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            l = U.flattenallbut0(l)
            l = tf.nn.relu(U.dense(l, 256, 'lin', U.normc_initializer(1.0)))
        elif self.kind == 'large':  # Nature DQN
            l = tf.nn.relu(U.conv2d(l, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            l = tf.nn.relu(U.conv2d(l, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            l = tf.nn.relu(U.conv2d(l, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            l = U.flattenallbut0(l)
            l = tf.nn.relu(U.dense(l, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        self._create_logit_value(l, l)
Example #17
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, sigma_z=1.0, phi=None, normalize=True):
        assert isinstance(ob_space, gym.spaces.Box)
        self.pdtype = pdtype = make_ar_pdtype(ac_space)
        sequence_length = None
        self.sigma_z = sigma_z
        if phi is not None:
            p = len(phi)
        else:
            p = 0
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, p + 1] + list(ob_space.shape))
        acs = U.get_placeholder(name="ac", dtype=tf.float32, shape=[sequence_length, p] + list(ac_space.shape))
        past_x = U.get_placeholder(name="past_x", dtype=tf.float32, shape=[sequence_length, p] + list(ac_space.shape))
        update_mask = U.get_placeholder(name="update_mask", dtype=tf.float32, shape=[sequence_length, p, 1])

        if normalize:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape[-1])

        with tf.variable_scope('vf'):
            if normalize:
                obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            else:
                obz = ob
            last_out = obz[:, -1, :]
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(dense(last_out, hid_size, name="fc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            self.vpred = dense(last_out, 1, name='final', weight_init=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            obz = tf.reshape(obz, [-1, obz.shape[-1]])
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(dense(last_out, hid_size, name='fc%i'%(i+1), weight_init=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = dense(last_out, pdtype.param_shape()[0]//2, name='final', weight_init=U.normc_initializer(0.01))
                mean = tf.reshape(mean, [-1, mean.shape[-1] * (p + 1)])
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean[:, :ac_space.shape[-1]] * 0.0 + logstd], axis=1)
            else:
                pdparam = U.dense(last_out, pdtype.param_shape()[0], name='final', weight_init=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam, phi, sigma_z)

        self.state_in = []
        self.state_out = []
        ac, past_x_next = self.pd.sample(acs, past_x, update_mask)
        self._act = U.function([ob, acs, past_x, update_mask], [ac, self.vpred, mean, logstd, past_x_next])
Example #18
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, gmm_comp=1,
              mirror_loss=False, observation_permutation=[], action_permutation=[]):
        assert isinstance(ob_space, gym.spaces.Box)
        if mirror_loss:
            assert gaussian_fixed_var  # assume fixed std for now

        self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
        sequence_length = None
        self.mirror_loss = mirror_loss
        if mirror_loss:
            # construct permutation matrices
            obs_perm_mat = np.zeros((len(observation_permutation), len(observation_permutation)), dtype=np.float32)
            act_perm_mat = np.zeros((len(action_permutation), len(action_permutation)), dtype=np.float32)
            for i, perm in enumerate(observation_permutation):
                obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
            for i, perm in enumerate(action_permutation):
                act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = ob
        params = []
        for i in range(num_hid_layers):
            rt, pw, pb = U.dense_wparams(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))
            last_out = tf.nn.tanh(rt)
            params.append([pw, pb])
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            if gmm_comp == 1:
                mean, pw, pb = U.dense_wparams(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                                               U.normc_initializer(0.01))
                params.append([pw, pb])
                self.mean = mean
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                         initializer=tf.zeros_initializer())
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                means = U.dense(last_out, (pdtype.param_shape()[0] - gmm_comp) // 2, "polfinal",
                                U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd",
                                         initializer=tf.constant(np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                                                                         dtype=np.float32) * (-1.0)))
                weights = tf.nn.softmax(U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
                pdparam = U.concatenate([means, means * 0.0 + logstd, weights], axis=1)
        elif gmm_comp == 1:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
        else:
            meanstd = U.dense(last_out, pdtype.param_shape()[0] - gmm_comp, "polfinal", U.normc_initializer(0.01))
            weights = tf.nn.softmax(U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
            pdparam = U.concatenate([meanstd, weights], axis=1)

        if mirror_loss:
            mirrored_ob = tf.matmul(ob, obs_perm_mat)
            last_val = mirrored_ob
            for i in range(len(params) - 1):
                last_val = tf.nn.tanh(tf.matmul(last_val, params[i][0]) + params[i][1])
            mean_mir_obs = tf.matmul(last_val, params[-1][0]) + params[-1][1]
            self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

        if gmm_comp == 1:
            self.pd = pdtype.pdfromflat(pdparam)
        else:
            self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        #self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]
        #last_out0 = tf.Print(last_out0,[tf.size(last_out0[:,0])])
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        termination_sample = tf.constant([True])

        # define the angle
        #ctrl_in = tf.reshape([(tf.math.atan2(ob[:,1],ob[:,0])),(ob[:,2])], [-1,2])
        #last_out = ctrl_in
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        #ac = tf.Print (ac, [ac], "action after selection: ")
        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
Example #20
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0,
              w_intfc=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = dense3D2(last_out,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "termfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OP", weight_init=U.normc_initializer(1.0)))
        # pdb.set_trace()
        # self.op_pi = tf.constant(1./num_options)

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "intfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.intfc = tf.sigmoid(
            U.dense(last_out,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "OP%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.op_pi = tf.nn.softmax(
            U.dense(last_out,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])
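A hypothetical query of the per-option heads built above (`pi` is a policy instance, `ob` a single observation, `opt` the active option index; the batch axis is added manually):

vpred, = pi.get_vpred(ob[None], [opt])   # value of ob under option opt, shape [1]
op_probs, = pi._get_op(ob[None])         # softmax over options, shape [1, num_options]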
Example #21
    def _init(self,
              ob_space,
              ac_space,
              kind,
              num_options=2,
              dc=0,
              w_intfc=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            hidden = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            hidden = tf.nn.relu(
                tf.layers.dense(x,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = dense3D2(hidden,
                          pdtype.param_shape()[0],
                          "polfinal",
                          option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

        self.vpred = dense3D2(hidden,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(hidden),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(hidden), num_options, "OP", weight_init=U.normc_initializer(1.0)))

        self.op_pi = tf.nn.softmax(
            U.dense(hidden,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        self.intfc = tf.sigmoid(
            U.dense(hidden,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              gmm_comp=1,
              mirror_loss=False,
              observation_permutation=[],
              action_permutation=[]):
        assert isinstance(ob_space, gym.spaces.Box)
        if mirror_loss:
            assert gaussian_fixed_var  # assume fixed std for now

        self.pdtype = pdtype = make_pdtype(
            ac_space, gmm_comp
        )  #pd = probability distribution -- distrib of possible actions
        sequence_length = None
        self.mirror_loss = mirror_loss
        if mirror_loss:
            # construct permutation matrices
            obs_perm_mat = np.zeros(
                (len(observation_permutation), len(observation_permutation)),
                dtype=np.float32
            )  #implements mirror loss using permutation matrices
            act_perm_mat = np.zeros(
                (len(action_permutation), len(action_permutation)),
                dtype=np.float32
            )  #is it about // limbs learning same behavior?
            for i, perm in enumerate(
                    observation_permutation):  #to swap rows / cols of a matrix
                obs_perm_mat[i][int(np.abs(perm))] = np.sign(
                    perm)  # PA  /  AP  -- Permutation P swaps rows / cols of A
            for i, perm in enumerate(action_permutation):
                act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(
                shape=ob_space.shape
            )  # running mean/std of observations gathered from the env, used to normalize the NN input

        obz = tf.clip_by_value(
            (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
            5.0)  # Standard Normal Distrib of Obs, clipped btw [-5, 5] !
        #Z = (X - μ)/σ where Z is the value on the standard normal distribution,
        #X is the value on the original distribution,
        #μ is the mean of the original distribution, and
        #σ is the standard deviation of the original distribution.
        last_out = obz  #input
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(
            last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0)
        )[:,
          0]  #prediction V network (predicts value V of state)  -- value function

        last_out = obz  #input
        params = []
        for i in range(num_hid_layers):
            rt, pw, pb = U.dense_wparams(
                last_out,
                hid_size,
                "polfc%i" % (i + 1),
                weight_init=U.normc_initializer(1.0)
            )  #policy function NN -- policy itself, what action given obervations? (aka state)
            last_out = tf.nn.tanh(rt)
            params.append([pw, pb])
        if gaussian_fixed_var and isinstance(ac_space,
                                             gym.spaces.Box):  #usually True
            if gmm_comp == 1:  #usually True
                mean, pw, pb = U.dense_wparams(
                    last_out,
                    pdtype.param_shape()[0] // 2, "polfinal",
                    U.normc_initializer(0.01)
                )  #final is the Gaussian distrib of all possible actions
                params.append([pw, pb])
                self.mean = mean
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer(
                    ))  #log of std dev of actions distrib
                pdparam = U.concatenate(
                    [mean, mean * 0.0 + logstd], axis=1
                )  # probability distrib of actions! in given distribution
            else:
                means = U.dense(last_out,
                                (pdtype.param_shape()[0] - gmm_comp) // 2,
                                "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    initializer=tf.constant(
                        np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                                dtype=np.float32) * (-1.0)))
                weights = tf.nn.softmax(
                    U.dense(last_out, gmm_comp, "gmmweights",
                            U.normc_initializer(0.01)))
                pdparam = U.concatenate([means, means * 0.0 + logstd, weights],
                                        axis=1)
        elif gmm_comp == 1:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            meanstd = U.dense(last_out,
                              pdtype.param_shape()[0] - gmm_comp, "polfinal",
                              U.normc_initializer(0.01))
            weights = tf.nn.softmax(
                U.dense(last_out, gmm_comp, "gmmweights",
                        U.normc_initializer(0.01)))
            pdparam = U.concatenate([meanstd, weights], axis=1)

        if mirror_loss:
            mirrored_obz = tf.matmul(
                obz, obs_perm_mat
            )  # for the mirror loss, the input is the permuted observation
            last_val = mirrored_obz  # per the paper: "encourage gait symmetry by measuring the symmetry of ACTIONS (instead of states), thus avoiding the issue of delayed reward"
            for i in range(len(params) - 1):
                last_val = tf.nn.tanh(
                    tf.matmul(last_val, params[i][0]) + params[i][1]
                )  # reuse the policy weights so the action and the mirrored action can both be matched
            mean_mir_obs = tf.matmul(last_val, params[-1][0]) + params[-1][1]  # i.e., the action mean for the mirrored state
            self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

        if gmm_comp == 1:  #usually True
            self.pd = pdtype.pdfromflat(
                pdparam)  # build the probability distribution from the flat params
        else:
            self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(
            stochastic, self.pd.sample(), self.pd.mode()
        )  # take an action by sampling from the diagonal-Gaussian action distribution (or taking its mode)
        self._act = U.function(
            [stochastic, ob], [ac, self.vpred]
        )  # wrapper: given (stochastic, ob), returns the sampled action and the predicted state value (a short usage sketch follows)
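
A minimal usage sketch for the `_act` wrapper defined above. This is hedged: `pi` (an instance of this policy class) and `env` (a Gym environment) are assumed names that are not defined in the snippet.

# Hedged usage sketch; `pi` and `env` are assumed to exist already.
ob = env.reset()
for _ in range(10):
    acs, vpreds = pi._act(True, ob[None])   # stochastic=True -> sample from the policy
    ob, rew, done, _ = env.step(acs[0])     # vpreds[0] is the predicted value of `ob`
    if done:
        ob = env.reset()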
Beispiel #23
0
    def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers, V_keep_prob, pol_keep_prob,
              mc_samples, layer_norm, activation_critic, activation_actor, dropout_on_V, dropout_on_policy, tau,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)
        self.dropout_on_policy = dropout_on_policy
        #        self.pdtype = pdtype = make_pdtype(ac_space, dropout_on_policy)
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz

        self.mc_samples = mc_samples
        self.pol_keep_prob = pol_keep_prob
        self.V_keep_prob = V_keep_prob

        ### MAIN CHANGES
        #######################
        # Value function

        with tf.variable_scope("value_function"):

            dropout_networks = [last_out] * self.mc_samples
            #            dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.pol_keep_prob)

            for i in range(num_hid_layers):
                if layer_norm:
                    last_out = activation_critic(tc.layers.layer_norm(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)), center=True,scope="vffc_activation%i"%(i+1) ,scale=True))

                    apply_layer = lambda x: activation_critic(
                        tc.layers.layer_norm(tf.layers.dense(
                            x, hid_size_V, name="vffc%i" %
                            (i + 1), reuse=True),
                                             center=True,
                                             scope="vffc_activation%i" %
                                             (i + 1),
                                             scale=True,
                                             reuse=True))
                else:
                    last_out = activation_critic(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)))

                    apply_layer = lambda x: activation_critic(
                        tf.layers.dense(
                            x, hid_size_V, name="vffc%i" %
                            (i + 1), reuse=True))

                dropout_networks = generate_dropout_layer(
                    apply_layer, dropout_networks, self.V_keep_prob)

            ## final layer
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name="vffinal",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

            apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
                        name="vffinal", reuse=True)[:,0]
            dropout_networks = generate_dropout_layer(apply_layer,
                                                      dropout_networks,
                                                      self.V_keep_prob)

            self.vpred_mc_mean = tf.add_n(dropout_networks) / float(
                len(dropout_networks))
            self.vpred_dropout_networks = dropout_networks

        #######################
        ## Policy
        last_out = obz
        with tf.variable_scope("policy"):
            if not self.dropout_on_policy:
                for i in range(num_hid_layers):
                    last_out = U.dense(last_out, hid_size_actor, "polfc%i"%(i+1), \
                    weight_init=U.normc_initializer(1.0))
                    last_out = activation_actor(last_out)
                if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2, "polfinal",
                                   U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    pdparam = U.concatenate([mean, mean * 0.0 + logstd],
                                            axis=1)
                else:
                    pdparam = U.dense(last_out,
                                      pdtype.param_shape()[0], "polfinal",
                                      U.normc_initializer(0.01))
                self.pd = pdtype.pdfromflat(pdparam)
            else:
                dropout_networks = [last_out] * mc_samples
                dropout_networks = generate_dropout_layer(
                    lambda x: x, dropout_networks, 1.0)

                for i in range(num_hid_layers):
                    last_out = activation_actor(
                        tf.layers.dense(
                            last_out,
                            hid_size_actor,
                            activation=None,
                            name="polfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0),
                            bias_initializer=tf.zeros_initializer()))
                    apply_layer = lambda x: activation_actor(
                        tf.layers.dense(
                            x,
                            hid_size_actor,
                            activation=None,
                            name="polfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0),
                            bias_initializer=tf.zeros_initializer(),
                            reuse=True))
                    dropout_networks = generate_dropout_layer(
                        apply_layer, dropout_networks, pol_keep_prob)

                net = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name="polfinal",
                    activation=None,
                    kernel_initializer=U.normc_initializer(0.01))
                apply_layer = lambda x: tf.layers.dense(
                    x,
                    pdtype.param_shape()[0] // 2,
                    activation=None,
                    name="polfinal",
                    kernel_initializer=U.normc_initializer(0.01),
                    reuse=True)
                dropout_networks = generate_dropout_layer(
                    apply_layer, dropout_networks, pol_keep_prob)

                self.pd = pdtype.pdfromflat(dropout_networks, tau)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        last_out = obz

        ### MAIN CHANGES
        ## if dropout:

        if dropout_on_V:
            vact = self.vpred_mc_mean
        else:
            vact = self.vpred

        if dropout_on_policy:
            self._actsfunc = [
                U.function([ob], [x, vact]) for x in dropout_networks
            ]

            self._act = self.dropout_act
        else:
            self._actfunc = U.function([stochastic, ob], [ac, vact])
            self._act = self.reg_act
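
The MC-dropout value function and policy above lean on a `generate_dropout_layer` helper that is not shown in this snippet. Below is a hedged sketch of what such a helper might look like; the signature and the use of TF1-style `tf.nn.dropout` with `keep_prob` are assumptions, not the repository's actual implementation.

import tensorflow as tf

def generate_dropout_layer(apply_layer, dropout_networks, keep_prob):
    # Apply the weight-sharing layer to every MC sample, then an independent dropout mask.
    new_networks = []
    for net in dropout_networks:
        net = apply_layer(net)
        net = tf.nn.dropout(net, keep_prob=keep_prob)  # TF1-style keep_prob argument
        new_networks.append(net)
    return new_networks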
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # define action and observation space
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # implement Q-function approximation
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        # return the Q-function value
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # implement parametrizatzion for policy over options
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # return probabilities for the options
        self.op_pi = tf.nn.softmax(last_out)

        # always terminate
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.constant([True])

        # define the control policy / intra-option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            # clip the mean to [-1, 1] using ReLUs (see the short numeric check after this snippet)
            mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # sample stochastically -> this corresponds to exploration
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # choose the appropriate action, apply the ZOH if using option 0
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
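
A quick numeric check (plain NumPy, not part of the original graph) that the ReLU expression used for the action mean above behaves exactly like a hard clip to [-1, 1]:

import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

m = np.array([-2.0, -0.5, 0.0, 0.7, 3.0])
squashed = -relu(-(m - 1)) + relu(-(m + 1)) + 1
print(np.allclose(squashed, np.clip(m, -1.0, 1.0)))  # True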
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # init
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # return Q-function value
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # policy over options:
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)

        # in addition to the softmax, also define self.op_pi_orig as the difference between the two option outputs
        self.op_pi_orig = last_out0 - last_out1  #tf.math.subtract(last_out0,last_out1)
        self.op_pi = tf.nn.softmax(last_out)

        # still always terminate
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        termination_sample = tf.constant([True])

        # choose the appropriate action
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            # squash the mean to [-1, 1] with the ReLU-based clip (the tanh variant is kept below, commented out)
            mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
            #mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # keep the variable which only incorporates the mean
        ac_mean = mean
        ac_mean = U.switch(option[0], ac_mean,
                           tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        self.ac_mean = tf.clip_by_value(ac_mean, -1.0, 1.0)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        #ac = tf.Print (ac, [ac,option,ob], "action and option before selecting: ")
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        #ac = tf.Print (ac, [ac], "action after selection: ")
        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])

        # additional functions that return the action mean and the special values for the policy over options
        self._act_mean = U.function([ob, option], [ac_mean])
        self._get_op_orig = U.function([ob], [self.op_pi_orig])
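
A hedged rollout sketch combining the helper functions exposed above. `pi` and `env` are assumed names, and sampling the option directly from `op_pi` is only illustrative; the actual training code may select options differently.

import numpy as np

ob = env.reset()
op_probs = pi._get_op(ob[None])[0][0]                  # policy over options, shape (num_options,)
option = np.random.choice(len(op_probs), p=op_probs)   # illustrative option selection
acs, vpred, _, _ = pi._act(True, ob[None], [option])   # stochastic intra-option action
ob, rew, done, _ = env.step(acs[0])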
Beispiel #26
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2,dc=0, kind='small'):
        assert isinstance(ob_space, gym.spaces.Box)

        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        option =  U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError


        # Network to compute value function and termination probabilities
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = x
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]

        self.vpred_ent = dense3D2(last_out, 1, "vffinal_ent", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]

        self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:,0]
        termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))


        # Network to compute policy over options and intra_option policies
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Discrete):
        #     mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        #     pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        # else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)


        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob, option], [ac, self.vpred, self.vpred_ent, last_out])
        self._get_logits = U.function([stochastic, ob, option], [self.pd.logits] )


        self._get_v = U.function([ob, option], [self.vpred])
        self._get_v_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self.get_vpred_ent = U.function([ob, option], [self.vpred_ent]) # Entropy value estimate
        self._get_op = U.function([ob], [self.op_pi])
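
A small NumPy analogue (illustration only) of the termination sampling above: comparing `tpred` against a uniform draw terminates each batch element with probability `tpred`.

import numpy as np

tpred = np.array([0.1, 0.9])                             # termination probabilities per sample
terminate = tpred > np.random.uniform(size=tpred.shape)  # True with probability tpred[i],
                                                         # mirroring tf.greater(tpred, tf.random_uniform(...))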
Beispiel #27
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            alpha = tf.nn.softplus(
                U.dense(last_out,
                        ac_space.high.size,
                        'polfc_alpha',
                        weight_init=U.normc_initializer(0.001))) + 1.0
            beta = tf.nn.softplus(
                U.dense(last_out,
                        ac_space.high.size,
                        'polfc_beta',
                        weight_init=U.normc_initializer(0.001))) + 1.0
        else:
            raise NotImplementedError

        self.pd = tfp.distributions.Beta(alpha, beta)

        self.state_in = []
        self.state_out = []

        # compute sampled action
        sampled_action = self.pd.sample()

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, sampled_action, self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
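
A quick numeric illustration (plain NumPy, not part of the original graph) of why the `softplus(x) + 1.0` heads above keep both Beta parameters strictly greater than 1, which keeps the Beta distribution unimodal with a well-defined mode:

import numpy as np

x = np.array([-5.0, 0.0, 5.0])
alpha = np.log1p(np.exp(x)) + 1.0   # softplus(x) + 1
print(alpha)                        # ~[1.0067, 1.6931, 6.0067] -- always > 1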
Beispiel #28
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              bound_by_sigmoid=False,
              sigmoid_coef=1.,
              activation='tanh',
              normalize_obs=True,
              actions='gaussian',
              avg_norm_symmetry=False,
              symmetric_interpretation=False,
              stdclip=5.0,
              gaussian_bias=False,
              gaussian_from_binary=False,
              parallel_value=False,
              pv_layers=2,
              pv_hid_size=512,
              three=False):
        assert isinstance(ob_space, gym.spaces.Box)

        if actions == 'binary':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'beta':
            self.pdtype = pdtype = BetaPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'bernoulli':
            self.pdtype = pdtype = BernoulliPdType(ac_space.low.size)
        elif actions == 'gaussian':
            self.pdtype = pdtype = make_pdtype(ac_space)
        elif actions == 'cat_3':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
        elif actions == 'cat_5':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
        else:
            assert False

        sequence_length = None

        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

        if normalize_obs:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            if avg_norm_symmetry:
                # Warning: works only for the standard observations (41 numbers)
                ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) +
                           self.ob_rms.mean) / 2
                ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) +
                          self.ob_rms.std) / 2  # Pretty crude
            else:
                ob_mean = self.ob_rms.mean
                ob_std = self.ob_rms.std

            obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip,
                                   stdclip)

            #obz = tf.Print(obz, [self.ob_rms.mean], message='rms_mean', summarize=41)
            #obz = tf.Print(obz, [self.ob_rms.std], message='rms_std', summarize=41)
        else:
            obz = self.ob

        vpreds = []
        pparams = []

        for part in range(1 if not three else 3):
            part_prefix = "" if part == 0 else "part_" + str(part)

            # Predicted value
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            part_prefix + "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))

            vpreds.append(
                U.dense(last_out,
                        1,
                        part_prefix + "vffinal",
                        weight_init=U.normc_initializer(1.0)))
            vpreds[-1] = vpreds[-1][:, 0]

            if parallel_value:
                last_out_2 = obz
                for i in range(pv_layers):
                    last_out_2 = tf.nn.tanh(
                        U.dense(last_out_2,
                                pv_hid_size,
                                part_prefix + "pv_vffc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0)))
                last_out_2 = U.dense(last_out_2,
                                     1,
                                     part_prefix + "pv_vffinal",
                                     weight_init=U.normc_initializer(1.0))
                vpreds[-1] += last_out_2[:, 0]

            last_out = obz
            if activation == 'tanh': activation = tf.nn.tanh
            elif activation == 'relu': activation = tf.nn.relu
            for i in range(num_hid_layers):
                dense = U.dense(last_out,
                                hid_size,
                                part_prefix + "polfc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0))
                last_out = activation(dense)

            if actions == 'gaussian':
                if gaussian_fixed_var:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    if bound_by_sigmoid:
                        mean = tf.nn.sigmoid(mean * sigmoid_coef)
                    logstd = tf.get_variable(
                        name=part_prefix + "logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    logstd = mean * 0.0 + logstd
                else:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    logstd = U.dense(last_out,
                                     pdtype.param_shape()[0] // 2,
                                     part_prefix + "polfinal_2",
                                     U.normc_initializer(0.01))
                if gaussian_bias:
                    mean = mean + 0.5

                pdparam = U.concatenate([mean, logstd], axis=1)
            elif actions == 'beta':
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "beta_lastlayer",
                                  U.normc_initializer(0.01))
                pdparam = tf.nn.softplus(pdparam)
            elif actions in ['bernoulli', 'binary']:
                if bound_by_sigmoid:
                    raise NotImplementedError(
                        "bound by sigmoid not implemented here")
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "polfinal",
                                  U.normc_initializer(0.01))
            elif actions in ['cat_3']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat3_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            elif actions in ['cat_5']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat5_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            else:
                assert False

            pparams.append(pdparam)

        pparams = tf.stack(pparams)
        vpreds = tf.stack(vpreds)
        pparams = tf.transpose(pparams,
                               perm=(1, 0, 2))  # [batchsize, networks, values]
        vpreds = tf.transpose(vpreds,
                              perm=(1, 0))  # [batchsize, networks]

        self.stochastic = tf.placeholder(name="stochastic",
                                         dtype=tf.bool,
                                         shape=())

        if three:
            batchsize = tf.shape(pdparam)[0]
            NO_OBSTACLES_ID = 5
            OBST_DIST = [278, 279, 280, 281, 282, 283, 284,
                         285]  # TODO: Alternative approach
            distances = [self.ob[:, i] for i in OBST_DIST]
            distances = tf.stack(distances, axis=1)
            no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0),
                                   tf.int32)
            distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1),
                                tf.int32)
            no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
            begin = tf.cast(tf.less(self.st, 75), tf.int32)
            take_id = (1 - begin) * (
                1 + no_obstacles_ahead
            )  # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead

            take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
            pdparam = tf.gather_nd(pparams, take_id)

            self.vpred = tf.gather_nd(vpreds, take_id)
            #self.vpred = tf.Print(self.vpred, [take_id])
        else:
            self.vpred = vpreds[:, 0]
            pdparam = pparams[:, 0]

        self.pd = pdtype.pdfromflat(pdparam)

        if hasattr(self.pd, 'real_mean'):
            real_mean = self.pd.real_mean()
            ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
        else:
            ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([self.stochastic, self.ob, self.st],
                               [ac, self.vpred, ob_mean, ob_std])

        if actions == 'binary':
            self._binary_f = U.function([self.stochastic, self.ob, self.st],
                                        [ac, self.pd.flat, self.vpred])
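
A standalone sketch (with made-up shapes, hedged) of the `tf.gather_nd` indexing used in the `three` branch above, which picks one sub-network's parameters per batch element:

import tensorflow as tf

pparams = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])  # [batch, networks, values]
take_id = tf.constant([[0, 1], [1, 2]])    # row i selects network take_id[i][1] for batch element i
selected = tf.gather_nd(pparams, take_id)  # shape [2, 4]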
    def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers, V_keep_prob,
              mc_samples, layer_norm, activation_critic, activation_actor, dropout_on_V,
              gaussian_fixed_var=True, sample_dropout=False):
        assert isinstance(ob_space, gym.spaces.Box)
        self.sample_dropout = sample_dropout

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        
        
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        
        
        
        self.mc_samples = mc_samples
        self.V_keep_prob = V_keep_prob
        
        ### MAIN CHANGES
        #######################
        # Value function  

      
        with tf.variable_scope("value_function"):
            
            dropout_networks = [last_out] * self.mc_samples
           # dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.V_keep_prob)
            
            for i in range(num_hid_layers):
                if layer_norm:
                    last_out = activation_critic(tc.layers.layer_norm(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)), center=True,scope="vffc_activation%i"%(i+1) ,scale=True))
                    
                    apply_layer = lambda x : activation_critic(tc.layers.layer_norm(tf.layers.dense(x, hid_size_V,name="vffc%i"%(i+1), 
                                        reuse=True) ,center=True,scope="vffc_activation%i"%(i+1) ,scale=True,reuse=True) )
                else:
                    last_out = activation_critic(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)))
                    
                    apply_layer = lambda x : activation_critic(tf.layers.dense(x, hid_size_V,name="vffc%i"%(i+1), 
                                        reuse=True))
               
                dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.V_keep_prob)
            
            ## final layer
            self.vpred = tf.layers.dense(last_out, 1, name="vffinal", kernel_initializer=U.normc_initializer(1.0))[:,0]
            
            apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
                        name="vffinal", reuse=True)[:,0]
            dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.V_keep_prob)
            
            mean, variance = tf.nn.moments(tf.stack(dropout_networks), 0)

            self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
            self.vpred_dropout_networks = dropout_networks

            self.variance = variance
            LAMBDA = tf.placeholder(dtype=tf.float32, shape=())
            self.v_lambda_variance = self.vpred_mc_mean + LAMBDA * tf.sqrt(variance)
         
            

            
        #######################    
        ## Policy
        last_out = obz
      
        with tf.variable_scope("policy"):
            for i in range(num_hid_layers):
                
                last_out = U.dense(last_out, hid_size_actor, "polfc%i"%(i+1), \
                weight_init=U.normc_initializer(1.0)) 
                last_out = activation_actor(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
       
        
        
        last_out = obz
        
        ## Building the action-value function Q(s,a)
        
#        last_out2=self.pd.sample()
#        activation=tf.nn.relu
#        #######################
#        # Action Value function  
#        with tf.variable_scope("Q"):        
#            dropout_networks = [last_out] * self.mc_samples
#            dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.keep_prob)
#                 
#            ## concatenate state and action
#            last_out = tf.concat([last_out, last_out2], axis=-1)
#            
#            new_networks = []
#            for dropout_network in dropout_networks:
#                dropout_network = tf.concat([dropout_network, last_out2], axis=-1)
#                dropout_network, mask = U.bayes_dropout(dropout_network, self.keep_prob)
#                new_networks.append(dropout_network)
#            dropout_networks = new_networks
#            
#            ### hidden layers
#            for i in range(num_hid_layers):
#                
#                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="Q%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
#                apply_layer = lambda x : activation(tf.layers.dense(x, hid_size, activation=None, \
#                        name="Q%i"%(i+1), reuse=True))
#                dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.keep_prob)
#            
#            ## final layer
#            self.qpred = tf.layers.dense(last_out, 1, name="Qfinal", kernel_initializer=U.normc_initializer(1.0))[:,0]
#            
#            apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
#                        name="Qfinal", reuse=True)[:,0]
#            dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.keep_prob)
#            
#            self.qpred_mc_mean=tf.add_n(dropout_networks) / float(len(dropout_networks))
#            self.qpred_dropout_networks=dropout_networks
        
        
        
        
        ### MAIN CHANGES
        ## if dropout:
        if dropout_on_V:
            if self.sample_dropout:
                self._act = [U.function([stochastic, ob], [ac, x]) for x in dropout_networks]
            else:
                self._act = U.function([stochastic, ob], [ac, self.vpred_mc_mean])
                       


            
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred])
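
A hedged NumPy sketch of the uncertainty-aware value combination built above (`vpred_mc_mean + LAMBDA * sqrt(variance)`); the numbers are made up and only illustrate the idea.

import numpy as np

samples = np.array([1.00, 1.20, 0.85, 1.10])     # MC-dropout value estimates for one state
lam = 0.5
v_lambda = samples.mean() + lam * samples.std()  # mean plus a variance-based bonus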
Beispiel #30
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # Define the dimensions
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # implementation of the Q-function:
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        # value of the (state, option) pair -> denoted as the Q-function in the report
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # Implementation of the policy over options:
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # this is the output of the policy over options:
        self.op_pi = tf.nn.softmax(last_out)

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]

        # Always terminate
        termination_sample = tf.constant([True])

        # compute the control action: implementation of the intra-option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # if stochastic is true, we sample around the mean; this corresponds to exploration at the action level
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # determine the control action to apply; in the ZOH case (option 0) just reuse u[k-1]
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
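
A hedged illustration of the zero-order-hold branch above: when option 0 is active, the applied action is simply u[k-1], stored in the last `ac_space_dim` entries of the observation. `policy_sample` below is a placeholder name for the sampled intra-option action.

import numpy as np

ac_dim = 2
ob = np.array([0.30, -0.10, 0.50, 0.25, -0.75])   # last two entries hold u[k-1]
policy_sample = np.array([0.40, 0.10])            # placeholder for the sampled action
option = 0
ac = policy_sample if option == 1 else ob[-ac_dim:]
ac = np.clip(ac, -1.0, 1.0)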
Beispiel #31
0
    def __init__(self, ob_dim, ac_dim, hid_size=128, num_hid_layers=2):
        # Here we'll construct a bunch of expressions, which will be used in two places:
        # (1) When sampling actions
        # (2) When computing loss functions, for the policy update
        # Variables specific to (1) have the word "sampled" in them,
        # whereas variables specific to (2) have the word "old" in them
        ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2],
                               name="ob")  # batch of observations
        oldac_na = tf.placeholder(
            tf.float32, shape=[None, ac_dim],
            name="ac")  # batch of actions previous actions
        oldac_dist = tf.placeholder(
            tf.float32, shape=[None, ac_dim * 2], name="oldac_dist"
        )  # batch of previous action distributions
        adv_n = tf.placeholder(tf.float32, shape=[None],
                               name="adv")  # advantage function estimate
        wd_dict = {}

        last_out = ob_no
        for i in range(num_hid_layers):
            last_out = tf.nn.selu(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(
                            1.0)))  # bias_init=0.0, weight_loss_dict=wd_dict
        mean_na = dense(last_out,
                        ac_dim,
                        "mean",
                        weight_init=U.normc_initializer(0.1),
                        bias_init=0.0,
                        weight_loss_dict=wd_dict)  # Mean control output

        self.wd_dict = wd_dict
        self.logstd_1a = logstd_1a = tf.get_variable(
            "logstd", [ac_dim], tf.float32,
            tf.zeros_initializer())  # Log std-dev of the outputs
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        std_1a = tf.exp(logstd_1a)
        std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
        ac_dist = tf.concat([
            tf.reshape(mean_na, [-1, ac_dim]),
            tf.reshape(std_na, [-1, ac_dim])
        ], 1)
        sampled_ac_na = (tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:]
                         + ac_dist[:, :ac_dim])  # This is the sampled action we'll perform.

        logprobsampled_n = (-U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1)
                            - 0.5 * tf.log(2.0 * np.pi) * ac_dim
                            - 0.5 * U.sum(tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) /
                                          (tf.square(ac_dist[:, ac_dim:])), axis=1))  # Logprob of sampled action
        logprob_n = (-U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1)
                     - 0.5 * tf.log(2.0 * np.pi) * ac_dim
                     - 0.5 * U.sum(tf.square(ac_dist[:, :ac_dim] - oldac_na) /
                                   (tf.square(ac_dist[:, ac_dim:])), axis=1))  # Logprob of previous actions under the CURRENT policy (oldlogprob_n would be under the OLD policy)
        kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))

        #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
        surr = -U.mean(
            adv_n * logprob_n
        )  # Loss function that we'll differentiate to get the policy gradient
        surr_sampled = -U.mean(logprob_n)  # Sampled loss of the policy
        self._act = U.function(
            [ob_no],
            [sampled_ac_na, ac_dist, logprobsampled_n])  # Generate a new action and its logprob
        #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
        self.compute_kl = U.function([ob_no, oldac_dist], kl)
        self.update_info = (
            (ob_no, oldac_na, adv_n), surr, surr_sampled
        )  # Input and output variables needed for computing loss
        U.initialize()  # Initialize uninitialized TF variables
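kl_div is not shown in this snippet; given that oldac_dist and ac_dist are packed as [mean | std] rows, it presumably evaluates the closed-form KL divergence between two diagonal Gaussians. A minimal NumPy sketch of that formula under the same packing assumption (a stand-in, not the library's actual implementation):

import numpy as np

def kl_diag_gaussians(old_dist, new_dist, ac_dim):
    """KL(old || new) for diagonal Gaussians packed as [mean | std] per row.

    Sketch only: assumes the packing convention of ac_dist above and is not
    the kl_div helper used in the snippet.
    """
    mu_p, std_p = old_dist[:, :ac_dim], old_dist[:, ac_dim:]
    mu_q, std_q = new_dist[:, :ac_dim], new_dist[:, ac_dim:]
    var_p, var_q = np.square(std_p), np.square(std_q)
    kl_per_dim = (np.log(std_q / std_p)
                  + (var_p + np.square(mu_p - mu_q)) / (2.0 * var_q)
                  - 0.5)
    return kl_per_dim.sum(axis=1)  # one KL value per batch row

# Identical distributions give KL = 0 (means [0, 0], stds [1, 1]).
old = np.array([[0.0, 0.0, 1.0, 1.0]])
new = np.array([[0.0, 0.0, 1.0, 1.0]])
assert np.allclose(kl_diag_gaussians(old, new, ac_dim=2), 0.0)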
Beispiel #32
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        # Number of discrete bins per action dimension, inferred from the
        # integer action-space bounds; all dimensions are assumed to share it.
        bins = ac_space.high[0] - ac_space.low[0] + 1
        print('making policy with {} bins per action dimension'.format(bins))
        assert bins > 0
        act_dim = len(ac_space.high)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # The fixed-variance Gaussian head is intentionally disabled for this
            # policy; the code below the raise is unreachable and kept for reference.
            raise NotImplementedError
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            m = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))
            # of size [batchsize, num-actions*bins], initialized to be about uniform
            norm_softm = tf.nn.sigmoid(m)
            # of size [batchsize, num-actions, bins], initialized to be about uniform
            norm_softm = tf.reshape(norm_softm, [-1, act_dim, bins])

            norm_softm_tiled = tf.tile(tf.expand_dims(norm_softm, axis=-1),
                                       [1, 1, 1, bins])

            # construct the mask
            am_numpy = construct_mask(bins)
            am_tf = tf.constant(am_numpy, dtype=tf.float32)

            # construct pdparam
            pdparam = tf.reduce_sum(
                tf.math.log(norm_softm_tiled + 1e-8) * am_tf +
                tf.math.log(1 - norm_softm_tiled + 1e-8) * (1 - am_tf),
                axis=-1)
            pdparam = tf.reshape(pdparam, [-1, act_dim * bins])

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
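The else-branch above turns the per-bin sigmoid outputs into flat distribution parameters by combining their log-probabilities with a fixed [bins, bins] 0/1 mask from construct_mask, which is not shown in this snippet. Below is a NumPy sketch of just that masked-sum step, with the mask left as a parameter; the lower-triangular matrix used in the shape check is only a placeholder assumption, not necessarily what construct_mask returns.

import numpy as np

def pdparam_from_sigmoids(norm_softm, mask):
    """NumPy mirror of the masked log-sigmoid construction above.

    norm_softm: [batch, act_dim, bins] sigmoid outputs.
    mask:       [bins, bins] 0/1 matrix standing in for construct_mask(bins),
                whose real definition is not shown in the snippet.
    Returns:    [batch, act_dim * bins] flat parameters for pdtype.pdfromflat.
    """
    batch, act_dim, bins = norm_softm.shape
    tiled = np.tile(norm_softm[..., None], (1, 1, 1, bins))  # [batch, act_dim, bins, bins]
    logits = (np.log(tiled + 1e-8) * mask +
              np.log(1.0 - tiled + 1e-8) * (1.0 - mask)).sum(axis=-1)
    return logits.reshape(batch, act_dim * bins)

# Shape check with illustrative sizes (batch=2, act_dim=3, bins=5) and a
# placeholder lower-triangular mask.
mask = np.tril(np.ones((5, 5), dtype=np.float32))
sig = 1.0 / (1.0 + np.exp(-np.random.randn(2, 3, 5)))
assert pdparam_from_sigmoids(sig, mask).shape == (2, 15)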