Example #1
0
    def __init__(self, name, observation_shape, hid_size, num_hid_layers):
        """Build a value network inside the variable scope ``name``.

        The observation placeholder is normalised with the running mean/std
        held in ``self.ob_rms``, clipped to [-5, 5], and passed through
        ``num_hid_layers`` tanh dense layers of width ``hid_size``.  Column 0
        of a final single-unit dense layer is stored as ``self.vpred`` and
        ``self.predict`` wraps it as a callable over the observation input.
        """
        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            ob_in = U.get_placeholder(
                name='ob',
                dtype=tf.float32,
                shape=[None] + list(observation_shape))

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('vf'):
                whitened = (ob_in - self.ob_rms.mean) / self.ob_rms.std
                hidden = tf.clip_by_value(whitened, -5.0, 5.0)
                # Layers are named fc1, fc2, ... in creation order.
                for layer_no in range(1, num_hid_layers + 1):
                    pre_activation = tf.layers.dense(
                        hidden,
                        hid_size,
                        name='fc%i' % layer_no,
                        kernel_initializer=U.normc_initializer(1.0))
                    hidden = tf.nn.tanh(pre_activation)
                value_column = tf.layers.dense(
                    hidden,
                    1,
                    name='final',
                    kernel_initializer=U.normc_initializer(1.0))
                self.vpred = value_column[:, 0]

            self.predict = U.function([ob_in], self.vpred)
Example #2
0
    def _init(self, ob_space, ac_space, kind):
        """Build a convolutional policy/value graph of the requested size.

        ``kind`` selects the conv stack: ``'small'`` (two conv layers, 256
        hidden units) or ``'large'`` (three conv layers, 512 hidden units).
        Any other value raises ``NotImplementedError``.  Both heads (logits
        and value) read the same flattened feature vector.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        batch_dim = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[batch_dim] + list(ob_space.shape))

        feats = ob / 255.0
        # Each spec is (filters, scope name, kernel size, stride).
        if kind == 'small':  # from A3C paper
            conv_specs = [(16, "l1", [8, 8], [4, 4]),
                          (32, "l2", [4, 4], [2, 2])]
            fc_units = 256
        elif kind == 'large':  # Nature DQN
            conv_specs = [(32, "l1", [8, 8], [4, 4]),
                          (64, "l2", [4, 4], [2, 2]),
                          (64, "l3", [3, 3], [1, 1])]
            fc_units = 512
        else:
            raise NotImplementedError

        for n_filters, scope, kernel, stride in conv_specs:
            feats = tf.nn.relu(U.conv2d(feats, n_filters, scope, kernel, stride, pad="VALID"))
        feats = U.flattenallbut0(feats)
        feats = tf.nn.relu(U.dense(feats, fc_units, 'lin', U.normc_initializer(1.0)))

        logits = U.dense(feats, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        value_out = U.dense(feats, 1, "value", U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        self.state_in = []
        self.state_out = []

        # The flag is an input of the act function, but the action below is
        # always sampled and never switches on it.
        stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #3
0
    def build_graph(self, obs_ph, acs_ph, reuse=False):
        """Build the conv network that scores (observation, action) pairs.

        Args:
            obs_ph: observation tensor fed to the convolutional stack.
            acs_ph: action tensor concatenated (axis 1) to the 512-unit
                feature vector before the output layer.
            reuse: when true, variables already created under ``self.scope``
                are reused instead of created.

        Returns:
            The output of the final single-unit dense layer (logits).
        """
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(shape=self.observation_shape)
            # Standardise the observations: subtract the running mean first,
            # then divide by the running std.  The previous parenthesisation
            # computed obs_ph - (mean / std) instead.
            obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

            last_out = obs
            last_out = tf.nn.tanh(
                U.conv2d(last_out, 64, 'vfconv1', (7, 7), (3, 3), pad='VALID'))
            last_out = tf.nn.tanh(
                U.conv2d(last_out, 64, 'vfconv2', (5, 5), (2, 2), pad='VALID'))
            last_out = tf.nn.tanh(
                U.conv2d(last_out, 64, 'vfconv3', (3, 3), (1, 1), pad='VALID'))
            last_out = tf.nn.tanh(
                U.conv2d(last_out, 64, 'vfconv4', (3, 3), (1, 1), pad='VALID'))
            # Flatten the conv output; the flattened width is hard-coded.
            last_out = tf.reshape(last_out, tf.convert_to_tensor([-1,
                                                                  784 * 4]))
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                512,
                                kernel_initializer=U.normc_initializer(1.0)))
            last_out = tf.concat([last_out, acs_ph], axis=1)
            # NOTE(review): adding the scalar self.num_actions to every input
            # feature looks unintended -- confirm before changing it.
            logits = tf.layers.dense(
                last_out + self.num_actions,
                1,
                kernel_initializer=U.normc_initializer(1.0))
        return logits
Example #4
0
    def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
        """Build a two-hidden-layer ELU value network trained with K-FAC.

        Args:
            ob_dim: the input placeholder has ``ob_dim*2+2`` features per row.
            ac_dim: not used by this constructor (hence the pylint pragma).

        Creates ``self._predict`` (value prediction from inputs),
        ``self.do_update`` (one optimizer step from inputs and value
        targets) and ``self.q_runner`` (returned by ``optim.minimize``).
        """
        #X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
        # Prediction plus unit Gaussian noise; it is the target of the
        # "sampled" loss handed to the optimizer below.
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        # ``async`` is a reserved word from Python 3.7 on, so spelling it as a
        # literal keyword argument is a SyntaxError there.  Unpacking a dict
        # passes exactly the same keyword and keeps this module importable.
        # NOTE(review): if KfacOptimizer has renamed the parameter (e.g. to
        # ``async_``), update the key accordingly.
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                                   clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                                   kfac_update=2, cold_iter=50,
                                   weight_decay_dict=wd_dict, max_grad_norm=None,
                                   **{"async": 1})
        # Only variables whose name contains "vf" are optimised.
        vf_var_list = [var for var in tf.trainable_variables() if "vf" in var.name]

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables
Example #5
0
    def _init(self, ob_space, ac_space):
        """Build a policy/value graph over a three-part integer observation.

        The observation is split along axis 1 into 16 "next block" entries
        and two grids of 12 * 6 entries each.  The next-block part is one-hot
        encoded (depth 5) and fed through two dense layers; both grids go
        through ``_grid_cnn`` under the same ``'grids'`` variable scope, the
        second call with ``reuse=True``.  The three feature vectors are
        concatenated and feed the logits and value heads.
        """
        self.pdtype = distributions.make_pdtype(ac_space)

        ob = U.get_placeholder(name='ob', dtype=tf.int32, shape=[None] + list(ob_space.shape))
        grid_cells = 12 * 6
        next_blocks, my_grid, opp_grid = tf.split(ob, [16, grid_cells, grid_cells], axis=1)

        def leaky_dense(inputs, units, layer_name):
            # Dense layer followed by a leaky ReLU with slope 0.1.
            pre_activation = tf.layers.dense(
                inputs, units, name=layer_name,
                kernel_initializer=U.normc_initializer(1.0))
            return tf.nn.leaky_relu(pre_activation, alpha=0.1)

        with tf.variable_scope('next_blocks'):
            block_feats = tf.one_hot(next_blocks, depth=5)
            block_feats = U.flattenallbut0(block_feats)
            block_feats = leaky_dense(block_feats, 12, 'l1')
            block_feats = leaky_dense(block_feats, 12, 'l2')

        with tf.variable_scope('grids', reuse=False):
            own_feats = _grid_cnn(my_grid)

        with tf.variable_scope('grids', reuse=True):
            opp_feats = _grid_cnn(opp_grid)

        joint = tf.concat([block_feats, own_feats, opp_feats], axis=1)
        joint = leaky_dense(joint, 64, 'lin')

        logits = tf.layers.dense(
            joint, self.pdtype.param_shape()[0], name='logits',
            kernel_initializer=U.normc_initializer(0.01))
        self.pd = self.pdtype.pdfromflat(logits)
        value_out = tf.layers.dense(
            joint, 1, name='value',
            kernel_initializer=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    def _init(self, ob_space, ac_space):
        """Build separate convolutional policy and value networks.

        The observation is divided by 255 and fed to two identical conv
        trunks that live in the ``"pol"`` and ``"vf"`` variable scopes, so
        they do not share weights.  ``self.vpred`` (also exposed as
        ``self.vpredz``) is the raw output of the single-unit value layer.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        obscaled = ob / 255.0

        def conv_trunk(inputs):
            # Two conv layers, flatten, then a 128-unit dense layer; variables
            # are created in whichever variable scope is active at call time.
            h = tf.nn.relu(U.conv2d(inputs, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            h = tf.nn.relu(U.conv2d(h, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            h = U.flattenallbut0(h)
            return tf.nn.relu(U.dense(h, 128, 'lin', U.normc_initializer(1.0)))

        with tf.variable_scope("pol"):
            pol_feats = conv_trunk(obscaled)
            logits = U.dense(pol_feats, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            vf_feats = conv_trunk(obscaled)
            self.vpred = U.dense(vf_feats, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        # The flag is an input of the act function, but the action below is
        # always sampled and never switches on it.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #7
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build an MLP policy and value function over filtered observations.

        Observations are normalised with ``self.ob_rms`` and clipped to
        [-5, 5].  Two independent tanh stacks (``vffc*`` and ``polfc*``) of
        ``num_hid_layers`` layers and width ``hid_size`` feed the value head
        and the policy head.  With ``gaussian_fixed_var`` and a Box action
        space, the log-std is a free variable instead of a network output.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        def tanh_stack(prefix):
            # Hidden layers named <prefix>fc1, <prefix>fc2, ... on top of obz.
            h = obz
            for idx in range(num_hid_layers):
                layer_name = "%sfc%i" % (prefix, idx + 1)
                h = tf.nn.tanh(U.dense(h, hid_size, layer_name, weight_init=U.normc_initializer(1.0)))
            return h

        vf_hidden = tanh_stack("vf")
        self.vpred = U.dense(vf_hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        pol_hidden = tanh_stack("pol")
        fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        if fixed_var:
            mean = U.dense(pol_hidden, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd expands logstd to the shape of mean.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(pol_hidden, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #8
0
    def _init(self, ob_space, ac_space, kind):
        """Build a convolutional policy/value graph of the requested size.

        ``kind`` is ``'small'`` (two conv layers, 256 hidden units) or
        ``'large'`` (three conv layers, 512 hidden units); anything else
        raises ``NotImplementedError``.  The logits and value heads share the
        flattened conv features.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        feats = ob / 255.0
        # Each spec is (filters, scope name, kernel size, stride).
        if kind == 'small':  # from A3C paper
            conv_specs = ((16, "l1", [8, 8], [4, 4]),
                          (32, "l2", [4, 4], [2, 2]))
            hidden_units = 256
        elif kind == 'large':  # Nature DQN
            conv_specs = ((32, "l1", [8, 8], [4, 4]),
                          (64, "l2", [4, 4], [2, 2]),
                          (64, "l3", [3, 3], [1, 1]))
            hidden_units = 512
        else:
            raise NotImplementedError

        for n_filters, scope, kernel, stride in conv_specs:
            feats = tf.nn.relu(U.conv2d(feats, n_filters, scope, kernel, stride, pad="VALID"))
        feats = U.flattenallbut0(feats)
        hidden = tf.layers.dense(feats, hidden_units, name='lin', kernel_initializer=U.normc_initializer(1.0))
        feats = tf.nn.relu(hidden)

        logits = tf.layers.dense(feats, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        value_out = tf.layers.dense(feats, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        self.state_in = []
        self.state_out = []

        # The flag is an input of the act function, but the action below is
        # always sampled and never switches on it.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #9
0
 def __init__(self, ob_dim, ac_dim):
     """Build a diagonal-Gaussian policy network and its surrogate losses.

     The graph serves two purposes: sampling actions (tensors with
     "sampled" in their name) and computing the losses for the policy
     update (which consume the "old" placeholders).  ``ac_dist`` holds the
     means in its first ``ac_dim`` columns and the standard deviations in
     the remaining ``ac_dim`` columns.
     """
     # Placeholders: observations, previous actions, previous action
     # distributions and advantage estimates.
     ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob")
     oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")
     oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist")
     adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")

     wd_dict = {}
     hidden1 = tf.nn.tanh(
         dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0),
               bias_init=0.0, weight_loss_dict=wd_dict))
     hidden2 = tf.nn.tanh(
         dense(hidden1, 64, "h2", weight_init=U.normc_initializer(1.0),
               bias_init=0.0, weight_loss_dict=wd_dict))
     # Mean control output.
     mean_na = dense(hidden2, ac_dim, "mean", weight_init=U.normc_initializer(0.1),
                     bias_init=0.0, weight_loss_dict=wd_dict)
     self.wd_dict = wd_dict

     # State-independent log standard deviation, one entry per action dim.
     self.logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())
     logstd_row = tf.expand_dims(self.logstd_1a, 0)
     std_row = tf.exp(logstd_row)
     std_na = tf.tile(std_row, [tf.shape(mean_na)[0], 1])
     ac_dist = tf.concat(
         [tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])],
         1)

     # Sampled action: noise * std + mean.
     sampled_ac_na = (
         tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:]
         + ac_dist[:,:ac_dim])
     # Log-probability of the sampled action.
     logprobsampled_n = (
         - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1)
         - 0.5 * tf.log(2.0*np.pi)*ac_dim
         - 0.5 * tf.reduce_sum(
             tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])),
             axis=1))
     # Log-probability of the previous actions under the CURRENT policy.
     logprob_n = (
         - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1)
         - 0.5 * tf.log(2.0*np.pi)*ac_dim
         - 0.5 * tf.reduce_sum(
             tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])),
             axis=1))

     kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
     # Loss that is differentiated to obtain the policy gradient.
     surr = - tf.reduce_mean(adv_n * logprob_n)
     # Sampled loss of the policy.
     surr_sampled = - tf.reduce_mean(logprob_n)

     # Generate a new action together with its distribution and log-prob.
     self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])
     self.compute_kl = U.function([ob_no, oldac_dist], kl)
     # Inputs and outputs needed for computing the loss.
     self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
     U.initialize()  # Initialize uninitialized TF variables
Example #10
0
    def _create_logit_value(self,
                            action_layer,
                            value_layer,
                            gaussian_fixed_var=False):
        """Attach the policy-distribution and value heads to the given layers.

        Sets ``self.pd``, ``self.ac`` (sample or mode, chosen by
        ``self.stochastic``) and ``self.vpred``.  With ``gaussian_fixed_var``
        and a Box action space, the network only outputs the mean and the
        log-std is a separate variable.
        """
        n_params = self.pdtype.param_shape()[0]
        learn_std_from_net = not (gaussian_fixed_var
                                  and isinstance(self.ac_space, gym.spaces.Box))

        # actor
        if learn_std_from_net:
            pdparam = U.dense(action_layer, n_params, "polfinal",
                              U.normc_initializer(0.01))
        else:
            half = n_params // 2
            mean = U.dense(action_layer, half, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, half],
                                     initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd expands logstd to the shape of mean.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = self.pdtype.pdfromflat(pdparam)
        self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        # critic
        value_out = U.dense(value_layer, 1, "vffinal",
                            weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]
Example #11
0
    def _create_network(self):
        """Build the actor and critic hidden layers and attach the heads.

        When ``self.ob_filter`` is set, the observation is normalised with a
        ``RunningMeanStd`` filter and clipped to [-5, 5]; otherwise it is used
        as is.  Actor (``a_1``, ``a_2``) and critic (``c_1``, ``c_2``) each
        get two 32-unit tanh layers, and the resulting tensors are passed to
        ``_create_logit_value``.
        """
        net_input = self.ob

        # create ob filter
        if self.ob_filter:
            self.ob_rms = RunningMeanStd(shape=self.ob_space.shape)
            normalised = (self.ob - self.ob_rms.mean) / self.ob_rms.std
            net_input = tf.clip_by_value(normalised, -5.0, 5.0)

        def two_tanh_layers(prefix):
            # Layers are named <prefix>_1 and <prefix>_2.
            h = net_input
            for suffix in ("_1", "_2"):
                h = tf.nn.tanh(
                    U.dense(h, 32, prefix + suffix,
                            weight_init=U.normc_initializer(1.0)))
            return h

        action_layer = two_tanh_layers("a")  # actor
        value_layer = two_tanh_layers("c")  # critic

        self._create_logit_value(action_layer, value_layer,
                                 self.gaussian_fixed_var)
Example #12
0
    def _init(self,
              in_dim,
              out_dim,
              hid_size,
              num_hid_layers,
              last_init_size=0.01,
              name='ff'):
        """Create the weight/bias pairs of a feed-forward network.

        ``self.params`` receives one ``[w, b]`` entry per hidden layer
        (named ``<name>1``, ``<name>2``, ...) followed by one entry for the
        output layer (``<name>_out``), whose weights are initialised with
        scale ``last_init_size``.
        """
        # state_dim: dimension of input/output state from previous/root encoder
        self.params = []
        self.num_hid_layers = num_hid_layers

        self.intin_dim = in_dim

        fan_in = in_dim
        for layer_no in range(1, num_hid_layers + 1):
            weight, bias = dense_params(fan_in,
                                        hid_size,
                                        name + "%i" % layer_no,
                                        weight_init=U.normc_initializer(1.0))
            self.params.append([weight, bias])
            fan_in = hid_size

        out_weight, out_bias = dense_params(
            fan_in,
            out_dim,
            name + "_out",
            weight_init=U.normc_initializer(last_init_size))
        self.params.append([out_weight, out_bias])
Example #13
0
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        """Build a CNN policy with separate conv towers for policy and value.

        The input placeholder has shape ``(nbatch,) + ob_space.shape``.  Both
        towers use three 3x3 SAME conv layers and a 512-unit dense layer; the
        tower with unprefixed layer names feeds the logits, the ``y``-prefixed
        tower feeds the value.  ``self.step`` and ``self.value`` run the graph
        in ``sess``.
        """
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape)  #obs
        print(ob_shape)

        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            '''
            h = conv(X, 'c1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h2 = conv(h, 'c2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h3 = conv(h2, 'c3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

            hh = conv(X, 'xc1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh2 = conv(hh, 'xc2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh3 = conv(hh2, 'xc3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
            hh3 = conv_to_fc(hh3)
            hh4 = fc(hh3, 'xfc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x:x, init_scale=0.01)
            vf = fc(hh4, 'v', 1, act=lambda x:x)[:,0]

            '''

            def conv_tower(prefix):
                # Three conv layers (<prefix>l1..l3) and a dense layer
                # (<prefix>lin), all reading the shared input placeholder.
                h = X
                for idx, n_filters in enumerate((32, 64, 128), start=1):
                    scope = "%sl%d" % (prefix, idx)
                    h = tf.nn.relu(
                        U.conv2d(h, n_filters, scope, [3, 3], [1, 1], pad="SAME"))
                h = U.flattenallbut0(h)
                return tf.nn.relu(
                    U.dense(h, 512, prefix + 'lin', U.normc_initializer(1.0)))

            pol_feats = conv_tower("")
            val_feats = conv_tower("y")

            pi = U.dense(pol_feats,
                         pdtype.param_shape()[0], "logits",
                         U.normc_initializer(0.01))
            vf = U.dense(val_feats, 1, "value", U.normc_initializer(1.0))[:, 0]

        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            # Sample an action and return it with its value and neg-log-prob.
            action, value_est, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return action, value_est, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            # Value estimate only.
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #14
0
    def _init(self,
              in_dim,
              out_dim,
              hid_size,
              num_hid_layers,
              last_init_size=0.01):
        """Create weight/bias/log-mask triples of a feed-forward network.

        The first layer takes ``in_dim - 1`` inputs.  ``self.params`` gets one
        ``[w, b, logmask]`` entry per hidden layer (``ff1``, ``ff2``, ...) and
        a final one for the output layer (``ff_out``).  Every ``logmask`` is a
        one-element variable initialised to -1.0.
        """
        # state_dim: dimension of input/output state from previous/root encoder
        self.params = []
        self.num_hid_layers = num_hid_layers

        self.intin_dim = in_dim - 1

        fan_in = in_dim - 1
        for layer_no in range(1, num_hid_layers + 1):
            weight, bias = dense_params(fan_in,
                                        hid_size,
                                        "ff%i" % layer_no,
                                        weight_init=U.normc_initializer(1.0))
            mask = tf.get_variable(name="logmask%i" % layer_no,
                                   shape=[1],
                                   initializer=tf.constant_initializer(-1.0))
            self.params.append([weight, bias, mask])
            fan_in = hid_size

        out_weight, out_bias = dense_params(
            fan_in,
            out_dim,
            "ff_out",
            weight_init=U.normc_initializer(last_init_size))
        out_mask = tf.get_variable(name="logmask_out",
                                   shape=[1],
                                   initializer=tf.constant_initializer(-1.0))
        self.params.append([out_weight, out_bias, out_mask])
Example #15
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              exploration_rate,
              gaussian_fixed_var=True):
        """Build a policy whose mean network is a pre-trained Keras model.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space used to pick the distribution type.
            hid_size: unused here (layer widths are fixed at 64).
            num_hid_layers: unused here (two hidden layers are always built).
            exploration_rate: constant used to initialise the ``logstd``
                variable.
            gaussian_fixed_var: with a Box action space, use the Keras model
                output as the mean and a free ``logstd`` variable; otherwise
                add a dense ``polfinal`` layer on top of the model output.

        The policy model loads its weights from the file "neural_kick".  A
        slice of the model output is also exported under the op name
        ``output_node``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        # Observation filtering is disabled: the raw observation is used.
        # with tf.variable_scope("obfilter"):
        #     self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz = ob

        valueFunction = Sequential()
        valueFunction.add(InputLayer(input_tensor=obz))
        valueFunction.add(Dense(64, activation='tanh'))
        valueFunction.add(Dense(64, activation='tanh'))

        self.vpred = self.dense(x=valueFunction.output,
                                size=1,
                                name="vffinal",
                                weight_init=U.normc_initializer(1.0),
                                bias=True)[:, 0]

        model = Sequential()
        model.add(InputLayer(input_tensor=obz))
        model.add(Dense(64, activation='tanh'))
        model.add(Dense(64, activation='tanh'))
        model.add(Dense(23))
        model.load_weights("neural_kick")

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = model.output
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.constant_initializer(exploration_rate))
            # mean * 0.0 + logstd expands logstd to the shape of mean.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            # tf.layers.dense takes (inputs, units, activation, use_bias, ...)
            # positionally, so the layer name and the initializer have to be
            # passed by keyword; positionally they were bound to `activation`
            # and `use_bias`.
            pdparam = tf.layers.dense(
                model.output,
                pdtype.param_shape()[0],
                name="polfinal",
                kernel_initializer=U.normc_initializer(0.01))
        # Slice the model output directly: `mean` is only bound in the
        # fixed-variance branch (where it is this very tensor), so using it
        # here raised NameError on the other branch.
        my_var = tf.strided_slice(model.output, [0], [1], [1], shrink_axis_mask=1)
        # Created only for its graph-side effect: a node named 'output_node'.
        tf.identity(my_var, name='output_node')
        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #16
0
 def __init__(self, ob_dim, ac_dim):
     """Build a diagonal-Gaussian policy network and its surrogate losses.

     ``ac_dist`` concatenates the means (first ``ac_dim`` columns) and the
     standard deviations (last ``ac_dim`` columns).  The constructor stores
     ``self._act`` (sample an action), ``self.compute_kl`` and
     ``self.update_info`` (inputs plus the two surrogate losses).
     """
     # Here we'll construct a bunch of expressions, which will be used in two places:
     # (1) When sampling actions
     # (2) When computing loss functions, for the policy update
     # Variables specific to (1) have the word "sampled" in them,
     # whereas variables specific to (2) have the word "old" in them
     ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
     oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
     oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
     adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
     # NOTE: this placeholder is only referenced by the commented-out KL
     # approximation below; it is not an input of any function built here.
     oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
     wd_dict = {}
     h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
     h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
     mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
     self.wd_dict = wd_dict
     self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Log standard deviation of the outputs
     logstd_1a = tf.expand_dims(logstd_1a, 0)
     std_1a = tf.exp(logstd_1a)
     # Repeat the single std row once per row of mean_na.
     std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
     ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
     sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
     logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
     logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
     kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
     #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
     surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
     # NOTE(review): built from logprob_n, not logprobsampled_n -- confirm this is intended.
     surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
     self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
     #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
     self.compute_kl = U.function([ob_no, oldac_dist], kl)
     self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
     U.initialize() # Initialize uninitialized TF variables
Example #17
0
 def img_encoder(self, x, kind):
     """Encode an image tensor with a stride-1 conv stack plus max pooling.

     ``kind`` is ``'small'`` (two pooled conv layers, 256 output units) or
     ``'large'`` (two pooled conv layers, one unpooled conv layer, 512
     output units); any other value raises ``NotImplementedError``.
     Returns the ReLU-activated dense features.
     """
     # Each spec is (filters, scope name, kernel size, pool size or None).
     if kind == 'small':  # from A3C paper
         conv_specs = [(16, "l1", [8, 8], 4),
                       (32, "l2", [4, 4], 2)]
         out_units = 256
     elif kind == 'large':  # Nature DQN
         conv_specs = [(32, "l1", [8, 8], 4),
                       (64, "l2", [4, 4], 2),
                       (64, "l3", [3, 3], None)]
         out_units = 512
     else:
         raise NotImplementedError

     for n_filters, scope, kernel, pool in conv_specs:
         x = tf.nn.relu(U.conv2d(x, n_filters, scope, kernel, [1, 1], pad="VALID"))
         if pool is not None:
             x = max_pool(x, pool)

     x = U.flattenallbut0(x)
     hidden = tf.layers.dense(x,
                              out_units,
                              name='lin',
                              kernel_initializer=U.normc_initializer(1.0))
     return tf.nn.relu(hidden)
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, use_actions):
        """Build an MLP that maps an input vector to a scalar reward.

        With ``use_actions`` the input width is
        ``ob_space.shape[0] + ac_space.shape[0]``; otherwise the input has the
        observation shape.  The input is normalised with ``self.inp_rms``,
        clipped to [-5, 5] and passed through ``num_hid_layers`` tanh layers
        of width ``hid_size``.  ``self._rew`` evaluates ``self.reward``.
        """
        assert isinstance(ob_space, gym.spaces.Box)
        self.use_actions = use_actions

        inp_shape = ob_space.shape
        if use_actions:
            inp_shape = (ob_space.shape[0] + ac_space.shape[0], )

        rew_input = U.get_placeholder(name="rew_input",
                                      dtype=tf.float32,
                                      shape=[None] + list(inp_shape))

        with tf.variable_scope("inputfilter"):
            self.inp_rms = RunningMeanStd(shape=inp_shape)

        with tf.variable_scope('rew'):
            normalised = (rew_input - self.inp_rms.mean) / self.inp_rms.std
            hidden = tf.clip_by_value(normalised, -5.0, 5.0)
            # Layers are named fc1, fc2, ... in creation order.
            for layer_no in range(1, num_hid_layers + 1):
                pre_activation = tf.layers.dense(
                    hidden,
                    hid_size,
                    name="fc%i" % layer_no,
                    kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(pre_activation)
            reward_column = tf.layers.dense(
                hidden,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))
            self.reward = reward_column[:, 0]

        self._rew = U.function([rew_input], [self.reward])
Example #19
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build an MLP policy and value function over filtered observations.

        The observation is normalised with ``self.ob_rms`` and clipped to
        [-5, 5].  The value head sits on the ``vffc*`` tanh layers and the
        policy head on the ``polfc*`` tanh layers.  With
        ``gaussian_fixed_var`` and a Box action space, the policy network
        outputs only the mean and ``logstd`` is a separate variable.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        ob_shape = [None] + list(ob_space.shape)
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        normalised = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(normalised, -5.0, 5.0)

        # Value branch.
        vf_hidden = obz
        for vf_name in ["vffc%i" % (k + 1) for k in range(num_hid_layers)]:
            vf_hidden = tf.nn.tanh(
                U.dense(vf_hidden, hid_size, vf_name,
                        weight_init=U.normc_initializer(1.0)))
        value_out = U.dense(vf_hidden, 1, "vffinal",
                            weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        # Policy branch.
        pol_hidden = obz
        for pol_name in ["polfc%i" % (k + 1) for k in range(num_hid_layers)]:
            pol_hidden = tf.nn.tanh(
                U.dense(pol_hidden, hid_size, pol_name,
                        weight_init=U.normc_initializer(1.0)))

        if not (gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)):
            pdparam = U.dense(pol_hidden,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            mean = U.dense(pol_hidden,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # mean * 0.0 + logstd expands logstd to the shape of mean.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    def build_graph(self, obs_ph, acs_ph, reuse=False):
        """Build the conv network mapping (observation, action) to one logit.

        obs_ph: image observations; the hard-coded tiling below assumes a
            spatial size of 84x84, i.e. shape [None, 84, 84, C].
        acs_ph: action indices. They are one-hot encoded here, and the
            ``[:, None, None, :]`` indexing on the result implies a rank-1
            tensor of shape [None].
        reuse: when true, variables in ``self.scope`` are reused.

        Returns the output of the final dense layer (one unit per row).
        """
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()

            action_onehot = tf.one_hot(acs_ph,
                                       self.num_actions,
                                       dtype=tf.float32)
            pixels = obs_ph / 255.0
            # Broadcast the one-hot action over every spatial location so it
            # can be appended to the image as extra channels.
            action_planes = tf.tile(action_onehot[:, None, None, :],
                                    [1, 84, 84, 1])
            net = tf.concat([pixels, action_planes],
                            axis=3)  # [None, 84, 84, C + num_actions]

            conv_specs = (
                (32, "l1", [8, 8], [4, 4]),
                (64, "l2", [4, 4], [2, 2]),
                (64, "l3", [3, 3], [1, 1]),
            )
            for n_filters, layer_name, kernel, stride in conv_specs:
                net = tf.nn.relu(
                    U.conv2d(net,
                             n_filters,
                             layer_name,
                             kernel,
                             stride,
                             pad="VALID"))

            net = U.flattenallbut0(net)
            hidden = tf.layers.dense(
                net,
                512,
                name='lin1',
                kernel_initializer=U.normc_initializer(1.0))
            net = tf.nn.tanh(hidden)
            logits = tf.layers.dense(
                net, 1, name='lin2', kernel_initializer=U.normc_initializer(1.0))
        return logits
Example #21
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
        """Build value and policy heads on top of ``self.rnn`` features.

        Args:
            ob_space: observation space; its shape sizes the "ob" placeholder.
            ac_space: action space; must be a ``gym.spaces.Box``.
            hid_size: width of the dense layers that follow the rnn output.
            num_hid_layers: number of dense layers in each head.
            rnn_hid_units: forwarded to ``self.rnn``.
            gaussian_fixed_var: must be true; only the fixed-variance Gaussian
                head is implemented.

        Sets ``self.pdtype``, ``self.vpred``, ``self.pd``, ``self.state_in``,
        ``self.state_out`` and ``self._act``.
        """
        #assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)

        # Leading ``None`` leaves the batch dimension unspecified.
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        # Value head: rnn features followed by dense layers. The dense layers
        # are applied without an activation function.
        with tf.variable_scope("vf"):
            value_feat = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
            for layer in range(num_hid_layers):
                value_feat = U.dense(value_feat,
                                     hid_size,
                                     "vf_dense%i" % layer,
                                     weight_init=U.normc_initializer(1.0))
            value_out = U.dense(value_feat,
                                1,
                                "vffinal",
                                weight_init=U.normc_initializer(1.0))
            self.vpred = value_out[:, 0]

        # Policy head: same layout under its own variable scope.
        with tf.variable_scope("pf"):
            policy_feat = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
            for layer in range(num_hid_layers):
                policy_feat = U.dense(policy_feat,
                                      hid_size,
                                      "pf_dense%i" % layer,
                                      weight_init=U.normc_initializer(1.0))

            assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            mean = U.dense(policy_feat,
                           pdtype.param_shape()[0] // 2,
                           "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # ``mean * 0.0 + logstd`` broadcasts logstd to the batch shape.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar chooses between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #22
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              tau,
              gaussian_fixed_var=True):
        """Build the value network and the action heads for this policy.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, stored and passed to ``make_pdtype``.
            hid_size: width of every hidden layer (stored on ``self``).
            num_hid_layers: number of hidden layers (stored on ``self``).
            tau: not referenced anywhere in this method.
            gaussian_fixed_var: stored on ``self``; not read directly here.

        Sets ``self.ob``, ``self.ob_``, ``self.ob_rms``, ``self.obz``,
        ``self.vpred``, ``self.pdparam``, ``self.pdparam_``, ``self.pd``,
        ``self.ac``, ``self.ac_`` and ``self._act``.
        """
        assert isinstance(ob_space, gym.spaces.Box)
        print('use zpmpl_Adv')
        self.ac_space = ac_space
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        # Two observation placeholders of identical shape; the leading None
        # leaves the batch dimension unspecified.
        self.ob = U.get_placeholder(name="ob_adv",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        self.ob_ = U.get_placeholder(name="adv_ob_",
                                     dtype=tf.float32,
                                     shape=[sequence_length] +
                                     list(ob_space.shape))

        with tf.variable_scope("obfilter_adv"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('adv_vf'):
            # Standardise with the running statistics, then clip to [-5, 5].
            self.obz = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = self.obz
            for i in range(self.num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        self.hid_size,
                        name="adv_vffc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            # Single-unit output, sliced to one scalar per row.
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name="adv_vffinal",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Action head is built once per placeholder; the second call passes
        # reuse=True (presumably to share variables with the first -- see
        # build_action, which is outside this block).
        self.pdparam = self.build_action(self.ob)
        self.pdparam_ = self.build_action(self.ob_, reuse=True)

        self.pd = pdtype.pdfromflat(self.pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar chooses between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = self.pd.sample()
        # sample_() is defined outside this block; it returns a pair and only
        # the first element is kept.
        self.ac_, _ = self.sample_()
        self._act = U.function([stochastic, self.ob], [ac, self.vpred])
Example #23
0
 def __init__(self, ob_dim, ac_dim, ac_space, bins):
     """Build the policy graph used for acting and for the policy update.

     Args:
         ob_dim: observation width; the "ob" placeholder is twice this wide.
         ac_dim: number of action components.
         ac_space: action space, passed to ``make_pdtype``.
         bins: number of logits per action component in the old-policy
             placeholder (its width is ``ac_dim * bins``).

     Sets ``self.pdtype``, ``self.wd_dict``, ``self.pd``, ``self._act``,
     ``self.compute_kl`` and ``self.update_info``, then calls
     ``U.initialize()``.
     """
     # Inputs: observations, previously taken actions, the logits of the
     # policy that took them, and advantage estimates.
     obs_in = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")
     prev_actions = tf.placeholder(tf.int32, shape=[None, ac_dim], name="ac")
     prev_logits = tf.placeholder(tf.float32,
                                  shape=[None, ac_dim * bins],
                                  name="oldac_logit")
     advantages = tf.placeholder(tf.float32, shape=[None], name="adv")
     self.pdtype = make_pdtype(ac_space)
     weight_losses = {}

     # Forward pass: two 64-unit tanh layers followed by the logits layer.
     hidden = obs_in
     for layer_name in ("h1", "h2"):
         hidden = tf.nn.tanh(
             dense(hidden,
                   64,
                   layer_name,
                   weight_init=U.normc_initializer(1.0),
                   bias_init=0.0,
                   weight_loss_dict=weight_losses))
     logits = dense(hidden,
                    self.pdtype.param_shape()[0],
                    "logits",
                    weight_init=U.normc_initializer(0.1),
                    bias_init=0.0,
                    weight_loss_dict=weight_losses)
     self.wd_dict = weight_losses
     self.pd = self.pdtype.pdfromflat(logits)

     # Action sampling and log-probabilities.
     sampled_actions = self.pd.sample()
     logp_sampled = -self.pd.neglogp(sampled_actions)
     logp_prev = -self.pd.neglogp(prev_actions)

     # KL divergence from the old policy to the current one.
     old_pd = self.pdtype.pdfromflat(prev_logits)
     kl = U.mean(old_pd.kl(self.pd))

     # Surrogate losses: advantage-weighted and unweighted.
     surr = -U.mean(advantages * logp_prev)
     surr_sampled = -U.mean(logp_prev)

     # Callables and tensors exposed to the caller.
     self._act = U.function([obs_in],
                            [sampled_actions, logits, logp_sampled])
     self.compute_kl = U.function([obs_in, prev_logits], kl)
     self.update_info = ((obs_in, prev_actions, advantages), surr,
                         surr_sampled)
     U.initialize()
Example #24
0
def resnet(inputs, hid_size, name):
    """Apply a two-layer dense residual block to ``inputs``.

    The layers are named "<name>_dense1" and "<name>_dense2". ``inputs`` is
    added to the second layer's output before the final ReLU, so its last
    dimension has to be compatible with ``hid_size``. Batch normalisation is
    not applied.
    """
    hidden = tf.nn.relu(
        U.dense(inputs,
                hid_size,
                "%s_dense1" % name,
                weight_init=U.normc_initializer(1.0)))
    residual = U.dense(hidden,
                       hid_size,
                       "%s_dense2" % name,
                       weight_init=U.normc_initializer(1.0))
    # Skip connection: add the block input back in before the nonlinearity.
    return tf.nn.relu(residual + inputs)
    def __init__(self, sess, ob_dim, ac_dim, vf_lr=0.001, cv_lr=0.001, reuse=False):
        """Build a Gaussian policy and a value network under scope "model".

        Args:
            sess: TensorFlow session used by the helper closures below.
            ob_dim: observation width; the "ob" placeholder is twice this wide.
            ac_dim: number of action dimensions.
            vf_lr: learning rate for the Adam optimizer stored in
                ``self.vf_optim``.
            cv_lr: not referenced anywhere in this method.
            reuse: forwarded to ``tf.variable_scope("model", ...)``.

        Exposes the closures ``act``, ``value``, ``preproc``, ``predict`` and
        ``compute_kl`` as attributes, together with ``mean``, ``vf`` and ``a0``.
        """
        # Here we'll construct a bunch of expressions, which will be used in two places:
        # (1) When sampling actions
        # (2) When computing loss functions, for the policy update
        # Variables specific to (1) have the word "sampled" in them,
        # whereas variables specific to (2) have the word "old" in them
        self.relaxed = False
        self.X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # value-network input features (see preproc below)
        self.ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
        self.oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
        oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions

        with tf.variable_scope("model", reuse=reuse):
            # Policy: two 64-unit tanh layers followed by the mean output.
            h1 = tf.nn.tanh(dense(self.ob_no, 64, "pi_h1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h2 = tf.nn.tanh(dense(h1, 64, "pi_h2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            mean_na = dense(h2, ac_dim, "pi", weight_init=U.normc_initializer(0.1), bias_init=0.0) # Mean control output
            self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
            logstd_1a = tf.expand_dims(logstd_1a, 0)
            self.std_1a = tf.exp(logstd_1a)
            # Repeat the single std row once per row of mean_na.
            self.std_na = tf.tile(self.std_1a, [tf.shape(mean_na)[0], 1])
            # ac_dist layout: columns [:ac_dim] hold the mean, [ac_dim:] the std.
            ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(self.std_na, [-1, ac_dim])], 1)
            sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
            logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
            self.logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - self.oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
            kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))


            # Value network: two 64-unit ELU layers on self.X, one scalar output.
            vh1 = tf.nn.elu(dense(self.X, 64, "vf_h1", weight_init=U.normc_initializer(1.0), bias_init=0))
            vh2 = tf.nn.elu(dense(vh1, 64, "vf_h2", weight_init=U.normc_initializer(1.0), bias_init=0))
            vpred_n = dense(vh2, 1, "vf", weight_init=None, bias_init=0)
            v0 = vpred_n[:, 0]
            self.vf_optim = tf.train.AdamOptimizer(vf_lr)

        # Run the policy on a single observation; ob[None] adds the batch axis
        # and the [0] indexing removes it again from each result.
        def act(ob):
            ac, dist, logp = sess.run([sampled_ac_na, ac_dist, logprobsampled_n], {self.ob_no: ob[None]})  # Generate a new action and its logprob
            return ac[0], dist[0], logp[0]
        # Evaluate the value network; both placeholders are fed.
        def value(obs, x):
            return sess.run(v0, {self.X: x, self.ob_no:obs})
        # Assemble value-network features for a path: observations, action
        # distribution, a scaled time index (step / 10) and a column of ones.
        def preproc(path):
            l = pathlength(path)
            al = np.arange(l).reshape(-1,1)/10.0
            act = path["action_dist"].astype('float32')
            X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
            return X
        # Value prediction for every step of a path.
        def predict(obs, path):
            return value(obs, preproc(path))
        # Mean KL between the supplied old distributions and the current policy.
        def compute_kl(ob, dist):
            return sess.run(kl, {self.ob_no: ob, oldac_dist: dist})

        self.mean = mean_na
        self.vf = v0
        self.act = act
        self.value = value
        self.preproc = preproc
        self.predict = predict
        self.compute_kl = compute_kl
        self.a0 = sampled_ac_na
 def build_forward(self, state, reuse):
     """Build the sampling graph for a K-component Gaussian mixture policy.

     A shared tanh MLP feeds ``self.K`` mean heads, each paired with its own
     ``logstd`` variable. One component per row is selected with a uniformly
     drawn one-hot mask, and an action is sampled from that component.

     Args:
         state: input tensor; its static first dimension is read to size the
             noise samples, so it is presumably a fixed batch size -- TODO
             confirm.
         reuse: forwarded to ``tf.variable_scope('forward', ...)``.

     Sets ``self.meandict``, ``self.logstddict`` and ``self.y_sample``;
     nothing is returned.
     """
     # build noise samples
     batch_size = [state.get_shape().as_list()[0], self.input_dim]
     noise_dist = tfd.Normal(loc=0., scale=1.)
     noise_samples = noise_dist.sample(
         batch_size)  # size of [batchsize, action dim]
     # build forward
     last_out = state
     # Despite the "dict" in their names, both of these are plain lists.
     self.meandict = meandict = []
     self.logstddict = logstddict = []
     with tf.variable_scope('forward', reuse=reuse):
         for i in range(self.num_hid_layers):
             last_out = tf.nn.tanh(
                 U.dense(last_out,
                         self.hid_size,
                         "polfc%i" % (i + 1),
                         weight_init=U.normc_initializer(1.0)))
         # One mean head and one logstd variable per mixture component.
         for k in range(self.K):
             mean = U.dense(last_out, self.input_dim,
                            "polfinal_{}".format(k),
                            U.normc_initializer(0.01))
             logstd = tf.get_variable(name="logstd_{}".format(k),
                                      shape=[1, self.input_dim],
                                      initializer=tf.zeros_initializer())
             meandict.append(mean)
             logstddict.append(logstd)
     meandicttf = tf.concat(meandict,
                            axis=1)  # size of [batchsize, action dim * K]
     logstddicttf = tf.concat(logstddict, axis=1)
     # generate masks
     # Equal logits give every component the same selection probability.
     logits = [0.0] * self.K
     # NOTE(review): this reads self.state rather than the `state` argument
     # used for batch_size above -- confirm the two always refer to the same
     # tensor.
     num_samples = self.state.shape.as_list()[0]
     categorical_mask = tf.multinomial([logits], num_samples)
     #print('categoricalmask', categorical_mask)
     onehot_mask = tf.squeeze(tf.one_hot(categorical_mask, self.K), 0)
     #print('onehotmask', onehot_mask)
     # Repeat each component's mask entry input_dim times so that it lines up
     # with the concatenated [batchsize, action dim * K] layout.
     onehot_mask_tiled = tf.squeeze(tf.reshape(
         tf.tile(tf.expand_dims(onehot_mask, axis=2),
                 [1, 1, self.input_dim]), [-1, self.input_dim * self.K, 1]),
                                    axis=2)
     # select
     mean_tiled = tf.multiply(
         onehot_mask_tiled,
         meandicttf)  # size of [batchsize, action dim * K]
     logstd_tiled = tf.multiply(onehot_mask_tiled, logstddicttf)
     # sample action mean and logstd
     mean = tf.reshape(
         mean_tiled,
         [-1, self.K, self.input_dim])  # size of [batchsize, K, action dim]
     logstd = tf.reshape(logstd_tiled, [-1, self.K, self.input_dim])
     # Summing over the component axis keeps only the selected component,
     # because the unselected ones were zeroed by the mask.
     # NOTE(review): keepdims=True retains the reduced axis, so these two
     # tensors are [batchsize, 1, action dim], not [batchsize, action dim].
     # Confirm the broadcast against the rank-2 noise_samples below is
     # intended.
     mean_final = tf.reduce_sum(
         mean, axis=1, keepdims=True)  # size of [batchsize, 1, action dim]
     logstd_final = tf.reduce_sum(logstd, axis=1, keepdims=True)
     # sample action
     action = tf.exp(logstd_final) * noise_samples + mean_final
     self.y_sample = action
Example #27
0
    def build_network(self, sess, scope, ob):
        """Build value and Gaussian policy networks under ``scope``.

        Args:
            sess: not referenced in this method.
            scope: prefix for the "/obfilter", "/vf" and "/pol" variable
                scopes.
            ob: observation tensor fed to both networks.

        Returns:
            ``(ob_rms, vpred, pd, sample_ac, ac_mean)``: the running
            observation statistics, the value prediction, the action
            distribution, a sampled action and the distribution's mode.
        """

        def tanh_stack(features):
            # ``self.num_hid_layers`` tanh dense layers named fc1, fc2, ...
            for layer_no in range(1, self.num_hid_layers + 1):
                features = tf.nn.tanh(
                    tf.layers.dense(
                        features,
                        self.hid_size,
                        name="fc%i" % layer_no,
                        kernel_initializer=U.normc_initializer(1.0)))
            return features

        with tf.variable_scope(scope + "/obfilter"):
            ob_rms = RunningMeanStd(shape=self.ob_space.shape)

        with tf.variable_scope(scope + '/vf'):
            # Standardise with the running statistics, then clip to [-5, 5].
            obz = tf.clip_by_value((ob - ob_rms.mean) / ob_rms.std, -5.0, 5.0)
            value_features = tanh_stack(obz)
            value_out = tf.layers.dense(
                value_features,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))
            vpred = value_out[:, 0]

        with tf.variable_scope(scope + '/pol'):
            policy_features = tanh_stack(obz)
            mean = tf.layers.dense(
                policy_features,
                self.ac_dim,
                name='final',
                kernel_initializer=U.normc_initializer(0.01))

            logstd = tf.get_variable(
                name="logstd",
                shape=[1, self.pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # ``mean * 0.0 + logstd`` broadcasts logstd to the batch shape.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        pd = self.pdtype.pdfromflat(pdparam)
        sample_ac = pd.sample()
        ac_mean = pd.mode()

        return ob_rms, vpred, pd, sample_ac, ac_mean
Example #28
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, num_units=3, num_layers=4):
        """Build the value network and normalizing-flow policy graphs.

        Separate fixed-batch placeholders are created for policy training
        (1024 rows), value training (64), Fisher-vector-product sub-sampling
        (205) and acting (1). The value and policy networks are built once and
        then rebuilt with ``reuse=True`` for the other inputs.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space; its shape sizes the action placeholders.
            hid_size: width of the value network's hidden layers.
            num_hid_layers: number of value-network hidden layers.
            num_units: forwarded to ``NormalizingFlowStateModel``.
            num_layers: forwarded to ``NormalizingFlowStateModel``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        nbatch_train = 1024
        nbatch_vf_train = 64
        nbatch_fvp_train = 205  # sub-sampled size

        def batch_placeholder(name, batch, space):
            # float32 placeholder with a fixed leading batch dimension.
            return U.get_placeholder(name=name,
                                     dtype=tf.float32,
                                     shape=[batch] + list(space.shape))

        self.ob_train = ob_train = batch_placeholder("ob_train", nbatch_train, ob_space)
        self.action_train = action_train = batch_placeholder('ac_train', nbatch_train, ac_space)
        ob_act = batch_placeholder("ob_act", 1, ob_space)
        action_act = batch_placeholder('ac_act', 1, ac_space)
        self.ob_vf_train = ob_vf_train = batch_placeholder("ob_vf_train", nbatch_vf_train, ob_space)
        self.ob_fvp_train = ob_fvp_train = batch_placeholder("ob_fvp_train", nbatch_fvp_train, ob_space)
        self.ac_fvp_train = action_fvp_train = batch_placeholder("ac_fvp_act", nbatch_fvp_train, ac_space)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        def normalize(obs):
            # Standardise with the running statistics, then clip to [-5, 5].
            return tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        obz_train = normalize(ob_train)
        obz_act = normalize(ob_act)
        obz_vf_train = normalize(ob_vf_train)
        obz_fvp_train = normalize(ob_fvp_train)

        def value_head(features, reuse):
            # tanh MLP with a single-unit output, built in the 'value' scope.
            with tf.variable_scope('value', reuse=reuse):
                for layer_no in range(1, num_hid_layers + 1):
                    features = tf.nn.tanh(
                        U.dense(features,
                                hid_size,
                                "vffc%i" % layer_no,
                                weight_init=U.normc_initializer(1.0)))
                return U.dense(features, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        # Value function: first build creates the variables, second reuses them.
        self.vpred_train = value_head(obz_vf_train, False)
        self.vpred_act = value_head(obz_act, True)

        # Policy: one model per input, all under the 'policy' name.
        flow_kwargs = dict(name='policy', num_units=num_units, num_layers=num_layers)
        policy_train = NormalizingFlowStateModel(obz_train, action_train, reuse=False, **flow_kwargs)
        policy_act = NormalizingFlowStateModel(obz_act, action_act, reuse=True, **flow_kwargs)
        policy_fvp_train = NormalizingFlowStateModel(obz_fvp_train, action_fvp_train, reuse=True, **flow_kwargs)

        self.pi_act = policy_act.y_sample  # act for forward sampling
        self.pi_train = policy_fvp_train.y_sample  # for fvp
        self.entropy_train = policy_train.entropy
        self.log_prob_act = policy_act.log_prob
        self.action_act = action_act
        self.log_prob_train = policy_train.log_prob  # logprob
        self.log_prob_fvp_train = policy_fvp_train.log_prob

        self.state_in = []
        self.state_out = []

        # Acting takes only the single-row observation placeholder.
        self._act = U.function([ob_act], [self.pi_act, self.vpred_act])
        self.ob_act = ob_act
Example #29
0
def lstm_graph(ob_combined, input_state_combined, env):
    """Unroll an LSTM policy for ``STEPS_UNROLLED`` steps.

    Args:
        ob_combined: observations indexed as ``[step, :, :]``.
        input_state_combined: initial LSTM state; index 0 on the first axis is
            used as the cell state and index 1 as the hidden state.
        env: environment providing ``action_space`` and ``observation_space``.

    Returns:
        ``(means, stds, logstds, final_state)``: the per-step distribution
        parameters stacked along a new leading axis, and the stacked final
        LSTM state.
    """
    # Distribution type is derived from the action space.
    pdtype = make_pdtype(env.action_space)

    cell = tf.contrib.rnn.LSTMCell(num_units=NUM_UNITS, name="lol")

    # Split the combined initial state into its two halves.
    c_state = input_state_combined[0, :, :]
    m_state = input_state_combined[1, :, :]
    state = tf.tuple([c_state, m_state])

    means, stds, logstds = [], [], []

    for step in range(STEPS_UNROLLED):
        # Every step after the first reuses the variables created by step 0.
        if step > 0:
            tf.get_variable_scope().reuse_variables()

        # Standardise this step's observations, then clip to [-5, 5].
        rms = RunningMeanStd(shape=env.observation_space.shape)
        step_ob = ob_combined[step, :, :]
        obz = tf.clip_by_value((step_ob - rms.mean) / rms.std, -5.0, 5.0)
        output, state = cell(obz, state)

        hidden = tf.layers.dense(output,
                                 64,
                                 name='last',
                                 kernel_initializer=U.normc_initializer(1.0))
        output = tf.nn.tanh(hidden)

        # Final dense layer emits the flat parameters of the distribution.
        pdparam = tf.layers.dense(output,
                                  pdtype.param_shape()[0],
                                  name='final',
                                  kernel_initializer=U.normc_initializer(0.01))
        pd = pdtype.pdfromflat(pdparam)

        means.append(pd.mean)
        stds.append(pd.std)
        logstds.append(pd.logstd)

    # Stack the per-step outputs so losses can be computed over all steps.
    s_mean_combined = tf.stack(means)
    s_std_combined = tf.stack(stds)
    s_logstd_combined = tf.stack(logstds)
    final_state_combined = tf.stack(state)

    return (s_mean_combined, s_std_combined, s_logstd_combined,
            final_state_combined)
Example #30
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        """Build an image-encoder policy with separate value and policy heads.

        Args:
            ob_space: observation space; must be a ``gym.spaces.box.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            hid_size: width of every hidden layer in both heads.
            num_hid_layers: number of hidden layers in each head.
            kind: forwarded to ``self.img_encoder`` (defined elsewhere).

        Sets ``self.pdtype``, ``self.ob``, ``self.vpred``, ``self.pd``,
        ``self.state_in``, ``self.state_out`` and ``self._act``.
        """
        # Call form of print: the bare ``print type(...)`` statement is a
        # SyntaxError on Python 3, while this spelling prints the same text on
        # both Python 2 and Python 3.
        print(type(ob_space))
        assert isinstance(ob_space, gym.spaces.box.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        self.ob = [ob]

        # Scale by 1/255 before encoding (presumably 8-bit pixel input --
        # confirm against the environment).
        x = ob / 255.0

        ob_last = self.img_encoder(x, kind)

        # Value head: ReLU hidden layers and a single-unit output.
        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Policy head: tanh hidden layers (unlike the ReLU value head) and a
        # logits layer sized by the distribution type.
        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name='logits',
                kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []
        # Boolean scalar chooses between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())  # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #31
0
    def __init__(self,
                 name,
                 observation_shape,
                 action_shape,
                 hid_size,
                 num_hid_layers,
                 stochastic=True):
        """Build a diagonal-Gaussian MLP policy under variable scope ``name``.

        Args:
            name: variable scope that holds everything built here.
            observation_shape: per-sample observation shape.
            action_shape: per-sample action shape; its first entry sizes the
                diagonal Gaussian.
            hid_size: width of every hidden layer.
            num_hid_layers: number of tanh hidden layers.
            stochastic: stored on ``self``; not otherwise used in this method.

        Sets ``self.scope``, ``self.pdtype``, ``self.ob_rms``, ``self.pd`` and
        ``self._act``.
        """
        with tf.variable_scope(name):
            self.stochastic = stochastic
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self.action_shape = action_shape
            self.observation_shape = observation_shape
            self.scope = tf.get_variable_scope().name
            self.pdtype = DiagGaussianPdType(action_shape[0])

            # Leading ``None`` leaves the batch dimension unspecified.
            ob_in = U.get_placeholder(name='ob',
                                      dtype=tf.float32,
                                      shape=[None] + list(observation_shape))
            # Boolean scalar chooses between a sampled action and the mode.
            sample_flag = tf.placeholder(dtype=tf.bool, shape=())

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('pol'):
                # Standardise with the running statistics, clip to [-5, 5].
                centered = ob_in - self.ob_rms.mean
                hidden = tf.clip_by_value(centered / self.ob_rms.std, -5.0,
                                          5.0)
                for layer_no in range(1, num_hid_layers + 1):
                    pre_activation = tf.layers.dense(
                        hidden,
                        hid_size,
                        name='fc%i' % layer_no,
                        kernel_initializer=U.normc_initializer(1.0))
                    hidden = tf.nn.tanh(pre_activation)

                mean = tf.layers.dense(
                    hidden,
                    self.pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name='logstd',
                    shape=[1, self.pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # ``mean * 0.0 + logstd`` broadcasts logstd to the batch shape.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

            self.pd = self.pdtype.pdfromflat(pdparam)

            action_op = U.switch(sample_flag, self.pd.sample(),
                                 self.pd.mode())
            self._act = U.function([sample_flag, ob_in], action_op)
Example #32
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True, keep=1.0):
        """Build value and policy MLP towers with a configurable activation.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each tower.
            activation: one of ``'tanh'``, ``'elu'`` or ``'lrelu'``.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, the
                log-std is a standalone variable instead of a network output.
            keep: ``keep_prob`` passed to the dropout after each hidden layer.

        Raises:
            NotImplementedError: if ``activation`` is not a supported name.

        When the module-level ``PREPROCESS`` flag is false, observations are
        standardised with a running mean/std filter and clipped to [-5, 5].
        When it is true the raw placeholder is used and ``self.ob_rms`` is not
        created.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

        if activation == 'tanh':
            activ = tf.nn.tanh
        elif activation == 'elu':
            activ = tf.nn.elu
        elif activation == 'lrelu':
            # Leaky ReLU with slope 0.01 on the negative side.
            activ = lambda x: tf.maximum(x, 0.01 * x)
        else:
            raise NotImplementedError("Not available activation: " + activation)

        if PREPROCESS:
            net_input = ob
        else:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            net_input = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Value tower.
        last_out = net_input
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy tower. It starts from the same (possibly normalised) input as
        # the value tower; previously it was fed the raw placeholder, which
        # bypassed the observation filter whenever PREPROCESS was false.
        last_out = net_input
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # ``mean * 0.0 + logstd`` broadcasts logstd to the batch shape.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # Boolean scalar chooses between a sampled action and the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    def __init__(self,
                 dim_state: int,
                 dim_action: int,
                 hidden_sizes: List[int],
                 normalizer: GaussianNormalizer,
                 init_std=1.):
        """Build a Gaussian MLP policy.

        Args:
            dim_state: width of the state placeholder and of the first layer.
            dim_action: width of the action output and of ``log_std``.
            hidden_sizes: widths of the tanh hidden layers, in order.
            normalizer: stored on ``self``; not otherwise used in this method.
            init_std: initial standard deviation; its log initialises the
                ``log_std`` parameter.
        """
        super().__init__()
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.hidden_sizes = hidden_sizes
        self.init_std = init_std
        self.normalizer = normalizer
        with self.scope:
            self.op_states = tf.placeholder(tf.float32,
                                            shape=[None, dim_state],
                                            name='states')
            self.op_actions_ = tf.placeholder(tf.float32,
                                              shape=[None, dim_action],
                                              name='actions')

            # Consecutive pairs of widths define the hidden Linear layers,
            # each followed by a Tanh.
            widths = [dim_state, *self.hidden_sizes]
            modules = []
            for fan_in, fan_out in zip(widths, widths[1:]):
                modules.append(
                    nn.Linear(fan_in,
                              fan_out,
                              weight_initializer=normc_initializer(1)))
                modules.append(nn.Tanh())
            # Output layer uses a much smaller initialiser scale (0.01).
            modules.append(
                nn.Linear(widths[-1],
                          dim_action,
                          weight_initializer=normc_initializer(0.01)))
            self.net = nn.Sequential(*modules)

            initial_log_std = tf.constant(np.log(self.init_std),
                                          shape=[self.dim_action],
                                          dtype=tf.float32)
            self.op_log_std = nn.Parameter(initial_log_std, name='log_std')

        # Calling the module on the state placeholder yields the action
        # distribution from which the remaining ops are derived.
        self.distribution = self(self.op_states)
        self.op_actions = self.distribution.sample()
        self.op_actions_mean = self.distribution.mean()
        self.op_actions_std = self.distribution.stddev()
        # Negative log-likelihood of the supplied actions, summed over axis 1.
        action_log_probs = self.distribution.log_prob(self.op_actions_)
        self.op_nlls_ = -action_log_probs.reduce_sum(axis=1)

        self.register_callable('[states] => [actions]', self.fast)
Example #34
0
    def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
        """Build the value network, its losses and the KFAC update op.

        The network is two 64-unit ELU layers followed by a single linear
        output. Its input placeholder has width ``ob_dim*2 + ac_dim*2 + 2``.

        Args:
            ob_dim: observation dimensionality; contributes ``ob_dim*2``
                columns to the input placeholder.
            ac_dim: action dimensionality; contributes ``ac_dim*2`` columns
                to the input placeholder.

        Side effects:
            Sets ``self._predict``, ``self.q_runner`` and ``self.do_update``,
            and calls ``U.initialize()``.
        """
        X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        # Handed to every dense layer and to the optimizer below.
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
        # Prediction perturbed with unit-variance Gaussian noise.
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        # NOTE(review): presumably `dense` registers its weight-decay terms in
        # the "vf_losses" collection -- confirm; tf.add_n below needs this
        # list to be non-empty.
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        # Squared error against the noisy sample, with the gradient blocked
        # through the sample itself.
        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        # `async` is a reserved keyword from Python 3.7 on, so writing
        # `async=1` directly is a SyntaxError there. Dict unpacking passes
        # the very same keyword argument while keeping this file parseable.
        # NOTE(review): newer kfac versions may have renamed this parameter --
        # confirm against the kfac module actually imported by this file.
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                                   clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                                   kfac_update=2, cold_iter=50,
                                   weight_decay_dict=wd_dict, max_grad_norm=None,
                                   **{"async": 1})
        # Only variables whose name contains "vf" are optimized.
        vf_var_list = [var for var in tf.trainable_variables() if "vf" in var.name]

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables