Code Example #1
    def __init__(self, sess, state_dim, n_actions, reuse=False):
        # Model Input
        self.obs_in = tf.placeholder(dtype=tf.float32, shape=[None, state_dim], name='obs_in')
        with tf.variable_scope("model", reuse=reuse):
            h1 = tf.layers.dense(self.obs_in, units=20, activation=tf.nn.relu)
            h2 = tf.layers.dense(h1, units=20, activation=tf.nn.relu)

            self.ap_out = tf.layers.dense(h2, units=n_actions, activation=None)  # action probabilities
            self.vf_out = tf.layers.dense(h2, units=1, activation=None)  # state value

        # The NN outputs unnormalized action scores (logits). They are wrapped in a categorical
        # probability distribution from which actions can be sampled.
        self.pd = CategoricalPd(self.ap_out)  # init the distribution with the NN output logits
        a0 = self.pd.sample()  # sample one action index per batch entry; sampling adds small noise to the distribution derived from the NN output (shape of a0: [batch])
        v0 = self.vf_out[:, 0]  # flatten the value output to shape [batch]

        neglogprob0 = self.pd.neglogprob(a0)  # negative log-probability of the sampled actions (a0 serves as the label in the cross-entropy computation)
        self.initial_states = None  # the feed-forward policy carries no recurrent state

        # Prediction functions for a complete step and for the state value only
        def step(obs, dones, lstm_states):  # dones and lstm_states are unused; kept for a uniform interface with the recurrent policy
            a, v, neglogprob = sess.run([a0, v0, neglogprob0], {self.obs_in: obs})
            return a, v, self.initial_states, neglogprob

        def value(obs, dones, lstm_states):
            return sess.run(v0, {self.obs_in: obs})
            # return sess.run(self.vf_out, {self.obs_in: obs})

        self.step = step
        self.value = value
        self.a0 = a0
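
The policy classes in these examples rely on a CategoricalPd helper that is not shown. Below is a minimal sketch of what such a class could look like, assuming the Gumbel-max sampling and cross-entropy negative log-probability described in the comments above; the actual implementation in the source codebase may differ.

import tensorflow as tf

class CategoricalPd(object):
    """Categorical distribution parameterized by unnormalized logits."""
    def __init__(self, logits):
        self.logits = logits  # shape [batch, n_actions]

    def sample(self):
        # Gumbel-max trick: perturb the logits with noise derived from
        # uniform samples, then take the argmax. This is equivalent to
        # sampling from softmax(logits).
        u = tf.random_uniform(tf.shape(self.logits))
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)

    def neglogprob(self, actions):
        # Negative log-probability of the given actions, computed as the
        # softmax cross entropy between the logits and the action labels.
        return tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits, labels=actions)

    neglogp = neglogprob  # alias used by Code Examples #3 and #4
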
Code Example #2
    def __init__(self, sess, state_dim, n_actions, n_steps, n_lstm=256, reuse=False):
        self.obs_in = tf.placeholder(dtype=tf.float32, shape=[None, state_dim], name='obs_in') # observations
        self.D = tf.placeholder(dtype=tf.float32, shape=[None], name='dones')  # dones
        self.LS = tf.placeholder(dtype=tf.float32, shape=[None, n_lstm*2], name='lstm_s')  # cell and hidden states

        with tf.variable_scope("model", reuse=reuse):
            h1 = tf.layers.dense(self.obs_in, units=20, activation=tf.nn.relu)
            h2 = tf.layers.dense(h1, units=20, activation=tf.nn.relu)

            # LSTM cell
            h3, s_new = lstm(h2, self.D, self.LS, scope='lstm', n_lstm=n_lstm)

            self.ap_out = tf.layers.dense(h3, units=n_actions, activation=None)
            self.vf_out = tf.layers.dense(h3, units=1, activation=None)

        # The NN outputs unnormalized action scores (logits). They are wrapped in a categorical
        # probability distribution from which actions can be sampled.
        self.pd = CategoricalPd(self.ap_out)  # init the distribution with the NN output logits
        a0 = self.pd.sample()  # sample one action index per batch entry; sampling adds small noise to the distribution derived from the NN output (shape of a0: [batch])
        v0 = self.vf_out[:, 0]  # flatten the value output to shape [batch]

        neglogprob0 = self.pd.neglogprob(a0)  # negative log-probability of the sampled actions (a0 serves as the label in the cross-entropy computation)
        self.initial_states = [np.zeros(shape=n_lstm*2, dtype=np.float32)]  # zero cell and hidden state for a single environment

        def step(obs, dones, lstm_states):
            return sess.run([a0, self.ap_out, v0, s_new, neglogprob0], {self.obs_in: obs, self.D: dones, self.LS: lstm_states})

        def value(obs, dones, lstm_states):
            return sess.run(v0, {self.obs_in: obs, self.D: dones, self.LS: lstm_states})
            # return sess.run([self.vf_out], {self.obs_in: obs, self.D: dones, self.LS: lstm_states})

        self.step = step
        self.value = value
        self.a0 = a0
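
A hypothetical driver loop for this recurrent policy could look as follows. It shows how the dones flags and the flat [cell; hidden] state vector returned by step are threaded into the next call. The class name LSTMPolicy, the environment env, and the dimensions are assumptions for illustration; the snippet itself omits the class header.

import tensorflow as tf

sess = tf.Session()
policy = LSTMPolicy(sess, state_dim=4, n_actions=2, n_steps=1)  # hypothetical class name
sess.run(tf.global_variables_initializer())

obs = env.reset()  # env is assumed to be a Gym-style environment with a flat observation vector
lstm_states = policy.initial_states  # zero cell and hidden state, broadcast to shape [1, n_lstm*2]
done = False
while not done:
    a, ap, v, lstm_states, neglogprob = policy.step(
        [obs], dones=[float(done)], lstm_states=lstm_states)
    obs, reward, done, _ = env.step(a[0])
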
Code Example #3
class MLPPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
        # this method is called with nbatch = nenvs*nsteps

        # nh, nw, nc = ob_space.shape
        # ob_shape = (nbatch, nh, nw, nc)
        # actdim = ac_space.shape[0]
        # TODO: check initialization
        # Input and Output dimensions
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        with tf.variable_scope("model", reuse=reuse):
            if activ_fcn == 'relu6':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))

                h3 = tf.nn.relu6(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'elu':
                h1 = tf.nn.elu(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.elu(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))

                h3 = tf.nn.elu(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'mixed':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))

                h3 = tf.nn.tanh(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            else:
                raise ValueError('Unknown activation function: %s' % activ_fcn)

            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)

            vf = fc(h2, 'vf', 1)[:, 0]  # predicted value of the input state; the value head branches off h2, while the policy head gets the extra layer h3

        self.pd = CategoricalPd(pi_logit)  # categorical distribution over actions, parameterized by the logits
        a0 = self.pd.sample()  # sampled action index (e.g. 0 or 1 for a two-action space)
        # a0 = tf.argmax(pi, axis=1)
        neglogp0 = self.pd.neglogp(a0)

        self.initial_state = None  # the feed-forward policy carries no recurrent state

        def step(ob, *_args, **_kwargs):
            a, pi, v, neglogp = sess.run([a0, pi_logit, vf, neglogp0], {X: ob})
            return a, pi, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.step = step
        self.value = value
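
A short construction and inference example for MLPPolicy, assuming Gym-style observation and action spaces (CartPole is used here for illustration), the fc helper from the same codebase, and arbitrary layer sizes:

import gym
import tensorflow as tf

env = gym.make('CartPole-v0')  # Box observation space of shape (4,), Discrete(2) action space
sess = tf.Session()
# With nenvs = nsteps = 1 the observation placeholder has shape (1, obs_dim).
policy = MLPPolicy(sess, env.observation_space, env.action_space,
                   nenvs=1, nsteps=1, units_per_hlayer=(64, 64, 64))
sess.run(tf.global_variables_initializer())

ob = env.reset()
a, pi, v, _, neglogp = policy.step(ob[None])  # add the batch dimension
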
Code Example #4
    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
        # this method is called with nbatch = nenvs*nsteps

        # nh, nw, nc = ob_space.shape
        # ob_shape = (nbatch, nh, nw, nc)
        # actdim = ac_space.shape[0]
        # TODO: check initialization
        # Input and Output dimensions
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        with tf.variable_scope("model", reuse=reuse):
            if activ_fcn == 'relu6':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'elu':
                h1 = tf.nn.elu(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.elu(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'mixed':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))
            else:
                raise ValueError('Unknown activation function: %s' % activ_fcn)

            # The output of layer 2 has shape [nenvs * nsteps, h_units]. For RNN processing it is
            # reshaped to a 3-D tensor of shape [nenvs, nsteps, h_units].
            rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=units_per_hlayer[1], state_is_tuple=True)
            rnn_input = tf.reshape(h2, shape=[nenvs, nsteps, units_per_hlayer[1]])
            rnn_state_in = rnn_cell.zero_state(batch_size=nenvs,
                                               dtype=tf.float32)  # zero initial state; overridden via feed_dict in step() and value()
            rnn_output, rnn_state_out = tf.nn.dynamic_rnn(inputs=rnn_input,
                                                          cell=rnn_cell,
                                                          initial_state=rnn_state_in,
                                                          dtype=tf.float32,
                                                          scope="model" + '_rnn')
            # The output of the recurrent cell is reshaped back to the flat batch shape [nenvs * nsteps, h_units].
            rnn_output = tf.reshape(rnn_output, shape=[-1, units_per_hlayer[1]])

            if activ_fcn == 'relu6':
                activ = tf.nn.relu6
            elif activ_fcn == 'elu':
                activ = tf.nn.elu
            elif activ_fcn == 'mixed':
                activ = tf.nn.tanh
            else:
                raise ValueError('Unknown activation function: %s' % activ_fcn)
            h3 = activ(fc(rnn_output, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)

            vf = fc(rnn_output, 'vf', 1)[:, 0]  # predicted value of input state

        self.pd = CategoricalPd(pi_logit)  # categorical distribution over actions, parameterized by the logits
        a0 = self.pd.sample()  # sampled action index (e.g. 0 or 1 for a two-action space)
        # a0 = tf.argmax(pi_logit, axis=1)
        neglogp0 = self.pd.neglogp(a0)

        # The RNN state is a tuple of the cell state c and the hidden state h (the previous output h_{t-1}).
        self.initial_state = (np.zeros([nenvs, units_per_hlayer[1]]), np.zeros([nenvs, units_per_hlayer[1]]))

        def step(ob, r_state, *_args, **_kwargs):
            a, pi, v, r_state_out, neglogp = sess.run([a0, pi_logit, vf, rnn_state_out, neglogp0], {X: ob, rnn_state_in: r_state})
            return a, pi, v, r_state_out, neglogp

        def value(ob, r_state, *_args, **_kwargs):
            return sess.run(vf, {X: ob, rnn_state_in: r_state})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.rnn_state_in = rnn_state_in
        self.rnn_state_out = rnn_state_out
        self.step = step
        self.value = value
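
Since this variant keeps the LSTM state as a (c, h) tuple, the caller carries the tuple returned by step into the next call and resets it at episode boundaries. A minimal rollout sketch, with the class name RecurrentMLPPolicy, the environment, and the layer sizes assumed for illustration:

import gym
import tensorflow as tf

env = gym.make('CartPole-v0')
sess = tf.Session()
policy = RecurrentMLPPolicy(sess, env.observation_space, env.action_space,  # hypothetical class name
                            nenvs=1, nsteps=1, units_per_hlayer=(64, 64, 64))
sess.run(tf.global_variables_initializer())

ob = env.reset()
r_state = policy.initial_state  # zero (c, h) tuple, each entry of shape [nenvs, n_units]
for _ in range(100):
    a, pi, v, r_state, neglogp = policy.step(ob[None], r_state)
    ob, reward, done, _ = env.step(a[0])
    if done:
        ob = env.reset()
        r_state = policy.initial_state  # reset the recurrent state at episode boundaries
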