Beispiel #1
0
class SharedMLP:
    """Feed-forward network with a shared trunk and two linear output heads.

    Two ReLU hidden layers of 20 units each feed an action head
    (``n_actions`` units) and a state-value head (1 unit).

    Args:
        sess: Object whose ``run(fetches, feed_dict)`` evaluates graph tensors.
        state_dim: Size of the second axis of the observation placeholder.
        n_actions: Number of units of the action head.
        reuse: Forwarded to ``tf.variable_scope`` for the "model" scope.

    Attributes:
        obs_in: Float32 placeholder of shape ``[None, state_dim]``.
        ap_out: Output of the action head (no activation applied).
        vf_out: Output of the value head (no activation applied).
        pd: ``CategoricalPd`` built from ``ap_out``.
        a0: Tensor returned by ``pd.sample()``.
        initial_states: Always ``None``; this model keeps no recurrent state.
        step: ``step(obs, dones, lstm_states)`` -> ``(a, v, initial_states, neglogprob)``.
        value: ``value(obs, dones, lstm_states)`` -> evaluated state values.
    """

    def __init__(self, sess, state_dim, n_actions, reuse=False):
        # No recurrent state here; the attribute exists so that callers can
        # read `initial_states` from this model as well.
        self.initial_states = None

        # Model input
        self.obs_in = tf.placeholder(tf.float32, [None, state_dim], name='obs_in')

        # NOTE: keep the layer creation order below; the layers are unnamed, so
        # their variable names are assigned in creation order.
        with tf.variable_scope("model", reuse=reuse):
            hidden_first = tf.layers.dense(inputs=self.obs_in, units=20, activation=tf.nn.relu)
            hidden_second = tf.layers.dense(inputs=hidden_first, units=20, activation=tf.nn.relu)

            # Both heads are linear and share the second hidden layer.
            self.ap_out = tf.layers.dense(inputs=hidden_second, units=n_actions)  # action head
            self.vf_out = tf.layers.dense(inputs=hidden_second, units=1)  # state value head

        # The action head output is handed to CategoricalPd, which is expected
        # to treat it as unnormalized action scores and expose a distribution.
        # NOTE(review): CategoricalPd is defined elsewhere; confirm what
        # `sample` and `neglogprob` return.
        self.pd = CategoricalPd(self.ap_out)
        self.a0 = self.pd.sample()
        sampled = self.a0

        # Drop the trailing unit axis of the value head.
        state_value = self.vf_out[:, 0]

        # The sampled tensor is passed back in as the target of `neglogprob`.
        sampled_neglogprob = self.pd.neglogprob(sampled)

        step_fetches = [sampled, state_value, sampled_neglogprob]

        def step(obs, dones, lstm_states):
            """Evaluate sample, value and neglogprob for `obs`.

            `dones` and `lstm_states` are accepted but not used.
            """
            feed = {self.obs_in: obs}
            actions, values, neglogprobs = sess.run(step_fetches, feed)
            return actions, values, self.initial_states, neglogprobs

        def value(obs, dones, lstm_states):
            """Evaluate only the state value for `obs`.

            `dones` and `lstm_states` are accepted but not used.
            """
            feed = {self.obs_in: obs}
            return sess.run(state_value, feed)

        self.step = step
        self.value = value
Beispiel #2
0
class LSTM_CatPD:
    """Recurrent network: two dense layers, an LSTM, and two linear heads.

    Two ReLU hidden layers of 20 units each feed the ``lstm`` helper, whose
    output drives an action head (``n_actions`` units) and a state-value
    head (1 unit).

    Args:
        sess: Object whose ``run(fetches, feed_dict)`` evaluates graph tensors.
        state_dim: Size of the second axis of the observation placeholder.
        n_actions: Number of units of the action head.
        n_steps: Accepted but not used inside this constructor.
        n_lstm: Forwarded to ``lstm``; the state placeholder is ``2 * n_lstm`` wide.
        reuse: Forwarded to ``tf.variable_scope`` for the "model" scope.

    Attributes:
        obs_in: Float32 placeholder of shape ``[None, state_dim]``.
        D: Float32 placeholder of shape ``[None]`` for the done flags.
        LS: Float32 placeholder of shape ``[None, n_lstm * 2]`` for the LSTM state.
        ap_out: Output of the action head (no activation applied).
        vf_out: Output of the value head (no activation applied).
        pd: ``CategoricalPd`` built from ``ap_out``.
        a0: Tensor returned by ``pd.sample()``.
        initial_states: One-element list holding a zero float32 vector of
            length ``n_lstm * 2``.
        step: ``step(obs, dones, lstm_states)`` -> list
            ``[a0, ap_out, v0, new_lstm_state, neglogprob]`` as evaluated values.
        value: ``value(obs, dones, lstm_states)`` -> evaluated state values.
    """

    def __init__(self, sess, state_dim, n_actions, n_steps, n_lstm=256, reuse=False):
        state_width = n_lstm * 2  # cell and hidden state stored side by side

        # Zero state used before any LSTM state has been produced.
        self.initial_states = [np.zeros(state_width, dtype=np.float32)]

        # Graph inputs
        self.obs_in = tf.placeholder(tf.float32, [None, state_dim], name='obs_in')  # observations
        self.D = tf.placeholder(tf.float32, [None], name='dones')  # dones
        self.LS = tf.placeholder(tf.float32, [None, state_width], name='lstm_s')  # cell and hidden states

        # NOTE: keep the layer creation order below; the dense layers are
        # unnamed, so their variable names are assigned in creation order.
        with tf.variable_scope("model", reuse=reuse):
            hidden_first = tf.layers.dense(inputs=self.obs_in, units=20, activation=tf.nn.relu)
            hidden_second = tf.layers.dense(inputs=hidden_first, units=20, activation=tf.nn.relu)

            # LSTM cell; `lstm` is a helper defined elsewhere that returns the
            # cell output together with the updated state tensor.
            lstm_out, next_state = lstm(hidden_second, self.D, self.LS, scope='lstm', n_lstm=n_lstm)

            # Both heads are linear and share the LSTM output.
            self.ap_out = tf.layers.dense(inputs=lstm_out, units=n_actions)  # action head
            self.vf_out = tf.layers.dense(inputs=lstm_out, units=1)  # state value head

        # The action head output is handed to CategoricalPd, which is expected
        # to treat it as unnormalized action scores and expose a distribution.
        # NOTE(review): CategoricalPd is defined elsewhere; confirm what
        # `sample` and `neglogprob` return.
        self.pd = CategoricalPd(self.ap_out)
        self.a0 = self.pd.sample()
        sampled = self.a0

        # Drop the trailing unit axis of the value head.
        state_value = self.vf_out[:, 0]

        # The sampled tensor is passed back in as the target of `neglogprob`.
        sampled_neglogprob = self.pd.neglogprob(sampled)

        def _feed(obs, dones, lstm_states):
            # All three placeholders must be supplied on every evaluation.
            return {self.obs_in: obs, self.D: dones, self.LS: lstm_states}

        def step(obs, dones, lstm_states):
            """Evaluate sample, action head, value, new LSTM state and neglogprob."""
            fetches = [sampled, self.ap_out, state_value, next_state, sampled_neglogprob]
            return sess.run(fetches, _feed(obs, dones, lstm_states))

        def value(obs, dones, lstm_states):
            """Evaluate only the state value."""
            return sess.run(state_value, _feed(obs, dones, lstm_states))

        self.step = step
        self.value = value