def __init__(self, sess, state_dim, n_actions, reuse=False):
    """Build a feed-forward policy/value graph and attach ``step``/``value``.

    Args:
        sess: Object whose ``run(fetches, feed_dict)`` evaluates the graph
            (presumably a ``tf.Session`` -- confirm against the caller).
        state_dim: Size of the second axis of the observation placeholder.
        n_actions: Number of units in the policy output layer.
        reuse: Forwarded to ``tf.variable_scope("model", reuse=...)``.
    """
    # Observation input; the leading (batch) axis is left unspecified.
    self.obs_in = tf.placeholder(name='obs_in', dtype=tf.float32, shape=[None, state_dim])

    with tf.variable_scope("model", reuse=reuse):
        # Two ReLU hidden layers of 20 units each feed both output heads.
        hidden = self.obs_in
        for _ in range(2):
            hidden = tf.layers.dense(hidden, units=20, activation=tf.nn.relu)
        # Linear (un-activated) heads: one output per action, and a single state value.
        self.ap_out = tf.layers.dense(hidden, units=n_actions, activation=None)
        self.vf_out = tf.layers.dense(hidden, units=1, activation=None)

    # The un-normalised policy outputs parameterise the categorical distribution.
    self.pd = CategoricalPd(self.ap_out)
    sampled_action = self.pd.sample()  # presumably one action per input row -- confirm in CategoricalPd
    state_value = self.vf_out[:, 0]  # drop the trailing unit axis of the value head
    sampled_neglogprob = self.pd.neglogprob(sampled_action)

    # This policy carries no recurrent state.
    self.initial_states = None
    self.a0 = sampled_action

    def step(obs, dones, lstm_states):
        """Run one forward pass; ``dones`` and ``lstm_states`` are accepted but unused."""
        feed = {self.obs_in: obs}
        fetched = sess.run([sampled_action, state_value, sampled_neglogprob], feed)
        return fetched[0], fetched[1], self.initial_states, fetched[2]

    def value(obs, dones, lstm_states):
        """Evaluate only the state value; ``dones`` and ``lstm_states`` are unused."""
        feed = {self.obs_in: obs}
        return sess.run(state_value, feed)

    self.step = step
    self.value = value
def __init__(self, sess, state_dim, n_actions, n_steps, n_lstm=256, reuse=False):
    """Build a recurrent policy/value graph and attach ``step``/``value``.

    Args:
        sess: Object whose ``run(fetches, feed_dict)`` evaluates the graph
            (presumably a ``tf.Session`` -- confirm against the caller).
        state_dim: Size of the second axis of the observation placeholder.
        n_actions: Number of units in the policy output layer.
        n_steps: Not referenced anywhere in this constructor.
        n_lstm: Forwarded to ``lstm``; the state placeholder is ``2 * n_lstm`` wide.
        reuse: Forwarded to ``tf.variable_scope("model", reuse=...)``.
    """
    state_width = n_lstm * 2  # cell and hidden state share a single vector

    self.obs_in = tf.placeholder(name='obs_in', shape=[None, state_dim], dtype=tf.float32)  # observations
    self.D = tf.placeholder(name='dones', shape=[None], dtype=tf.float32)  # done flags
    self.LS = tf.placeholder(name='lstm_s', shape=[None, state_width], dtype=tf.float32)  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        # Two ReLU hidden layers of 20 units each precede the recurrent layer.
        features = self.obs_in
        for _ in range(2):
            features = tf.layers.dense(features, units=20, activation=tf.nn.relu)

        # Recurrent layer: yields its output and the updated state tensor.
        recurrent_out, next_state = lstm(features, self.D, self.LS, scope='lstm', n_lstm=n_lstm)

        # Linear (un-activated) heads on top of the recurrent output.
        self.ap_out = tf.layers.dense(recurrent_out, units=n_actions, activation=None)
        self.vf_out = tf.layers.dense(recurrent_out, units=1, activation=None)

    # The un-normalised policy outputs parameterise the categorical distribution.
    self.pd = CategoricalPd(self.ap_out)
    action_op = self.pd.sample()  # presumably one action per input row -- confirm in CategoricalPd
    value_op = self.vf_out[:, 0]  # drop the trailing unit axis of the value head
    neglogprob_op = self.pd.neglogprob(action_op)

    # One-element list holding an all-zero state vector of length 2 * n_lstm.
    self.initial_states = [np.zeros(shape=state_width, dtype=np.float32)]
    self.a0 = action_op

    def _feed(obs, dones, lstm_states):
        """Map the three runtime inputs onto their placeholders."""
        return {self.obs_in: obs, self.D: dones, self.LS: lstm_states}

    def step(obs, dones, lstm_states):
        """Return ``[action, policy outputs, value, new state, neglogprob]`` from one run."""
        fetches = [action_op, self.ap_out, value_op, next_state, neglogprob_op]
        return sess.run(fetches, _feed(obs, dones, lstm_states))

    def value(obs, dones, lstm_states):
        """Evaluate only the state value for the given inputs."""
        return sess.run(value_op, _feed(obs, dones, lstm_states))

    self.step = step
    self.value = value
class MLPPolicy(object):
    """Feed-forward actor-critic policy with two shared layers and a policy-only layer.

    The constructor builds the graph and exposes the tensors ``X``, ``pi``,
    ``pi_logit``, ``vf`` and ``ac`` together with the callables ``step`` and
    ``value``.
    """

    # Activation presets accepted by ``activ_fcn``.
    _SUPPORTED_ACTIVATIONS = ('relu6', 'elu', 'mixed')

    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
        """Build the policy graph.

        Args:
            sess: Object whose ``run(fetches, feed_dict)`` evaluates the graph
                (presumably a ``tf.Session`` -- confirm against the caller).
            ob_space: Observation space; ``ob_space.shape`` must be a 1-tuple.
            ac_space: Action space; ``ac_space.n`` is the number of actions.
            nenvs: Number of environments; the batch size is ``nenvs * nsteps``.
            nsteps: Number of steps per environment in one batch.
            units_per_hlayer: Layer widths; indices 0, 1 and 2 are read.
            reuse: Forwarded to ``tf.variable_scope("model", reuse=...)``.
            activ_fcn: One of ``'relu6'``, ``'elu'`` or ``'mixed'``.

        Raises:
            ValueError: If ``activ_fcn`` is not a supported preset.
        """
        # Fail fast, before anything is added to the graph: previously an unknown
        # name fell through the if/elif chain and surfaced as an UnboundLocalError.
        if activ_fcn not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                "Unsupported activ_fcn {!r}; expected one of {}".format(
                    activ_fcn, self._SUPPORTED_ACTIVATIONS))

        # Input and output dimensions.
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # observations

        # (activation of the two shared layers, activation of the policy-only layer)
        shared_activ, policy_activ = {
            'relu6': (tf.nn.relu6, tf.nn.relu6),
            'elu': (tf.nn.elu, tf.nn.elu),
            'mixed': (tf.nn.relu6, tf.nn.tanh),
        }[activ_fcn]

        with tf.variable_scope("model", reuse=reuse):
            h1 = shared_activ(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))
            h2 = shared_activ(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))
            h3 = policy_activ(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))
            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)
            # The value head branches off the second shared layer, not off h3.
            vf = fc(h2, 'vf', 1)[:, 0]

        self.pd = CategoricalPd(pi_logit)
        a0 = self.pd.sample()  # presumably an action index per row -- confirm in CategoricalPd
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None  # no recurrent state in this policy

        def step(ob, *_args, **_kwargs):
            """Return ``(action, logits, value, initial_state, neglogp)`` for ``ob``.

            The second element is ``pi_logit`` (pre-softmax), not ``pi``.
            """
            a, logits, v, neglogp = sess.run([a0, pi_logit, vf, neglogp0], {X: ob})
            return a, logits, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            """Return the value estimate for ``ob``; extra arguments are ignored."""
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.step = step
        self.value = value
def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
    """Build a recurrent actor-critic graph and attach ``step``/``value``.

    Two dense layers feed a ``BasicLSTMCell`` unrolled with
    ``tf.nn.dynamic_rnn``; the policy and value heads read the RNN output.

    Args:
        sess: Object whose ``run(fetches, feed_dict)`` evaluates the graph
            (presumably a ``tf.Session`` -- confirm against the caller).
        ob_space: Observation space; ``ob_space.shape`` must be a 1-tuple.
        ac_space: Action space; ``ac_space.n`` is the number of actions.
        nenvs: Number of environments; used as the RNN batch size.
        nsteps: Number of steps per environment; used as the RNN sequence length.
        units_per_hlayer: Layer widths; indices 0, 1 and 2 are read, and
            index 1 is also the LSTM size.
        reuse: Forwarded to ``tf.variable_scope("model", reuse=...)``.
        activ_fcn: One of ``'relu6'``, ``'elu'`` or ``'mixed'``.

    Raises:
        ValueError: If ``activ_fcn`` is not a supported preset.
    """
    # Fail fast, before anything is added to the graph: previously an unknown
    # name fell through the if/elif chains and surfaced as an UnboundLocalError.
    supported_activations = ('relu6', 'elu', 'mixed')
    if activ_fcn not in supported_activations:
        raise ValueError(
            "Unsupported activ_fcn {!r}; expected one of {}".format(
                activ_fcn, supported_activations))

    # Input and output dimensions.
    nd, = ob_space.shape
    nbatch = nenvs * nsteps
    ob_shape = (nbatch, nd)
    nact = ac_space.n
    rnn_units = units_per_hlayer[1]
    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # observations

    # (activation of the layers before the RNN, activation of the layer after it)
    pre_rnn_activ, post_rnn_activ = {
        'relu6': (tf.nn.relu6, tf.nn.relu6),
        'elu': (tf.nn.elu, tf.nn.elu),
        'mixed': (tf.nn.relu6, tf.nn.tanh),
    }[activ_fcn]

    with tf.variable_scope("model", reuse=reuse):
        h1 = pre_rnn_activ(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))
        h2 = pre_rnn_activ(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))

        # The flat [nenvs * nsteps, units] activations are reshaped to
        # [nenvs, nsteps, units] for the RNN.
        # NOTE(review): this assumes rows are grouped per environment -- confirm with the caller.
        rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_units, state_is_tuple=True)
        rnn_input = tf.reshape(h2, shape=[nenvs, nsteps, rnn_units])
        # A zero state is the default; step()/value() feed a caller-supplied state over it.
        rnn_state_in = rnn_cell.zero_state(batch_size=nenvs, dtype=tf.float32)
        rnn_output, rnn_state_out = tf.nn.dynamic_rnn(inputs=rnn_input,
                                                      cell=rnn_cell,
                                                      initial_state=rnn_state_in,
                                                      dtype=tf.float32,
                                                      scope="model" + '_rnn')
        # Flatten the RNN output back to one row per (env, step) pair.
        rnn_output = tf.reshape(rnn_output, shape=[-1, rnn_units])

        h3 = post_rnn_activ(fc(rnn_output, 'pi_fc1', nh=units_per_hlayer[2]))
        pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logit)
        # The value head reads the RNN output directly, not h3.
        vf = fc(rnn_output, 'vf', 1)[:, 0]

    self.pd = CategoricalPd(pi_logit)
    a0 = self.pd.sample()  # presumably an action index per row -- confirm in CategoricalPd
    neglogp0 = self.pd.neglogp(a0)

    # Zero-filled pair matching the two parts of the LSTM state tuple.
    self.initial_state = (np.zeros([nenvs, units_per_hlayer[1]]),
                          np.zeros([nenvs, units_per_hlayer[1]]))

    def step(ob, r_state, *_args, **_kwargs):
        """Return ``(action, logits, value, new_rnn_state, neglogp)``.

        The second element is ``pi_logit`` (pre-softmax), not ``pi``.
        """
        a, logits, v, r_state_out, neglogp = sess.run(
            [a0, pi_logit, vf, rnn_state_out, neglogp0],
            {X: ob, rnn_state_in: r_state})
        return a, logits, v, r_state_out, neglogp

    def value(ob, r_state, *_args, **_kwargs):
        """Return the value estimate for ``ob`` given the RNN state ``r_state``."""
        return sess.run(vf, {X: ob, rnn_state_in: r_state})

    self.X = X
    self.pi = pi
    self.pi_logit = pi_logit
    self.vf = vf
    self.ac = a0
    self.rnn_state_in = rnn_state_in
    self.rnn_state_out = rnn_state_out
    self.step = step
    self.value = value