class MLPPolicy(object):
    """Feed-forward actor-critic policy with a categorical action distribution.

    The graph consists of two hidden layers (``pi_vf_fc1``, ``pi_vf_fc2``)
    whose output feeds both the value head (``vf``) and a third hidden layer
    (``pi_fc1``) that feeds the policy logits (``pi``). Layers are built with
    the ``fc`` helper, which is defined elsewhere in the project.

    Attributes set on the instance:
        X: observation placeholder of shape ``(nenvs * nsteps, nd)``.
        pi_logit: policy logits, one column per action.
        pi: softmax of ``pi_logit``.
        vf: value prediction, first column of the ``vf`` layer output.
        pd: ``CategoricalPd`` built from ``pi_logit``.
        ac: action tensor obtained from ``pd.sample()``.
        initial_state: always ``None`` (this policy is not recurrent).
        step: callable ``step(ob)`` -> ``(action, logits, value, None, neglogp)``.
        value: callable ``value(ob)`` -> value prediction.
    """

    # activ_fcn -> (tf.nn activation for the two shared layers,
    #               tf.nn activation for the policy-only layer).
    # Stored as names so that validation does not need TensorFlow.
    _ACTIVATIONS = {
        'relu6': ('relu6', 'relu6'),
        'elu': ('elu', 'elu'),
        'mixed': ('relu6', 'tanh'),
    }

    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False,
                 activ_fcn='relu6'):  # pylint: disable=W0613
        """Build the policy graph.

        Args:
            sess: session whose ``run`` method is used by ``step`` and ``value``.
            ob_space: observation space; ``ob_space.shape`` must be a 1-tuple.
            ac_space: action space; ``ac_space.n`` is the number of actions.
            nenvs: number of environments; ``nenvs * nsteps`` is the batch size.
            nsteps: number of steps per environment in one batch.
            units_per_hlayer: indexable with at least three entries, the
                widths of ``pi_vf_fc1``, ``pi_vf_fc2`` and ``pi_fc1``.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
            activ_fcn: one of ``'relu6'``, ``'elu'`` or ``'mixed'``
                (relu6 on the shared layers, tanh on the policy layer).

        Raises:
            ValueError: if ``activ_fcn`` is not a supported name.
        """
        # Validate before touching the graph: previously an unknown name fell
        # through every branch and failed later with an unbound-variable
        # error, after the placeholder had already been created.
        if activ_fcn not in self._ACTIVATIONS:
            raise ValueError(
                "Unsupported activ_fcn {!r}; expected one of {}".format(
                    activ_fcn, sorted(self._ACTIVATIONS)))
        shared_name, head_name = self._ACTIVATIONS[activ_fcn]
        shared_activ = getattr(tf.nn, shared_name)
        head_activ = getattr(tf.nn, head_name)

        # Input and output dimensions.
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n

        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # observations
        with tf.variable_scope("model", reuse=reuse):
            # TODO: check initialization of the hidden layers
            # (e.g. init_scale=np.sqrt(2)).
            h1 = shared_activ(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))
            h2 = shared_activ(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))
            h3 = head_activ(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))
            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)
            # The value head branches off the second shared layer, not h3.
            vf = fc(h2, 'vf', 1)[:, 0]

        self.pd = CategoricalPd(pi_logit)
        a0 = self.pd.sample()  # sampled action index
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            """Run the graph on ``ob``; the second item returned is the logits."""
            action, logits, v, neglogp = sess.run([a0, pi_logit, vf, neglogp0], {X: ob})
            return action, logits, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            """Return the value prediction for ``ob``."""
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.step = step
        self.value = value
class LSTMPolicy(object):
    """Recurrent actor-critic policy with a categorical action distribution.

    Two hidden layers (``pi_vf_fc1``, ``pi_vf_fc2``) feed a
    ``BasicLSTMCell`` with ``units_per_hlayer[1]`` units. The LSTM output
    feeds the value head (``vf``) directly and the policy logits (``pi``)
    through one more hidden layer (``pi_fc1``). Layers are built with the
    ``fc`` helper, which is defined elsewhere in the project.

    Attributes set on the instance:
        X: observation placeholder of shape ``(nenvs * nsteps, nd)``.
        pi_logit: policy logits, one column per action.
        pi: softmax of ``pi_logit``.
        vf: value prediction, first column of the ``vf`` layer output.
        pd: ``CategoricalPd`` built from ``pi_logit``.
        ac: action tensor obtained from ``pd.sample()``.
        rnn_state_in: LSTM state tensors at the start of the sequence.
        rnn_state_out: LSTM state tensors after the last step.
        initial_state: pair of zero arrays, each ``(nenvs, units_per_hlayer[1])``.
        step: callable ``step(ob, r_state)`` ->
            ``(action, logits, value, r_state_out, neglogp)``.
        value: callable ``value(ob, r_state)`` -> value prediction.
    """

    # activ_fcn -> (tf.nn activation for the two layers in front of the LSTM,
    #               tf.nn activation for the policy-only layer behind it).
    # Stored as names so that validation does not need TensorFlow.
    _ACTIVATIONS = {
        'relu6': ('relu6', 'relu6'),
        'elu': ('elu', 'elu'),
        'mixed': ('relu6', 'tanh'),
    }

    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False,
                 activ_fcn='relu6'):  # pylint: disable=W0613
        """Build the policy graph.

        Args:
            sess: session whose ``run`` method is used by ``step`` and ``value``.
            ob_space: observation space; ``ob_space.shape`` must be a 1-tuple.
            ac_space: action space; ``ac_space.n`` is the number of actions.
            nenvs: number of environments; used as the LSTM batch size.
            nsteps: steps per environment; used as the LSTM sequence length.
            units_per_hlayer: indexable with at least three entries, the
                widths of ``pi_vf_fc1``, ``pi_vf_fc2`` (also the LSTM size)
                and ``pi_fc1``.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
            activ_fcn: one of ``'relu6'``, ``'elu'`` or ``'mixed'``
                (relu6 in front of the LSTM, tanh on the policy layer).

        Raises:
            ValueError: if ``activ_fcn`` is not a supported name.
        """
        # Validate before touching the graph: previously an unknown name fell
        # through every branch and failed later with an unbound-variable
        # error, after part of the graph had already been created.
        if activ_fcn not in self._ACTIVATIONS:
            raise ValueError(
                "Unsupported activ_fcn {!r}; expected one of {}".format(
                    activ_fcn, sorted(self._ACTIVATIONS)))
        shared_name, head_name = self._ACTIVATIONS[activ_fcn]
        shared_activ = getattr(tf.nn, shared_name)
        head_activ = getattr(tf.nn, head_name)

        # Input and output dimensions.
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        lstm_units = units_per_hlayer[1]

        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # observations
        with tf.variable_scope("model", reuse=reuse):
            # TODO: check initialization of the hidden layers
            # (e.g. init_scale=np.sqrt(2)).
            h1 = shared_activ(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))
            h2 = shared_activ(fc(h1, 'pi_vf_fc2', nh=lstm_units))

            # The flat [nenvs * nsteps, units] output of layer 2 is reshaped
            # to a 3-D tensor [nenvs, nsteps, units] for the RNN.
            rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_units, state_is_tuple=True)
            rnn_input = tf.reshape(h2, shape=[nenvs, nsteps, lstm_units])
            # Zero state unless overridden; step() and value() feed the
            # caller-supplied state through this same tensor tuple.
            rnn_state_in = rnn_cell.zero_state(batch_size=nenvs, dtype=tf.float32)
            rnn_output, rnn_state_out = tf.nn.dynamic_rnn(inputs=rnn_input,
                                                          cell=rnn_cell,
                                                          initial_state=rnn_state_in,
                                                          dtype=tf.float32,
                                                          scope='model_rnn')
            # Flatten the RNN output back to one row per (env, step) pair.
            rnn_output = tf.reshape(rnn_output, shape=[-1, lstm_units])

            h3 = head_activ(fc(rnn_output, 'pi_fc1', nh=units_per_hlayer[2]))
            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)
            # The value head branches off the LSTM output, not h3.
            vf = fc(rnn_output, 'vf', 1)[:, 0]

        self.pd = CategoricalPd(pi_logit)
        a0 = self.pd.sample()  # sampled action index
        neglogp0 = self.pd.neglogp(a0)

        # One zero array per component of the LSTM state tuple.
        self.initial_state = (np.zeros([nenvs, lstm_units]),
                              np.zeros([nenvs, lstm_units]))

        def step(ob, r_state, *_args, **_kwargs):
            """Run the graph on ``ob`` starting from ``r_state``.

            The second item returned is the logits; the fourth is the LSTM
            state after the last step.
            """
            action, logits, v, r_state_out, neglogp = sess.run(
                [a0, pi_logit, vf, rnn_state_out, neglogp0],
                {X: ob, rnn_state_in: r_state})
            return action, logits, v, r_state_out, neglogp

        def value(ob, r_state, *_args, **_kwargs):
            """Return the value prediction for ``ob`` starting from ``r_state``."""
            return sess.run(vf, {X: ob, rnn_state_in: r_state})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.rnn_state_in = rnn_state_in
        self.rnn_state_out = rnn_state_out
        self.step = step
        self.value = value