# NOTE: the helper functions used below (conv, fc, conv_to_fc, batch_to_seq, seq_to_batch,
# lstm, sample, make_pdtype) are assumed to come from OpenAI Baselines' a2c utilities and
# distributions module; the exact import paths below are an assumption.
import gym
import numpy as np
import tensorflow as tf

from baselines.a2c.utils import batch_to_seq, conv, conv_to_fc, fc, lstm, sample, seq_to_batch
from baselines.common.distributions import make_pdtype


def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
    """
    Parameters:
    ----------
    env             RL environment

    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which policy distribution parameters should be inferred

    vf_latent       latent state from which value function should be inferred (if None, then latent is used)

    sess            tensorflow session to run calculations in (if None, default session is used)

    **tensors       tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent

    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    # Based on the action space, will select what probability distribution type
    self.pdtype = make_pdtype(env.action_space)

    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    # Take an action
    self.action = self.pd.sample()

    # Calculate the neg log of our probability
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess or tf.get_default_session()

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]
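
# Hedged usage sketch (added, not part of the original file): assuming the __init__ above
# belongs to a policy class such as Baselines' PolicyWithValue, one environment step amounts
# to a single sess.run over the tensors it wires up. `policy` and `obs_batch` are
# hypothetical names used only for illustration.
def _example_policy_step(policy, obs_batch):
    # policy.action, policy.vf and policy.neglogp were built in __init__;
    # policy.X is the observation placeholder they all depend on.
    actions, values, neglogpacs = policy.sess.run(
        [policy.action, policy.vf, policy.neglogp],
        feed_dict={policy.X: obs_batch})
    return actions, values, neglogpacs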
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper.
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
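
# Hedged usage sketch (added): nature_cnn expects a uint8 NHWC image batch and returns a
# 512-unit feature vector; a typical caller builds a placeholder and applies it under a
# variable scope. The (84, 84, 4) stacked-Atari-frame shape is an assumption for illustration.
def _example_nature_cnn_features(ob_shape=(None, 84, 84, 4)):
    X = tf.placeholder(tf.uint8, ob_shape)  # raw, unscaled frames
    with tf.variable_scope('example_cnn'):
        features = nature_cnn(X)             # -> (batch, 512) float32 features
    return X, features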
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        pi_logits = fc(h, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h, 'q', nact)

    # could change this to use self.pi instead
    a = sample(tf.nn.softmax(pi_logits))
    self.initial_state = []  # not stateful
    self.X = X
    self.pi = pi  # actual policy params now
    self.pi_logits = pi_logits
    self.q = q
    self.vf = q

    def step(ob, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0 = sess.run([a, pi], {X: ob})
        return a0, pi0, []  # dummy state

    def out(ob, *args, **kwargs):
        pi0, q0 = sess.run([pi, q], {X: ob})
        return pi0, q0

    def act(ob, *args, **kwargs):
        return sess.run(a, {X: ob})

    self.step = step
    self.out = out
    self.act = act
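
# Hedged usage sketch (added, hypothetical names): the feed-forward ACER-style policy above
# exposes three closures with different outputs; step() is what a rollout runner calls,
# out() returns the full policy and Q tables, and act() returns only sampled actions.
def _example_acer_cnn_queries(policy, obs):
    actions, mus, _ = policy.step(obs)   # sampled actions, action probabilities, dummy state
    pi0, q0 = policy.out(obs)            # full softmax policy and per-action Q-values
    actions_only = policy.act(obs)       # sampled actions alone
    return actions, mus, pi0, q0, actions_only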
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)

        # lstm
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)

        pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h5, 'q', nact)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi  # actual policy params now
    self.q = q

    def step(ob, state, mask, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
        return a0, pi0, s

    self.step = step
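
# Hedged usage sketch (added, hypothetical names): stepping the recurrent policy above means
# carrying the LSTM state and the done mask between calls, unlike the feed-forward policy
# whose step() ignores them. A rollout starts from policy.initial_state.
def _example_recurrent_rollout_step(policy, obs, state, dones):
    # state has shape (nenv, nlstm * 2); dones is the per-env mask fed into M.
    actions, mus, next_state = policy.step(obs, state, dones)
    return actions, mus, next_state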
def network_fn(X):
    h = tf.layers.flatten(X)
    for i in range(num_layers):
        h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))
        if layer_norm:
            h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
        h = activation(h)
    return h
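
# Hedged sketch (added): num_layers, num_hidden, activation and layer_norm are free variables
# in the network_fn above, so in baselines-style model registries they are supplied as closure
# variables by an enclosing builder roughly like the one below; the name mlp_builder and the
# default values are assumptions for illustration. The same closure pattern supplies
# conv_kwargs to the small CNN network_fn that follows.
def mlp_builder(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False):
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
        return h
    return network_fn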
def network_fn(X):
    h = tf.cast(X, tf.float32) / 255.

    activ = tf.nn.relu
    h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
    h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
    h = conv_to_fc(h)
    h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
    return h
def __init__(self, env, observations, latent, dones, states=None, estimate_q=False, vf_latent=None, sess=None):
    """
    Parameters:
    ----------
    env             RL environment

    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which policy distribution parameters should be inferred

    dones           tensorflow placeholder for episode-termination flags, exposed via the step dicts

    states          dict with 'current' and 'next' recurrent state tensors (None for feed-forward policies)

    vf_latent       latent state from which value function should be inferred (if None, then latent is used)

    sess            tensorflow session to run calculations in (if None, default session is used)
    """
    self.X = observations
    self.dones = dones
    self.pdtype = make_pdtype(env.action_space)
    self.states = states
    self.sess = sess or tf.get_default_session()

    vf_latent = vf_latent if vf_latent is not None else latent

    with tf.variable_scope('policy'):
        latent = tf.layers.flatten(latent)
        # Based on the action space, will select what probability distribution type
        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    with tf.variable_scope('sample_action'):
        self.action = self.pd.sample()

    with tf.variable_scope('negative_log_probability'):
        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)

    with tf.variable_scope('value'):
        vf_latent = tf.layers.flatten(vf_latent)

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.value = self.q
        else:
            self.value = fc(vf_latent, 'value', 1, init_scale=0.01)
            self.value = self.value[:, 0]

    self.step_input = {
        'observations': observations,
        'dones': self.dones,
    }

    self.step_output = {
        'actions': self.action,
        'values': self.value,
        'neglogpacs': self.neglogp,
    }

    if self.states:
        self.initial_state = np.zeros(self.states['current'].get_shape())
        self.step_input.update({'states': self.states['current']})
        self.step_output.update({
            'states': self.states['current'],
            'next_states': self.states['next'],
        })
    else:
        self.initial_state = None
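
# Hedged usage sketch (added, hypothetical names): the step_input / step_output dictionaries
# built above let a single sess.run evaluate one environment step; the feed dict is assembled
# by matching the keys the policy declared as inputs.
def _example_dict_policy_step(policy, observations, dones, states=None):
    feed = {policy.step_input['observations']: observations,
            policy.step_input['dones']: dones}
    if states is not None and 'states' in policy.step_input:
        feed[policy.step_input['states']] = states
    # Returns a dict with 'actions', 'values', 'neglogpacs' and, for recurrent
    # policies, 'states' / 'next_states'.
    return policy.sess.run(policy.step_output, feed_dict=feed)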