def img_encoder(self, x, kind):
    if kind == 'small':  # from A3C paper
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [1, 1], pad="VALID")), 4)
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [1, 1], pad="VALID")), 2)
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [1, 1], pad="VALID")), 4)
        x = max_pool(
            tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [1, 1], pad="VALID")), 2)
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    return x
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits',
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value',
                                 kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
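# Usage sketch (an assumption, not part of the original code): a policy built
# with the _init above is typically driven through its _act function, one
# observation at a time. The names `pi` and `env` are hypothetical.
#
#   ob = env.reset()
#   ac, vpred = pi._act(True, ob[None])  # feed a batch of one observation
#   ac, vpred = ac[0], vpred[0]
#
# Note that although a `stochastic` placeholder is fed, `ac = self.pd.sample()`
# always samples (hence the "# XXX"); a deterministic mode would need
# U.switch(stochastic, self.pd.sample(), self.pd.mode()) as in other policies below.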
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n
    X = tf.placeholder(tf.float32, ob_shape)  # obs
    print(ob_shape)
    self.pdtype = pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        '''
        h = conv(X, 'c1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        h2 = conv(h, 'c2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        h3 = conv(h2, 'c3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

        hh = conv(X, 'xc1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        hh2 = conv(hh, 'xc2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        hh3 = conv(hh2, 'xc3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        hh3 = conv_to_fc(hh3)
        hh4 = fc(hh3, 'xfc1', nh=512, init_scale=np.sqrt(2))

        pi = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
        vf = fc(hh4, 'v', 1, act=lambda x: x)[:, 0]
        '''
        x = tf.nn.relu(U.conv2d(X, 32, "l1", [3, 3], [1, 1], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [3, 3], [1, 1], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 128, "l3", [3, 3], [1, 1], pad="SAME"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))

        y = tf.nn.relu(U.conv2d(X, 32, "yl1", [3, 3], [1, 1], pad="SAME"))
        y = tf.nn.relu(U.conv2d(y, 64, "yl2", [3, 3], [1, 1], pad="SAME"))
        y = tf.nn.relu(U.conv2d(y, 128, "yl3", [3, 3], [1, 1], pad="SAME"))
        y = U.flattenallbut0(y)
        y = tf.nn.relu(U.dense(y, 512, 'ylin', U.normc_initializer(1.0)))

        pi = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        vf = U.dense(y, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.pd = self.pdtype.pdfromflat(pi)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    obscaled = ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space):
    self.pdtype = distributions.make_pdtype(ac_space)

    ob = U.get_placeholder(name='ob', dtype=tf.int32,
                           shape=[None] + list(ob_space.shape))
    next_blocks, my_grid, opp_grid = tf.split(ob, [16, 12 * 6, 12 * 6], axis=1)

    with tf.variable_scope('next_blocks'):
        next_blocks = tf.one_hot(next_blocks, depth=5)
        next_blocks = U.flattenallbut0(next_blocks)
        next_blocks = tf.nn.leaky_relu(
            tf.layers.dense(next_blocks, 12, name='l1',
                            kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
        next_blocks = tf.nn.leaky_relu(
            tf.layers.dense(next_blocks, 12, name='l2',
                            kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
    with tf.variable_scope('grids', reuse=False):
        my_grid = _grid_cnn(my_grid)
    with tf.variable_scope('grids', reuse=True):
        opp_grid = _grid_cnn(opp_grid)

    x = tf.concat([next_blocks, my_grid, opp_grid], axis=1)
    x = tf.nn.leaky_relu(
        tf.layers.dense(x, 64, name='lin',
                        kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)

    logits = tf.layers.dense(x, self.pdtype.param_shape()[0], name='logits',
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = self.pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value',
                                 kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _grid_cnn(x):
    x = tf.reshape(x, shape=[-1, 12, 6])
    x = tf.one_hot(x, depth=7)
    x = tf.nn.leaky_relu(U.conv2d(x, 12, 'l1', [3, 3], [1, 1], pad='VALID'), alpha=0.1)
    x = tf.nn.leaky_relu(U.conv2d(x, 12, 'l2', [3, 3], [1, 1], pad='VALID'), alpha=0.1)
    x = U.flattenallbut0(x)
    return x
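# Shape sketch (numpy-only illustration, not part of the original code):
# _grid_cnn takes a flat length-72 slice of cell ids in [0, 7) and turns it
# into a (12, 6, 7) one-hot volume before the two VALID 3x3 convolutions.
import numpy as np

grid = np.random.randint(0, 7, size=(1, 72))   # one flattened 12x6 grid
reshaped = grid.reshape(-1, 12, 6)             # tf.reshape(x, [-1, 12, 6])
one_hot = np.eye(7)[reshaped]                  # equivalent of tf.one_hot(x, depth=7)
assert one_hot.shape == (1, 12, 6, 7)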
def build_graph(self, obs_ph, acs_ph, reuse=False):
    """
    obs_ph: tf tensor shape of [None, 84, 84, 4]
    acs_ph: tf tensor shape of [None], integer action indices
        (expanded to a one-hot encoding below)
    """
    with tf.variable_scope(self.scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        one_hot_ac = tf.one_hot(acs_ph, self.num_actions, dtype=tf.float32)
        x = tf.concat([
            obs_ph / 255.0,
            tf.tile(one_hot_ac[:, None, None, :], [1, 84, 84, 1])
        ], axis=3)  # [None, 84, 84, 4 + ac_dim]
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.tanh(
            tf.layers.dense(x, 512, name='lin1',
                            kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(x, 1, name='lin2',
                                 kernel_initializer=U.normc_initializer(1.0))
    return logits
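# Shape sketch (numpy-only illustration; the action count of 6 is an assumption):
# the graph above broadcasts a one-hot action over the 84x84 grid and stacks it
# onto the frame channels before the convolutions.
import numpy as np

obs = np.zeros((2, 84, 84, 4), np.float32)
one_hot_ac = np.eye(6, dtype=np.float32)[[0, 3]]               # two actions, one-hot
tiled = np.tile(one_hot_ac[:, None, None, :], (1, 84, 84, 1))  # tf.tile equivalent
x = np.concatenate([obs / 255.0, tiled], axis=3)
assert x.shape == (2, 84, 84, 10)                              # 4 + ac_dim channels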
def _create_network(self):
    l = self.ob / 255.0
    if self.kind == 'small':  # from A3C paper
        l = tf.nn.relu(U.conv2d(l, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        l = tf.nn.relu(U.conv2d(l, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        l = U.flattenallbut0(l)
        l = tf.nn.relu(U.dense(l, 256, 'lin', U.normc_initializer(1.0)))
    elif self.kind == 'large':  # Nature DQN
        l = tf.nn.relu(U.conv2d(l, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        l = tf.nn.relu(U.conv2d(l, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        l = tf.nn.relu(U.conv2d(l, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        l = U.flattenallbut0(l)
        l = tf.nn.relu(U.dense(l, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    self._create_logit_value(l, l)
def _build(name, x):
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(
            U.conv2d(x, 16, "%s_l1" % name, [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d(x, 32, "%s_l2" % name, [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        # name prefixed like the 'large' branch so two _build calls don't collide
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='%s_lin' % name,
                            kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(
            U.conv2d(x, 32, "%s_l1" % name, [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d(x, 64, "%s_l2" % name, [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d(x, 64, "%s_l3" % name, [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='%s_lin' % name,
                            kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    return x
def vggm1234(x, TRAIN_COVN=True):
    net = slim.convolution(x, 96, [7, 7], 2, padding='VALID', scope='conv1',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4 * 1, beta=0.75)
    net = slim.pool(net, [3, 3], 'MAX', stride=2, padding='VALID', scope='pool1')
    net = slim.convolution(net, 256, [5, 5], 2, padding='VALID', scope='conv2',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4 * 1, beta=0.75)
    net = slim.pool(net, [3, 3], 'MAX', stride=2, padding='VALID', scope='pool2')
    net = slim.convolution(net, 512, [3, 3], 1, padding='VALID', scope='conv3',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)
    net = slim.convolution(net, 512, [3, 3], 1, padding='VALID', scope='conv4',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE,
                           trainable=TRAIN_COVN)
    return U.flattenallbut0(net)
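# Hedged helper (not from the original code): the flattened output size of
# vggm1234 depends on the input resolution, which this file does not fix.
# This sketch computes it for the VALID convs/pools above; 107x107 is only an
# example input size, not something the original asserts.
def _valid_out(size, kernel, stride):
    return (size - kernel) // stride + 1

def vggm1234_flat_dim(s):
    s = _valid_out(s, 7, 2)  # conv1
    s = _valid_out(s, 3, 2)  # pool1
    s = _valid_out(s, 5, 2)  # conv2
    s = _valid_out(s, 3, 2)  # pool2
    s = _valid_out(s, 3, 1)  # conv3
    s = _valid_out(s, 3, 1)  # conv4
    return s * s * 512

assert vggm1234_flat_dim(107) == 512  # a 107x107 crop collapses to 1x1x512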
def _init(self, ob_space, ac_space):
    """
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    """
    obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)
    obs_normalized = obs / 255.0

    with tf.variable_scope(self.name + "/pol", reuse=self.reuse):
        layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        layer_2 = tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        layer_2 = tf_utils.flattenallbut0(layer_2)
        layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin',
                                             kernel_initializer=tf_utils.normc_initializer(1.0)))
        logits = tf.layers.dense(layer_3, pdtype.param_shape()[0], name='logits',
                                 kernel_initializer=tf_utils.normc_initializer(0.01))
        self.proba_distribution = pdtype.proba_distribution_from_flat(logits)
    with tf.variable_scope(self.name + "/vf", reuse=self.reuse):
        layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        layer_2 = tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        layer_2 = tf_utils.flattenallbut0(layer_2)
        layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin',
                                             kernel_initializer=tf_utils.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(layer_3, 1, name='value',
                                     kernel_initializer=tf_utils.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    if self.stochastic_ph is None:
        self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
    action = self.proba_distribution.sample()
    self._act = tf_utils.function([self.stochastic_ph, obs], [action, self.vpred])
def img_encoder(self, img, kind, mode="input"):
    """mode denotes where to add the coord conv:
    "input" means add it only after the input tensor;
    "all" means add it after tensors at every level.
    (Only the "input" behaviour is implemented below.)
    """
    _, num_rows, num_cols, _ = img.get_shape().as_list()
    addcoord = AddCoords(x_dim=num_cols, y_dim=num_rows, with_r=False, skiptile=True)
    img_coord = addcoord(img)

    x = tf.nn.relu(U.conv2d(img_coord, 32, "l1", [8, 8], [4, 4], pad="VALID"))
    x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
    x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
    x = U.flattenallbut0(x)
    x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                   kernel_initializer=U.normc_initializer(1.0)))
    return x
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
    print("This is an LSTM policy for sensors only.")
    assert isinstance(ob_space, tuple)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32,
                             shape=[sequence_length] + list(ob_space[0].shape))
    ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32,
                             shape=[sequence_length] + list(ob_space[1].shape))

    # process ob_p
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
    obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # process ob_f
    x = ob_f / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    # lstm layer for memory
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name="rnn")
    c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
    h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
    self.state_init = (c_init, h_init)
    c_in = U.get_placeholder(name="state_c", dtype=tf.float32,
                             shape=(None, lstm_cell.state_size.c))
    h_in = U.get_placeholder(name="state_h", dtype=tf.float32,
                             shape=(None, lstm_cell.state_size.h))
    self.state_in = (c_in, h_in)
    state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
    lstm_outputs, lstm_states = lstm_cell(x, state_in)
    lstm_c, lstm_h = lstm_states
    self.state_out = (lstm_c, lstm_h)
    rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))

    # concatenate sensor and physics features
    ob_last = tf.concat((rnn_out, obpz), axis=-1)

    # value network
    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                                  kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final',
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # policy network
    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                                  kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits',
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob_p, ob_f, c_in, h_in],
                           [ac, self.vpred, lstm_c, lstm_h])
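# Usage sketch (hypothetical names, not part of the original code): the LSTM
# state is threaded manually through _act, starting from state_init and
# feeding each step's state_out back in on the next call.
#
#   c, h = pi.state_init
#   for t in range(horizon):
#       ac, vpred, c, h = pi._act(True, ob_p[None], ob_f[None], c, h)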
def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers, kind, elm_mode):
    assert isinstance(ob_space, gym.spaces.Box)
    assert isinstance(sensor_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    ob_sensor = U.get_placeholder(name="ob_sensor", dtype=tf.float32,
                                  shape=[sequence_length] + list(sensor_space.shape))

    x = ob / 255.0
    x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
    x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))

    num_res_net_blocks = 3
    for i in range(num_res_net_blocks):
        input_data = x
        for j in range(2):
            x = tf.nn.relu(
                U.conv2d(x, 32, "l%i" % (2 * i + 3 + j), filter_size=[3, 3], pad="SAME"))
        x = tf.nn.relu(tf.math.add(x, input_data))

    x = U.flattenallbut0(x)
    x = tf.nn.relu(
        tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    ## Obfilter on sensor output
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=sensor_space.shape)
    obz_sensor = tf.clip_by_value(
        (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz_sensor
    if not elm_mode:
        ## Adapted from mlp_policy
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name="vffc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        y = tf.layers.dense(last_out, 64, name="vffinal",
                            kernel_initializer=U.normc_initializer(1.0))
    else:
        last_out = tf.nn.tanh(
            tf.layers.dense(last_out, hid_size, name="vffc1",
                            kernel_initializer=U.normc_initializer(1.0), trainable=False))
        y = tf.layers.dense(last_out, 64, name="vffinal",
                            kernel_initializer=U.normc_initializer(1.0))

    x = tf.concat([x, y], 1)

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name="logits",
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(
        x, 1, name="value", kernel_initializer=U.normc_initializer(1.0))[:, 0]
    # self.session.run(logits.kernel)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob, ob_sensor], [ac, self.vpred, logits])
def _init(self, ob_space, ac_space, kind, num_options=2, dc=0, w_intfc=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        hidden = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        hidden = tf.nn.relu(
            tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = dense3D2(hidden, pdtype.param_shape()[0], "polfinal", option,
                      num_options=num_options, weight_init=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = dense3D2(hidden, 1, "vffinal", option, num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(hidden), 1, "termhead", option,
                 num_options=num_options, weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX

    # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(hidden), num_options, "OP", weight_init=U.normc_initializer(1.0)))
    self.op_pi = tf.nn.softmax(
        U.dense(hidden, num_options, "OPfinal", weight_init=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(
        U.dense(hidden, num_options, "intfcfinal", weight_init=U.normc_initializer(1.0)))

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
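# Numpy sketch (illustration only) of the termination sampling above:
# comparing the sigmoid termination probability against a uniform draw is
# exactly a Bernoulli(tpred) sample per state, which is what
# tf.greater(self.tpred, tf.random_uniform(...)) computes.
import numpy as np

def sample_termination(tpred, rng=np.random):
    """tpred: array of termination probabilities in [0, 1]."""
    return tpred > rng.uniform(size=tpred.shape)

probs = np.array([0.0, 1.0])
assert (sample_termination(probs) == [False, True]).all()  # deterministic endpoints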
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
    assert isinstance(ob_space, tuple)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32,
                             shape=[sequence_length] + list(ob_space[0].shape))
    ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32,
                             shape=[sequence_length] + list(ob_space[1].shape))

    # process ob_p
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
    obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # process ob_f
    x = ob_f / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    ob_last = tf.concat((obpz, x), axis=-1)

    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits',
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob_p, ob_f], [ac, self.vpred])
def __init__(self, input_shape, scope, args):
    assert len(input_shape) == 3
    self.input_shape = input_shape  # (W, H, Channels)
    self.scope = scope
    self.MASKS = args.masks
    self.Z_SIZE = args.z_size
    self.EPSILON = 1e-8
    self.checkpoint_path = args.checkpoint_path
    if not os.path.exists(self.checkpoint_path):
        os.makedirs(self.checkpoint_path)
    self.trained_epochs = tf.Variable(0, dtype=tf.int32, name='trained_epochs',
                                      trainable=False)
    self.inc_trained_epochs = self.trained_epochs.assign_add(1)

    ## Build net
    with tf.variable_scope('input'):
        self.x_in = tf.placeholder(name="x_in", dtype="float",
                                   shape=(None, ) + self.input_shape)  # Batch, W, H, Channels
        self.z_in = tf.placeholder(name="z_in", dtype="float",
                                   shape=(None, ) + (self.Z_SIZE, ))  # Batch, Z
        self.mask = tf.placeholder(name="mask", dtype="float",
                                   shape=(None, ) + self.input_shape[:-1] + (1, ))  # Batch, W, H, 1
    with tf.variable_scope('is_training'):
        self.is_training = tf.placeholder(tf.bool, name="is_training")
    with tf.variable_scope('kl_tolerance'):
        self.kl_tolerance = tf.placeholder(name="kl_tolerance", dtype=tf.float32)

    # def build_VAE(x_in, mask, is_training, kl_tolerance, Z_SIZE):
    """
    x_in (tf.placeholder): input (and target output) of the autoencoder network
    mask (tf.placeholder): is_person mask. Where this mask is True, the normal
        reconstruction_loss is computed; where it is False, the loss is set to 0.
    is_training (tf.placeholder): is training
    kl_tolerance (scalar, or tf.placeholder):
    Z_SIZE (scalar): size of the latent z dimension
    """
    is_training = self.is_training
    x = self.x_in
    _7 = 7 if input_shape[0] > 64 else 1  # either 1 or 7 (whether input is lidar or image)
    _3 = 3 if input_shape[0] > 64 else 1  # either 1 or 3
    _3_else_2 = 3 if input_shape[0] > 64 else 2

    with tf.variable_scope('encoder'):
        print("A0: {}".format(x.shape))
        x = tf.layers.batch_normalization(
            tf.nn.relu(U.conv2d(x, 64, "l1", [_7, 7], [_3, 3], pad="SAME",
                                summary_tag="Conv/Layer1")),
            training=is_training)
        print("A1: {}".format(x.shape))
        x = tf.layers.max_pooling2d(x, (_3, 3), (_3, 3), padding="SAME", name="Conv/MaxPool")
        xres = x
        print("A2: {}".format(x.shape))
        x = tf.layers.batch_normalization(
            tf.nn.relu(U.conv2d(x, 64, "l2", [_3, 3], [1, 1], pad="SAME",
                                summary_tag="Conv/Layer2")),
            training=is_training)
        print("A3: {}".format(x.shape))
        x = tf.layers.batch_normalization(
            U.conv2d(x, 64, "l3", [_3, 3], [1, 1], pad="SAME", summary_tag="Conv/Layer3"),
            training=is_training)
        print("A4: {}".format(x.shape))
        xres2 = x
        x = tf.nn.relu(x + xres)
        x = tf.layers.batch_normalization(
            tf.nn.relu(U.conv2d(x, 64, "l4", [_3_else_2, 3], [1, 1], pad="SAME",
                                summary_tag="Conv/Layer4")),
            training=is_training)
        print("A5: {}".format(x.shape))
        x = tf.layers.batch_normalization(
            U.conv2d(x, 64, "l5", [_3_else_2, 3], [1, 1], pad="SAME",
                     summary_tag="Conv/Layer5"),
            training=is_training)
        print("A6: {}".format(x.shape))
        x = tf.nn.relu(x + xres2)
        x = tf.layers.average_pooling2d(x, (_3, 3), (_3, 3), padding="SAME", name="Conv/AvgPool")
        endconv_shape = x.shape
        print("A7: {}".format(x.shape))
        x = U.flattenallbut0(x)
        endconv_flat_shape = x.shape
        print("A8: {}".format(x.shape))
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
        print("A9: {}".format(x.shape))
        tf.summary.histogram("encoder/lin/output", x)

    with tf.variable_scope('latent_space'):
        z_mu = tf.nn.sigmoid(tf.layers.dense(x, self.Z_SIZE, name='z_mu',
                                             kernel_initializer=U.normc_initializer(1.0)))
        z_logvar = tf.nn.relu(tf.layers.dense(x, self.Z_SIZE, name='z_logvar',
                                              kernel_initializer=U.normc_initializer(1.0)))
        z_sigma = tf.exp(z_logvar / 2.0)
        z = tf.contrib.distributions.Normal(loc=z_mu, scale=z_sigma)
        x = z.sample(1)[0]
        print("Z: {}".format(x.shape))
        self.z_mu = z_mu
        self.z_sigma = z_sigma
        self.z = z
        self.z_sample = x

    def build_decoder(z, is_training=self.is_training, output_shape=self.input_shape,
                      scopename="decoder", reuse=False):
        with tf.variable_scope(scopename, reuse=reuse) as scope:
            x = z
            x = tf.nn.relu(tf.layers.dense(x, 512, name='z_inv',
                                           kernel_initializer=U.normc_initializer(1.0)))
            print("A9: {}".format(x.shape))
            x = tf.nn.relu(tf.layers.dense(x, endconv_flat_shape[1], name='lin_inv',
                                           kernel_initializer=U.normc_initializer(1.0)))
            print("A8: {}".format(x.shape))
            x = tf.reshape(x, (-1, endconv_shape[1], endconv_shape[2], endconv_shape[3]))
            print("A7: {}".format(x.shape))
            # 'opposite' of average_pooling2d with stride
            # x = tf.image.resize_nearest_neighbor(x, (1*x.shape[1], 3*x.shape[2]), align_corners=True)
            x = tf.layers.conv2d_transpose(x, 64, (_3, 3), (_3, 3), activation=tf.nn.relu,
                                           padding="SAME", name="avgpool_inv")
            xres2 = x
            print("A6: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                tf.layers.conv2d_transpose(x, 64, (_3_else_2, 3), (1, 1),
                                           activation=tf.nn.relu, padding="SAME",
                                           name="l5_inv"),
                training=is_training)
            print("A5: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                tf.layers.conv2d_transpose(x, 64, (_3_else_2, 3), (1, 1),
                                           activation=tf.nn.relu, padding="SAME",
                                           name="l4_inv"),
                training=is_training)
            x = tf.nn.relu(x + xres2)
            xres = x
            print("A4: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                tf.layers.conv2d_transpose(x, 64, (_3, 3), (1, 1), activation=tf.nn.relu,
                                           padding="SAME", name="l3_inv"),
                training=is_training)
            print("A3: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                tf.layers.conv2d_transpose(x, 64, (_3, 3), (1, 1), activation=tf.nn.relu,
                                           padding="SAME", name="l2_inv"),
                training=is_training)
            print("A2: {}".format(x.shape))
            x = tf.nn.relu(x + xres)
            x = tf.layers.conv2d_transpose(x, 64, (_3, 3), (_3, 3), activation=tf.nn.relu,
                                           padding="SAME", name="maxpool_inv")
            print("A1: {}".format(x.shape))
            x = tf.layers.batch_normalization(
                tf.layers.conv2d_transpose(x, output_shape[2], (_7, 7), (_3, 3),
                                           activation=tf.nn.relu, padding="SAME",
                                           name="l1_inv"),
                training=is_training)
            print("A0: {}".format(x.shape))
            y = x
            return y

    self.y = build_decoder(self.z_sample)
    # This must be done before creating the pure decoder, or tf will expect z_in to be fed
    self.batch_norm_update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # Create a separate decoder network with the same variables, fed by placeholder
    # instead of the encoder, for off-training reconstruction
    self.reconstruction = build_decoder(self.z_in, reuse=True)

    # Losses
    with tf.variable_scope('reconstruction_loss'):
        self.avg_rec_abs_error = tf.reduce_mean(tf.abs(self.x_in - self.y),
                                                reduction_indices=[0, 1, 2])  # per channel
        # reconstruction_s_e = tf.square((self.x_in - self.y) / 255)  # square of normalized error
        reconstruction_s_e = tf.log(tf.cosh((self.x_in - self.y) / 255))  # log-cosh of normalized error
        if self.MASKS:
            # apply mask (W, H) to per-pixel error (Batch, W, H, Channels)
            reconstruction_s_e = tf.boolean_mask(reconstruction_s_e, self.mask)
        reconstruction_loss = tf.reduce_mean(reconstruction_s_e,
                                             reduction_indices=[1, 2, 3])  # per example
        self.reconstruction_loss = tf.reduce_mean(reconstruction_loss)  # average over batch

        # kl loss (reduce along z dimensions)
        kl_loss = -0.5 * tf.reduce_mean(
            (1 + z_logvar - tf.square(z_mu) - tf.exp(z_logvar)), reduction_indices=1)
        kl_loss = tf.maximum(kl_loss, self.kl_tolerance)  # kl_loss per example
        self.kl_loss = tf.reduce_mean(kl_loss)  # batch kl_loss
        self.loss = self.reconstruction_loss + self.kl_loss

    # add tensorboard summaries
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        variable_summaries(var)
    self.merged_summaries = tf.summary.merge_all()
    # A placeholder for adding arbitrary images to tensorboard
    self.image_tensor = tf.placeholder(name="image", dtype="float",
                                       shape=(None, 1000, 1000, 4))  # Batch, W, H, Channels
    self.image_summary = tf.summary.image("Reconstructions/val", self.image_tensor)
    self.image_tensor2 = tf.placeholder(name="image2", dtype="float",
                                        shape=(None, 1000, 1000, 4))  # Batch, W, H, Channels
    self.image_summary2 = tf.summary.image("Reconstructions/valtarget", self.image_tensor2)
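# Sanity-check sketch (numpy-only, not part of the original code) of the KL
# term used above. For a diagonal Gaussian against N(0, I), the KL per
# dimension is -0.5 * (1 + logvar - mu^2 - exp(logvar)); note the code
# averages over z dimensions (reduce_mean) rather than taking the textbook sum.
import numpy as np

def kl_to_standard_normal(z_mu, z_logvar):
    """Mean KL over latent dims per example, matching the TF expression."""
    return -0.5 * np.mean(1 + z_logvar - np.square(z_mu) - np.exp(z_logvar), axis=1)

mu, logvar = np.zeros((2, 4)), np.zeros((2, 4))
assert np.allclose(kl_to_standard_normal(mu, logvar), 0.0)  # N(0,1) vs N(0,1)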
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0, kind='small'):
    assert isinstance(ob_space, gym.spaces.Box)
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    # Network to compute value function and termination probabilities
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = x

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]
    self.vpred_ent = dense3D2(last_out, 1, "vffinal_ent", option, num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options, weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # Network to compute policy over options and intra-option policies
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Discrete):
    #     mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
    #                     num_options=num_options, weight_init=U.normc_initializer(0.01))
    #     logstd = tf.get_variable(name="logstd",
    #                              shape=[num_options, 1, pdtype.param_shape()[0] // 2],
    #                              initializer=tf.zeros_initializer())
    #     pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    # else:
    pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options,
                                       "OPfc%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, self.vpred_ent, last_out])
    self._get_logits = U.function([stochastic, ob, option], [self.pd.logits])
    self._get_v = U.function([ob, option], [self.vpred])
    self._get_v_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self.get_vpred_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
    self._get_op = U.function([ob], [self.op_pi])
def __init__(self, action_space, observation_space, scope, args):
    self.scope = scope
    self.EPSILON = 1e-8
    self.action_bound = [action_space.low, action_space.high]
    self.state_shape = observation_space[1].shape
    self.conv_state_shape = observation_space[0].shape
    DISCRETE = not args.continuous
    self.DIRECT_AGENT_OBS = len(observation_space) == 3
    if self.DIRECT_AGENT_OBS:
        self.relobst_state_shape = list(observation_space[2].shape)
        # clamp to a fixed number of relative obstacles
        self.MAX_N_REL_OBSTACLES = args.max_n_relative_obstacles
        if self.relobst_state_shape[1] > self.MAX_N_REL_OBSTACLES:
            raise ValueError("Can only handle up to {} dynamic obstacle states".format(
                self.MAX_N_REL_OBSTACLES))
        self.relobst_state_shape[1] = self.MAX_N_REL_OBSTACLES
        self.relobst_state_shape = tuple(self.relobst_state_shape)
    if DISCRETE:
        assert len(action_space.shape) == 2
        self.num_action_values = action_space.shape[1]
    else:
        assert len(action_space.shape) == 1
    self.action_names = ['u', 'v', 'theta']
    self.num_action = action_space.shape[0]
    self.cliprange = args.cliprange
    self.checkpoint_path = args.checkpoint_path
    if not os.path.exists(self.checkpoint_path):
        os.makedirs(self.checkpoint_path)
    self.environment = args.environment
    self.global_steps = tf.Variable(0, dtype=tf.int32, name='global_steps', trainable=False)
    self.inc_global_steps = self.global_steps.assign_add(1)

    ## Build net
    with tf.variable_scope('input'):
        self.s_conv = tf.placeholder(name="s_conv", dtype="float",
                                     shape=(None, ) + self.conv_state_shape)
        self.s = tf.placeholder(name="s", dtype="float", shape=(None, ) + self.state_shape)
        if self.DIRECT_AGENT_OBS:
            self.s_relobst = tf.placeholder(name="s_relobst", dtype="float",
                                            shape=(None, ) + self.relobst_state_shape)
    with tf.variable_scope('action'):
        if DISCRETE:
            self.a = tf.placeholder(name="a", shape=[None, self.num_action], dtype=tf.float32)
        else:
            self.a = tf.placeholder(name="a", shape=[None, self.num_action], dtype=tf.float32)
    with tf.variable_scope('target_returns'):
        self.target_returns = tf.placeholder(name="target_returns", shape=[None, 1],
                                             dtype=tf.float32)
    with tf.variable_scope('advantages'):
        self.advantage = tf.placeholder(name="advantage", shape=[None, 1], dtype=tf.float32)
    with tf.variable_scope('is_training'):
        self.is_training = tf.placeholder(tf.bool, name="is_training")
    with tf.variable_scope('entropy_coeff'):
        self.entropy_coeff = tf.placeholder(name="entropy_coeff", dtype=tf.float32)
    with tf.variable_scope('old_predicted_values'):
        self.old_value = tf.placeholder(name="old_value", shape=[None, 1], dtype=tf.float32)

    assert len(self.state_shape) == 1
    assert self.state_shape[0] == 5
    state_relgoal, state_vel = tf.split(self.s, [2, 3], axis=1)
    if self.DIRECT_AGENT_OBS:
        state_relobst = U.flattenallbut0(self.s_relobst)

    features_relgoal = tf.nn.relu(
        tf.layers.dense(state_relgoal, 32, name='s_relgoal_preproc',
                        kernel_initializer=U.normc_initializer(1.0)))
    features_vel = tf.nn.relu(
        tf.layers.dense(state_vel, 32, name='s_vel_preproc',
                        kernel_initializer=U.normc_initializer(1.0)))
    if self.DIRECT_AGENT_OBS:
        features_relobst = tf.nn.relu(
            tf.layers.dense(state_relobst, 32, name='s_relobst_preproc',
                            kernel_initializer=U.normc_initializer(1.0)))

    # batch normalization
    features_relgoal = tf.layers.batch_normalization(features_relgoal,
                                                     training=self.is_training)
    features_vel = tf.layers.batch_normalization(features_vel, training=self.is_training)
    if self.DIRECT_AGENT_OBS:
        features_relobst = tf.layers.batch_normalization(features_relobst,
                                                         training=self.is_training)
    self.batch_norm_update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    all_features = [features_relgoal, features_vel]
    if self.DIRECT_AGENT_OBS:
        all_features.append(features_relobst)
    x = tf.concat(all_features, axis=-1)
    x = tf.check_numerics(x, message="after concat")
    # x = tf.nn.relu(tf.layers.dense(x, 256, name='merged_lin', kernel_initializer=U.normc_initializer(1.0)))

    def build_critic_net(inputs, scope):
        with tf.variable_scope(scope):
            dl1 = tf.contrib.layers.fully_connected(inputs=inputs, num_outputs=128,
                                                    activation_fn=tf.nn.relu, scope='dl1')
            tf.summary.histogram("{}/dl1/output".format(scope), dl1)
            value = tf.contrib.layers.fully_connected(
                inputs=dl1, num_outputs=1, activation_fn=None,
                scope='value')  # [:, 0]  # initializer std 1.0
            tf.summary.histogram("{}/value/output".format(scope), value)
            tf.summary.scalar("{}/value/output_max".format(scope), tf.reduce_max(value))
            tf.summary.scalar("{}/value/output_min".format(scope), tf.reduce_min(value))
            tf.summary.scalar("{}/value/target_max".format(scope),
                              tf.reduce_max(self.target_returns))
            tf.summary.scalar("{}/value/target_min".format(scope),
                              tf.reduce_min(self.target_returns))
            return value

    self.value = build_critic_net(x, 'value_net')

    def build_actor_net(inputs, scope, trainable, CONTINUOUS):
        with tf.variable_scope(scope):
            # Hidden layer
            dl1 = tf.contrib.layers.fully_connected(inputs=inputs, num_outputs=256,
                                                    activation_fn=tf.nn.relu,
                                                    trainable=trainable, scope='dl1')
            # Output layer and distribution
            if not CONTINUOUS:
                action_logits = tf.contrib.layers.fully_connected(
                    inputs=dl1, num_outputs=self.num_action * self.num_action_values,
                    activation_fn=tf.nn.relu, trainable=trainable, scope='action_logits')
                action_logits = tf.reshape(action_logits,
                                           (-1, self.num_action, self.num_action_values))
                # Multinomial distribution (draw one out of num_action_values classes)
                # if 3 probs [0.4, 0.1, 0.5] and total_count = 1:
                #   sample(1) -> [1, 0, 0], or [0, 1, 0], or [0, 0, 1]
                #   prob([1, 0, 0]) -> 0.4
                # total_count is the number of draws per iteration, in this case 1 (single action)
                action_dist = tf.distributions.Categorical(logits=action_logits)
            else:
                mu = tf.contrib.layers.fully_connected(inputs=dl1,
                                                       num_outputs=self.num_action,
                                                       activation_fn=tf.nn.tanh, scope='mu')
                # adding epsilon here to prevent inf in normal distribution when sigma -> 0
                sigma = self.EPSILON + tf.contrib.layers.fully_connected(
                    inputs=dl1, num_outputs=self.num_action,
                    activation_fn=tf.nn.softplus, trainable=trainable, scope='sigma')
                action_dist = tf.contrib.distributions.Normal(loc=mu, scale=sigma)
            param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
            # tensorboard
            tf.summary.histogram("{}/dl1/output".format(scope), dl1)
            if not CONTINUOUS:
                action_outputs = tf.split(action_logits, self.num_action, axis=1)
                for action_name, out in zip(self.action_names, action_outputs):
                    tf.summary.histogram(
                        "{}/action_logits/output_{}".format(scope, action_name), out)
            else:
                mu_outputs = tf.split(mu, self.num_action, axis=-1)
                for action_name, out in zip(self.action_names, mu_outputs):
                    tf.summary.histogram("{}/mu/output_{}".format(scope, action_name), out)
                sigma_outputs = tf.split(sigma, self.num_action, axis=-1)
                for action_name, out in zip(self.action_names, sigma_outputs):
                    tf.summary.histogram("{}/sigma/output_{}".format(scope, action_name), out)
            # ---
            return action_dist, param

    pi, pi_param = build_actor_net(x, 'actor_net', trainable=True,
                                   CONTINUOUS=args.continuous)
    old_pi, old_pi_param = build_actor_net(x, 'old_actor_net', trainable=False,
                                           CONTINUOUS=args.continuous)
    self.syn_old_pi = [oldp.assign(p) for p, oldp in zip(pi_param, old_pi_param)]

    single_sample = tf.squeeze(pi.sample(1), axis=0)
    if DISCRETE:
        self.sample_op = single_sample  # one_hot
        self.best_action_op = tf.one_hot(
            tf.argmax(tf.squeeze(pi.probs, axis=0), axis=-1),
            self.num_action_values)  # one_hot
    else:
        self.sample_op = tf.clip_by_value(single_sample, self.action_bound[0][0],
                                          self.action_bound[1][0])
        self.best_action_op = tf.clip_by_value(pi.mean(), self.action_bound[0][0],
                                               self.action_bound[1][0])
    # tensorboard
    single_sample_outputs = tf.split(single_sample, self.num_action, axis=1)
    for action_name, out in zip(self.action_names, single_sample_outputs):
        tf.summary.histogram("ActionDistribution/single_sample_{}".format(action_name), out)

    # Losses
    with tf.variable_scope('critic_loss'):
        diff_ypred_y = self.target_returns - self.value
        self.critic_loss_ = tf.square(diff_ypred_y)
        CLIP_VALUE_OPTIM = True
        if CLIP_VALUE_OPTIM:
            valueclipped = self.old_value + tf.clip_by_value(
                self.value - self.old_value, -self.cliprange, self.cliprange)
            self.clipped_critic_loss = tf.square(self.target_returns - valueclipped)
            self.critic_loss_ = tf.maximum(self.critic_loss_, self.clipped_critic_loss)
        self.critic_loss = tf.reduce_mean(self.critic_loss_)
        self.critic_loss = tf.check_numerics(self.critic_loss, message="after critic_loss")

    with tf.variable_scope('actor_loss'):
        self.entropy = pi.entropy()
        batch_entropy = tf.reduce_mean(self.entropy)
        ratio = pi.prob(self.a) / (old_pi.prob(self.a) + self.EPSILON)
        # ratio = tf.exp(pi.log_prob(self.a) - old_pi.log_prob(self.a))  # new / old
        pg_losses = -self.advantage * ratio
        pg_losses2 = -self.advantage * tf.clip_by_value(ratio, 1.0 - self.cliprange,
                                                        1.0 + self.cliprange)
        self.actor_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) \
            - batch_entropy * self.entropy_coeff
        self.actor_loss = tf.check_numerics(self.actor_loss, message="after actor_loss")

    # diagnostics
    # if args.continuous:  # entropy is not implemented for the multinomial distribution
    if True:
        self.kl = tf.distributions.kl_divergence(pi, old_pi)
        tf.summary.histogram("Diagnostics/KL", self.kl)
        tf.summary.scalar("Diagnostics/MinibatchAvgKL", tf.reduce_mean(self.kl))
        tf.summary.histogram("Diagnostics/Entropy", self.entropy)
        tf.summary.scalar("Diagnostics/MinibatchAvgEntropy", batch_entropy)

    # explained variance: 1 = perfect, 0-1 good, 0 = might as well have predicted 0,
    # < 0 worse than predicting 0
    def reduce_variance(x):
        """Variance over all elements (reduces every dim, returning a scalar)."""
        means = tf.reduce_mean(x, keepdims=True)
        sqdev = tf.square(x - means)
        return tf.reduce_mean(sqdev)

    self.ev = 1 - reduce_variance(diff_ypred_y) / reduce_variance(self.target_returns)
    tf.summary.scalar("Diagnostics/MinibatchExplainedVariance", self.ev)

    # add tensorboard summaries
    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        variable_summaries(var)
    self.merged_summaries = tf.summary.merge_all()
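# Minimal numpy sketch (illustration only) of the PPO clipped surrogate
# computed in `actor_loss` above: L = mean(max(-A*r, -A*clip(r, 1-eps, 1+eps)))
# with r = pi/pi_old.
import numpy as np

def ppo_clip_loss(ratio, advantage, cliprange=0.2):
    pg1 = -advantage * ratio
    pg2 = -advantage * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(pg1, pg2))

# With a positive advantage, pushing the ratio past 1 + cliprange earns no
# extra improvement; that plateau is the clipping at work.
assert ppo_clip_loss(np.array([1.5]), np.array([1.0])) == \
       ppo_clip_loss(np.array([1.2]), np.array([1.0]))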
def _init(self, ob_space, ac_space, architecture_size):
    """
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param architecture_size: (str) size of the policy's architecture
        (small as in A3C paper, large as in Nature DQN)
    """
    obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

    with tf.variable_scope(self.name, reuse=self.reuse):
        normalized_obs = obs / 255.0
        if architecture_size == 'small':  # from A3C paper
            layer_1 = tf.nn.relu(
                tf_util.conv2d(normalized_obs, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            layer_2 = tf.nn.relu(
                tf_util.conv2d(layer_1, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            flattened_layer_2 = tf_util.flattenallbut0(layer_2)
            last_layer = tf.nn.relu(
                tf.layers.dense(flattened_layer_2, 256, name='lin',
                                kernel_initializer=tf_util.normc_initializer(1.0)))
        elif architecture_size == 'large':  # Nature DQN
            layer_1 = tf.nn.relu(
                tf_util.conv2d(normalized_obs, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            layer_2 = tf.nn.relu(
                tf_util.conv2d(layer_1, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            layer_3 = tf.nn.relu(
                tf_util.conv2d(layer_2, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            flattened_layer_3 = tf_util.flattenallbut0(layer_3)
            last_layer = tf.nn.relu(
                tf.layers.dense(flattened_layer_3, 512, name='lin',
                                kernel_initializer=tf_util.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = tf.layers.dense(last_layer, pdtype.param_shape()[0], name='logits',
                                 kernel_initializer=tf_util.normc_initializer(0.01))
        self.proba_distribution = pdtype.proba_distribution_from_flat(logits)
        self.vpred = tf.layers.dense(last_layer, 1, name='value',
                                     kernel_initializer=tf_util.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    if self.stochastic_ph is None:
        self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
    action = self.proba_distribution.sample()
    self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred])
def __build_graph(self, ob_space, ac_space, gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(ac_space)
    assert not isinstance(ob_space, gym.spaces.tuple.Tuple)

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))
    ob_g, ob_l = tf.split(ob, 2, axis=1)
    ob_g = tf.squeeze(ob_g, axis=1) - 128.0
    ob_l = tf.squeeze(ob_l, axis=1) - 128.0

    # Conv layers (global image)
    net = slim.convolution(ob_g, 96, [7, 7], 2, padding='VALID', scope='conv1',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4 * 1, beta=0.75)
    net = slim.pool(net, [3, 3], 'MAX', stride=2, padding='VALID', scope='pool1')
    net = slim.convolution(net, 256, [5, 5], 2, padding='VALID', scope='conv2',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4 * 1, beta=0.75)
    net = slim.pool(net, [3, 3], 'MAX', stride=2, padding='VALID', scope='pool2')
    net = slim.convolution(net, 512, [3, 3], 1, padding='VALID', scope='conv3',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)
    net_g = slim.convolution(net, 512, [3, 3], 1, padding='VALID', scope='conv4',
                             activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)

    # Conv layers (local image; same scopes, so weights are shared via AUTO_REUSE)
    net = slim.convolution(ob_l, 96, [7, 7], 2, padding='VALID', scope='conv1',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4 * 1, beta=0.75)
    net = slim.pool(net, [3, 3], 'MAX', stride=2, padding='VALID', scope='pool1')
    net = slim.convolution(net, 256, [5, 5], 2, padding='VALID', scope='conv2',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)
    net = tf.nn.lrn(net, depth_radius=5, bias=2, alpha=1e-4 * 1, beta=0.75)
    net = slim.pool(net, [3, 3], 'MAX', stride=2, padding='VALID', scope='pool2')
    net = slim.convolution(net, 512, [3, 3], 1, padding='VALID', scope='conv3',
                           activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)
    net_l = slim.convolution(net, 512, [3, 3], 1, padding='VALID', scope='conv4',
                             activation_fn=tf.nn.relu, reuse=tf.AUTO_REUSE)

    # Concat features
    self.feat = feat = tf.concat([U.flattenallbut0(net_g), U.flattenallbut0(net_l)], 1)

    # fcs_actor
    net = slim.fully_connected(feat, 512, scope='polfc1', activation_fn=tf.nn.relu)
    # pdparam = slim.fully_connected(net, 4, scope='polfc2', activation_fn=None)
    mean = slim.fully_connected(net, pdtype.param_shape()[0] // 2, scope='polfc2',
                                activation_fn=None)
    logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                             initializer=tf.zeros_initializer())
    pdparam = tf.concat([mean, logstd], axis=1)
    self.pd = pdtype.pdfromflat(pdparam)

    # fcs_value
    net = slim.fully_connected(feat, 512, scope='vffc1', activation_fn=tf.nn.relu)
    self.vpred = slim.fully_connected(net, 1, scope='vffc2', activation_fn=None)

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers, kind):
    assert isinstance(ob_space, gym.spaces.Box)
    assert isinstance(sensor_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    ob_sensor = U.get_placeholder(name="ob_sensor", dtype=tf.float32,
                                  shape=[sequence_length] + list(sensor_space.shape))

    ## Obfilter on sensor output
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=sensor_space.shape)
    obz_sensor = tf.clip_by_value(
        (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    ## Adapted from mlp_policy
    last_out = obz_sensor
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            tf.layers.dense(last_out, hid_size, name="vffc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
    y = tf.layers.dense(last_out, 64, name="vffinal",
                        kernel_initializer=U.normc_initializer(1.0))

    # y = ob_sensor
    # y = obz_sensor
    # y = tf.nn.relu(U.dense(y, 64, 'lin_ob', U.normc_initializer(1.0)))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 64, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    print(x.shape, y.shape)

    x = tf.concat([x, y], 1)

    ## Saver
    # self.saver = tf.train.Saver()

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name="logits",
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(
        x, 1, name="value", kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob, ob_sensor], [ac, self.vpred, logits])
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    ob_2 = U.get_placeholder(name="ob_2", dtype=tf.float32,
                             shape=[sequence_length] + [5])  # observations to feed in after convolutions
    ob_2_fc = tf.nn.relu(
        tf.layers.dense(ob_2, 64, name='s2_preproc',
                        kernel_initializer=U.normc_initializer(1.0)))

    x = ob / 25.
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [2, 8], [1, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [2, 4], [1, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        x = tf.concat([x, ob_2_fc], axis=-1)
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [2, 8], [1, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [2, 4], [1, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [2, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        x = tf.concat([x, ob_2_fc], axis=-1)
    else:
        raise NotImplementedError

    x = tf.nn.relu(
        tf.layers.dense(x, 256, name='merged_lin',
                        kernel_initializer=U.normc_initializer(1.0)))

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits',
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(
        x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob, ob_2], [ac, self.vpred])
def _init(self, sensor_name, sensor_shape, ac_space, measure_name, measure_shape,
          init_std=1.0):
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    self.sensor = utils.get_placeholder(name=sensor_name, dtype=tf.float32,
                                        shape=[sequence_length] + list(sensor_shape))
    self.measure = utils.get_placeholder(name=measure_name, dtype=tf.float32,
                                         shape=[sequence_length] + list(measure_shape))

    with tf.variable_scope("measurefilter"):
        self.ms_rms = RunningMeanStd(shape=measure_shape)

    obscaled = self.sensor / 255.0
    m = tf.clip_by_value((self.measure - self.ms_rms.mean) / self.ms_rms.std, -5.0, 5.0)

    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(utils.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(utils.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        m = tf.nn.tanh(
            tf.layers.dense(m, 32, name="fc1",
                            kernel_initializer=utils.normc_initializer(1.0)))
        x = utils.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 128, name='lin',
                            kernel_initializer=utils.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(x, 1, name='value',
                                     kernel_initializer=utils.normc_initializer(1.0))
        self.vpredz = self.vpred

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(utils.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(utils.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = utils.flattenallbut0(x)
        x = tf.nn.relu(
            tf.layers.dense(x, 128, name='lin',
                            kernel_initializer=utils.normc_initializer(1.0)))

        self.action_dim = ac_space.shape[0]
        self.dist_diagonal = True
        self.varphi = x
        self.varphi_dim = 128

        stddev_init = np.ones([1, self.action_dim]) * init_std
        prec_init = 1. / (np.multiply(stddev_init, stddev_init))  # 1 x |a|
        self.prec = tf.get_variable(name="prec", shape=[1, self.action_dim],
                                    initializer=tf.constant_initializer(prec_init))
        kt_init = np.ones([self.varphi_dim, self.action_dim]) * 0.5 / self.varphi_dim
        ktprec_init = kt_init * prec_init
        self.ktprec = tf.get_variable(name="ktprec",
                                      shape=[self.varphi_dim, self.action_dim],
                                      initializer=tf.constant_initializer(ktprec_init))
        kt = tf.divide(self.ktprec, self.prec)
        mean = tf.matmul(x, kt)
        logstd = tf.log(tf.sqrt(1. / self.prec))

    self.prec_get_flat = utils.GetFlat([self.prec])
    self.prec_set_from_flat = utils.SetFromFlat([self.prec])
    self.ktprec_get_flat = utils.GetFlat([self.ktprec])
    self.ktprec_set_from_flat = utils.SetFromFlat([self.ktprec])

    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    self.scope = tf.get_variable_scope().name
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = utils.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = utils.function([stochastic, self.sensor], [ac, self.vpred])

    # Get all policy parameters
    vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + '/pol')
    # Remove log-linear parameters ktprec and prec to get only non-linear parameters
    del vars[-1]
    del vars[-1]
    beta_params = vars

    # Flat w_beta
    beta_len = np.sum([np.prod(p.get_shape().as_list()) for p in beta_params])
    w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

    # Unflatten w_beta
    beta_shapes = list(map(tf.shape, beta_params))
    w_beta_unflat_var = self.unflatten_tensor_variables(w_beta_var, beta_shapes)

    # w_beta^T * \grad_beta \varphi(s)^T
    v = tf.placeholder(dtype=self.varphi.dtype, shape=self.varphi.get_shape(),
                       name="v_in_Rop")
    features_beta = self.alternative_Rop(self.varphi, beta_params, w_beta_unflat_var, v)
    self.features_beta = utils.function([self.sensor, w_beta_var, v], features_beta)
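# Numpy sketch (illustration only) of the log-linear Gaussian parameterization
# above: the policy stores a precision `prec` (= 1/std^2) and the product
# `ktprec = K * prec`, recovering mean = varphi(s) @ (ktprec / prec) and
# std = sqrt(1 / prec), which matches logstd = log(sqrt(1 / prec)).
import numpy as np

varphi = np.random.randn(1, 128)           # nonlinear features varphi(s)
prec = np.full((1, 3), 4.0)                # precision 4 -> std 0.5
ktprec = np.random.randn(128, 3) * prec    # stored log-linear parameters
mean = varphi @ (ktprec / prec)            # tf.matmul(x, kt) with kt = ktprec / prec
std = np.sqrt(1.0 / prec)
assert mean.shape == (1, 3) and np.allclose(std, 0.5)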