def build_graph(self):
    self.observation = tf.placeholder(
        tf.float32, [None] + list(self.env.observation_space.shape),
        name='inputs')
    # out = self.observation
    out = U.dense(self.observation, 10, 'layer1',
                  weight_init=tf.contrib.layers.xavier_initializer(),
                  bias=True, activation=tf.nn.tanh, summary=self.summary)
    out = U.dense(out, 10, 'layer2',
                  weight_init=tf.contrib.layers.xavier_initializer(),
                  bias=True, activation=tf.nn.tanh, summary=self.summary)
    activation = tf.nn.tanh if self.env.continuous else None
    self.actions = U.dense(out, self.num_actions, 'output',
                           weight_init=tf.contrib.layers.xavier_initializer(),
                           bias=True, activation=activation,
                           summary=self.summary)
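# Evaluation sketch (editor's addition; `agent` and `sess` are hypothetical,
# not part of the original class). For a continuous env the tanh head keeps
# actions in [-1, 1], so they would typically be rescaled to the env's action
# bounds before stepping:
#
#   a = sess.run(agent.actions, feed_dict={agent.observation: obs[None]})[0]
#   low, high = env.action_space.low, env.action_space.high
#   a = low + (a + 1.) * 0.5 * (high - low)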
def deconv_net(scope, latent_variable):
    with tf.variable_scope(scope):
        x = latent_variable
        x = U.dense(x, 2048, 'l3', U.normc_initializer(1.0))
        x = tf.nn.relu(U.dense(x, 128 * 8 * 11, 'l4', U.normc_initializer(1.0)))
        x = tf.reshape(x, [tf.shape(x)[0], 8, 11, 128])  # Unflatten
        x = tf.nn.relu(
            U.conv2d_transpose(x, [4, 4, 128, 128], [tf.shape(x)[0], 19, 25, 128],
                               "uc1", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(
            U.conv2d_transpose(x, [6, 6, 128, 128], [tf.shape(x)[0], 38, 50, 128],
                               "uc2", [6, 6], [2, 2], pad="SAME"))
        x = tf.nn.relu(
            U.conv2d_transpose(x, [6, 6, 128, 128], [tf.shape(x)[0], 80, 105, 128],
                               "uc3", [6, 6], [2, 2], pad="VALID"))
        x = U.conv2d_transpose(x, [8, 8, 1, 128], [tf.shape(x)[0], 160, 210, 1],
                               "uc4", [8, 8], [2, 2], pad="SAME")
        return x
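# Sanity check (editor's sketch, pure Python, not part of the original module):
# each conv2d_transpose output shape above must map back to its input shape
# under the corresponding forward convolution -- ceil(out / stride) for
# pad="SAME", (out - kernel) // stride + 1 for pad="VALID". Note that "VALID"
# leaves the transpose output size ambiguous up to stride - 1, which is why
# deconv_net passes explicit output shapes.
import math

def inverts_forward_conv(out_hw, in_hw, kernel, stride, pad):
    """True if a transpose conv mapping in_hw -> out_hw is shape-consistent."""
    for o, i, k, s in zip(out_hw, in_hw, kernel, stride):
        fwd = math.ceil(o / s) if pad == "SAME" else (o - k) // s + 1
        if fwd != i:
            return False
    return True

assert inverts_forward_conv((19, 25), (8, 11), (4, 4), (2, 2), "VALID")     # uc1
assert inverts_forward_conv((38, 50), (19, 25), (6, 6), (2, 2), "SAME")     # uc2
assert inverts_forward_conv((80, 105), (38, 50), (6, 6), (2, 2), "VALID")   # uc3
assert inverts_forward_conv((160, 210), (80, 105), (8, 8), (2, 2), "SAME")  # uc4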
def build(self, image, mos_score):
    net = tf.reshape(image, [-1, 32, 32, 3])
    net = self.block(net, 32)
    net = self.block(net, 64)
    net = self.block(net, 128)
    net = self.block(net, 256)
    net = self.block(net, 512)
    # Head 1: per-patch quality score.
    net1 = tf.reshape(net, (-1, 512))
    net1 = U.dense(net1, 512, 'fc1')
    net1 = U.swish(net1)
    net1 = tf.nn.dropout(net1, keep_prob=self.prob)
    net1 = U.dense(net1, 1, 'fc2')
    # Head 2: per-patch weight, kept strictly positive.
    net2 = tf.reshape(net, (-1, 512))
    net2 = U.dense(net2, 512, 'fc1_weight')
    net2 = U.swish(net2)
    net2 = tf.nn.dropout(net2, keep_prob=self.prob)
    net2 = U.dense(net2, 1, 'fc2_weight')
    net2 = tf.nn.relu(net2) + 1e-6
    self.loss_op = self.weighted_loss(net1, net2, mos_score)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.minimize(self.loss_op)
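# A plausible definition of weighted_loss (editor's sketch; the original
# implementation is not shown here). Following the common weighted-average
# patch-pooling recipe for IQA, each patch score in net1 is pooled with its
# learned positive weight in net2, and the pooled score is regressed onto the
# MOS target:
def weighted_loss(self, patch_scores, patch_weights, mos_score):
    # Weighted average over patches; patch_weights > 0 by construction above.
    pooled = (tf.reduce_sum(patch_weights * patch_scores)
              / tf.reduce_sum(patch_weights))
    return tf.reduce_mean(tf.abs(pooled - mos_score))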
def build_model(self):
    with tf.variable_scope('dense') as scope:
        d1 = tf.nn.relu(
            tf_util.dense(name='d1', x=self.state,
                          weight_init=tf_util.normc_initializer(),
                          size=self.net_param['d1']))
        d2 = tf.nn.relu(
            tf_util.dense(name='d2', x=d1,
                          weight_init=tf_util.normc_initializer(),
                          size=self.net_param['d2']))
        d3 = tf.nn.relu(
            tf_util.dense(name='d3', x=d2,
                          weight_init=tf_util.normc_initializer(),
                          size=self.net_param['d3']))
        self.action = tf.tanh(
            tf_util.dense(name='out', x=d3,
                          weight_init=tf_util.normc_initializer(),
                          size=self.action_size))
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    #obz = ob
    #with tf.variable_scope("obfilter"):
    #    self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = ob
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = ob
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer)
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
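# Typical wrapper around the _act function built above (editor's sketch,
# mirroring the usual baselines-style policy interface): ob[None] adds the
# batch dimension the "ob" placeholder expects, and stochastic selects between
# sampling from the distribution and taking its mode.
def act(self, stochastic, ob):
    ac, vpred = self._act(stochastic, ob[None])
    return ac[0], vpred[0]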
def encoder_net(self, img, latent_dim):
    x = img
    x = tf.nn.relu(U.dense(x, 1200, 'l1', U.normc_initializer(1.0)))
    x = tf.nn.relu(U.dense(x, 1200, 'l2', U.normc_initializer(1.0)))
    mu = U.dense(x, latent_dim, 'l3_1', U.normc_initializer(1.0))      # 32
    logvar = U.dense(x, latent_dim, 'l3_2', U.normc_initializer(1.0))  # 32
    return mu, logvar
def decoder_net(self, latent_variable):
    x = latent_variable
    x = tf.nn.tanh(U.dense(x, 1200, 'l4', U.normc_initializer(1.0)))
    x = tf.nn.tanh(U.dense(x, 1200, 'l5', U.normc_initializer(1.0)))
    x = tf.nn.tanh(U.dense(x, 1200, 'l6', U.normc_initializer(1.0)))
    x_logit = U.dense(x, 4096, 'l7', U.normc_initializer(1.0))
    x_mean = tf.nn.sigmoid(x_logit)
    return x_logit, x_mean
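# How the encoder/decoder pair above would typically be wired together
# (editor's sketch; build_vae_loss is hypothetical and not part of the
# original class). z is drawn with the reparameterization trick,
# z = mu + exp(0.5 * logvar) * eps with eps ~ N(0, I), and the negative ELBO
# combines a Bernoulli reconstruction term with the analytic Gaussian KL.
def build_vae_loss(self, img, latent_dim):
    mu, logvar = self.encoder_net(img, latent_dim)
    eps = tf.random_normal(tf.shape(mu))
    z = mu + tf.exp(0.5 * logvar) * eps  # reparameterized sample
    x_logit, _x_mean = self.decoder_net(z)
    recon = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=img, logits=x_logit),
        axis=1)
    kl = -0.5 * tf.reduce_sum(
        1. + logvar - tf.square(mu) - tf.exp(logvar), axis=1)
    return tf.reduce_mean(recon + kl)  # negative ELBO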
def _make_net(self, o):
    # Process observation.
    if self.connection_type == 'ff':
        x = o
        for ilayer, hd in enumerate(self.hidden_dims):
            x = self.nonlin(
                U.dense(x, hd, 'l{}'.format(ilayer), U.normc_initializer(1.0)))
    else:
        raise NotImplementedError(self.connection_type)

    # Map to action.
    adim = self.ac_space.shape[0]
    ahigh = self.ac_space.high
    alow = self.ac_space.low
    assert isinstance(self.ac_bins, str)
    ac_bin_mode, ac_bin_arg = self.ac_bins.split(':')

    if ac_bin_mode == 'uniform':
        # Uniformly spaced bins, from ac_space.low to ac_space.high.
        num_ac_bins = int(ac_bin_arg)
        aidx_na = bins(x, adim, num_ac_bins, 'out')
        ac_range_1a = (ahigh - alow)[None, :]
        a = (1. / (num_ac_bins - 1.) * tf.to_float(aidx_na) * ac_range_1a
             + alow[None, :])
    elif ac_bin_mode == 'custom':
        # Custom bins specified as a list of values from -1 to 1.
        # The bins are rescaled to ac_space.low to ac_space.high.
        acvals_k = np.array(list(map(float, ac_bin_arg.split(','))),
                            dtype=np.float32)
        logger.info('Custom action values: ' +
                    ' '.join('{:.3f}'.format(x) for x in acvals_k))
        assert acvals_k.ndim == 1 and acvals_k[0] == -1 and acvals_k[-1] == 1
        acvals_ak = ((ahigh - alow)[:, None] / (acvals_k[-1] - acvals_k[0]) *
                     (acvals_k - acvals_k[0])[None, :] + alow[:, None])
        aidx_na = bins(x, adim, len(acvals_k), 'out')  # Values in [0, k-1].
        a = tf.gather_nd(
            acvals_ak,
            tf.concat([
                tf.tile(np.arange(adim)[None, :, None],
                        [tf.shape(aidx_na)[0], 1, 1]),
                tf.expand_dims(aidx_na, -1)
            ], 2)  # (n, a, 2)
        )  # (n, a)
    elif ac_bin_mode == 'continuous':
        a = U.dense(x, adim, 'out', U.normc_initializer(0.01))
    else:
        raise NotImplementedError(ac_bin_mode)
    return a
def decoder_net(self, latent_variable):
    x = latent_variable
    x = U.dense(x, 256, 'l2', U.normc_initializer(1.0))
    x = tf.nn.relu(U.dense(x, 1024, 'l3', U.normc_initializer(1.0)))
    x = tf.reshape(x, [tf.shape(x)[0], 4, 4, 64])  # Unflatten to [4, 4, 64]
    x = tf.nn.relu(U.conv2d_transpose(x, [4, 4, 64, 64], [tf.shape(x)[0], 8, 8, 64],
                                      "uc1", [2, 2], pad="SAME"))    # [8, 8, 64]
    x = tf.nn.relu(U.conv2d_transpose(x, [4, 4, 32, 64], [tf.shape(x)[0], 16, 16, 32],
                                      "uc2", [2, 2], pad="SAME"))    # [16, 16, 32]
    x = tf.nn.relu(U.conv2d_transpose(x, [4, 4, 32, 32], [tf.shape(x)[0], 32, 32, 32],
                                      "uc3", [2, 2], pad="SAME"))    # [32, 32, 32]
    x = U.conv2d_transpose(x, [4, 4, 3, 32], [tf.shape(x)[0], 64, 64, 3],
                           "uc4", [2, 2], pad="SAME")                # [64, 64, 3]
    return x
def reconstruct_fc(caps):
    with tf.variable_scope("reconstruction"):
        init = U.normc_initializer(0.1)
        fc1 = U.dense(caps, 512, name="fc1", weight_init=init)
        fc1_act = tf.nn.relu(fc1)
        fc2 = U.dense(fc1_act, 1024, name="fc2", weight_init=init)
        fc2_act = tf.nn.relu(fc2)
        fc3 = U.dense(fc2_act, 784, name="fc3", weight_init=init)
        fc3_act = tf.nn.sigmoid(fc3)
        return fc3_act
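# How reconstruct_fc is typically used (editor's sketch, following the CapsNet
# recipe; reconstruction_loss is hypothetical): `caps` is the masked capsule
# vector of the target class, and the 784-way sigmoid output is compared
# against the flattened 28x28 input with a down-weighted sum-of-squares loss.
def reconstruction_loss(caps, images, scale=0.0005):
    recon = reconstruct_fc(caps)          # (batch, 784), values in [0, 1]
    flat = tf.reshape(images, [-1, 784])  # flatten 28x28 inputs
    return scale * tf.reduce_sum(tf.square(recon - flat))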
def proj_net(scope, img, latent_dim):
    with tf.variable_scope(scope):
        x = img
        x = tf.nn.relu(U.conv2d(x, 64, "c1", [8, 8], [2, 2], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 128, "c2", [6, 6], [2, 2], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 128, "c3", [6, 6], [2, 2], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 128, "c4", [4, 4], [2, 2], pad="SAME"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 2048, 'l1', U.normc_initializer(1.0)))
        x = U.dense(x, latent_dim, 'l2', U.normc_initializer(1.0))
        return x
def encoder_net(self, img, latent_dim):
    x = img
    x = tf.nn.relu(U.conv2d(x, 32, "c1", [4, 4], [2, 2], pad="SAME"))  # [32, 32, 32]
    x = tf.nn.relu(U.conv2d(x, 32, "c2", [4, 4], [2, 2], pad="SAME"))  # [16, 16, 32]
    x = tf.nn.relu(U.conv2d(x, 64, "c3", [4, 4], [2, 2], pad="SAME"))  # [8, 8, 64]
    x = tf.nn.relu(U.conv2d(x, 64, "c4", [4, 4], [2, 2], pad="SAME"))  # [4, 4, 64]
    x = U.flattenallbut0(x)                                            # [1024]
    x = tf.nn.relu(U.dense(x, 256, 'l1', U.normc_initializer(1.0)))    # 256
    mu = U.dense(x, latent_dim, 'l1_1', U.normc_initializer(1.0))      # 32
    logvar = U.dense(x, latent_dim, 'l1_2', U.normc_initializer(1.0))  # 32
    return mu, logvar
def _build_graph(self):
    self.output = self.input
    if self.train_config.hidden_sizes:
        for i, hidden_size in enumerate(self.train_config.hidden_sizes):
            self.output = tf_util.dense(self.output, hidden_size,
                                        "hidden_{}".format(i))
            self.output = tf.nn.relu(self.output)
            if self.train_config.dropout_rate > 0:
                self.output = tf_util.dropout(
                    self.output, 1.0 - self.train_config.dropout_rate,
                    self.is_training_phase)
    self.output = tf_util.dense(self.output, self.output_size, "last_layer")
def classifier_net(self, z1, z2, feat_size, latent_dim, cls_L,
                   cls_batch_per_gpu):
    with tf.variable_scope("classifier") as scope:
        z1 = tf.reshape(z1, (cls_batch_per_gpu, -1, latent_dim))
        z2 = tf.reshape(z2, (cls_batch_per_gpu, -1, latent_dim))
        warn("z1: {}".format(np.shape(z1)))
        z_diff = U.sum(z1 - z2, axis=1) / cls_L
        warn("z_diff: {}".format(np.shape(z_diff)))
        x = U.dense(z_diff, feat_size, 'cls_fc1', U.normc_initializer(1.0))
        return x
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    sy_ob = U.get_placeholder(name="sy_ob", dtype=tf.float32,
                              shape=[sequence_length] + list(ob_space.shape))
    obscaled = sy_ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits",
                         U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    sy_ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, sy_ob], [sy_ac, self.vpred])
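# Note on the "# XXX" above (editor's comment): the stochastic placeholder is
# fed into _act but never used, so sy_ac always samples. To honor the flag the
# way the MLP policy earlier in this section does, the sampling line would be:
#
#   sy_ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())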
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        def run_exp(func, returns, observations, actions, bc=False):
            for i in range(args.num_rollouts):
                # print('iter', i)
                obs = env.reset()
                done = False
                totalr = 0.
                steps = 0
                while not done:
                    observations.append(obs)
                    action = func(obs[None, :])
                    #if steps % 1000 == 0:
                    #    print(type(action))
                    obs, r, done, _ = env.step(action)
                    if bc:
                        # DAgger labeling: the expert relabels the state the
                        # learned policy just visited.
                        action = policy_fn(obs[None, :])
                        # obs, r, done, _ = env.step(action)
                    actions.append(action)
                    totalr += r
                    steps += 1
                    if args.render:
                        env.render()
                    #if steps % 100 == 0:
                    #    print("%i/%i" % (steps, max_steps))
                    if steps >= max_steps:
                        break
                returns.append(totalr)
                # print('returns', returns)
            print('mean return', np.mean(returns))
            print('std of return', np.std(returns))

        returns0 = []
        observations0 = []
        actions0 = []
        print("running expert steps")
        run_exp(policy_fn, returns0, observations0, actions0)
        expert_data = {
            'observations': np.array(observations0),
            'actions': np.array(actions0)
        }
        actions1 = expert_data['actions']
        actions_size = actions1.shape[0]
        actions_dims = actions1.shape[1] * actions1.shape[2]
        expert_data['actions'] = np.reshape(expert_data['actions'],
                                            (actions_size, actions_dims))

        # setting up models
        print(expert_data['observations'].shape, expert_data['actions'].shape)
        inputs = tf_util.get_placeholder(
            'inputs', tf.float32, [None, expert_data['observations'].shape[1]])
        labels = tf_util.get_placeholder('labels', tf.float32,
                                         [None, actions_dims])

        # models
        name = args.envname
        d1 = tf_util.dense(inputs, 128, 'd1')
        d2 = tf_util.dense(inputs, 128, 'd2')  # note: d2 is unused; d3 is built from d1
        # d2 = tf_util.dropout(d1, 0.95)
        d3 = tf_util.wndense(d1, 128, 'd3')
        pred = tf_util.densenobias(d3, actions_dims, 'output')
        #print(type(expert_data['actions']), type(pred))
        loss_func = tf.losses.mean_squared_error(labels, pred)
        loss = tf.reduce_mean(loss_func)
        optimizer = tf.train.AdamOptimizer().minimize(loss)

        # evaluations
        tf_util.initialize()

        # grid search parameters
        def train_model(x, y):
            for i in range(args.num_rollouts):
                ls = 0
                batch_size = int(actions_size / 4)
                batch_num = int(actions_size / batch_size)
                for j in range(batch_num):
                    start = batch_size * j
                    end = start + batch_size
                    op_eval, ls_current = tf_util.eval(
                        [optimizer, loss],
                        {inputs: x[start:end], labels: y[start:end]})
                    # print('batch ', j, ls_current)
                    ls += ls_current
                #print('iter ', i, ls.shape)

        def model_eval(obs):
            p = tf_util.eval([pred], {inputs: obs})
            return np.array(p)

        print("running behaviour cloning")
        train_model(expert_data['observations'], expert_data['actions'])
        run_exp(model_eval, [], [], [])

        print("running DAgger")
        for i in range(args.num_rollouts):
            # for i in range(10):
            print(len(observations0), len(actions0))
            run_exp(model_eval, [], observations0, actions0, True)
            expert_data = {
                'observations': np.array(observations0),
                'actions': np.array(actions0)
            }
            expert_data['actions'] = np.reshape(
                expert_data['actions'],
                (expert_data['actions'].shape[0], actions_dims))
            train_model(expert_data['observations'], expert_data['actions'])
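# Invocation sketch (editor's addition; the script name and expert policy path
# are hypothetical): the script first rolls out the expert, then behaviour
# cloning, then DAgger, reporting mean/std return after each phase.
#
#   python run_clone.py experts/Hopper-v1.pkl Hopper-v1 --num_rollouts 20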
def _make_net(self, o):
    o_cnn = o[:, :-3]
    o_self = o[:, -3:]
    x_cnn = tf.reshape(o_cnn, [-1, 9, 9, 9])
    x_cnn = tf.layers.conv2d(x_cnn, 32, 3, data_format='channels_last',
                             name='cnn1', activation=self.nonlin)
    #x_cnn = tf.layers.batch_normalization(x_cnn, axis=3)
    x_cnn = tf.layers.conv2d(x_cnn, 32, 3, data_format='channels_last',
                             name='cnn2', activation=self.nonlin)
    #x_cnn = tf.layers.batch_normalization(x_cnn, axis=3)
    x_cnn = tf.layers.conv2d(x_cnn, 32, 3, data_format='channels_last',
                             name='cnn3', activation=self.nonlin)
    #x_cnn = tf.layers.batch_normalization(x_cnn, axis=3)
    x_cnn = tf.reshape(x_cnn, [-1, 288])
    #x_cnn = tf.Print(x_cnn, [tf.shape(x_cnn)], message='x_cnn shape is:')
    x_cnn = self.nonlin(U.dense(x_cnn, 256, 'ff1', U.normc_initializer(1.0)))
    x_self = o_self
    x = tf.concat([x_cnn, x_self], 1)
    x = self.nonlin(U.dense(x, 256, 'ff2', U.normc_initializer(1.0)))
    '''
    # Process observation
    if self.connection_type == 'ff':
        x = o
        for ilayer, hd in enumerate(self.hidden_dims):
            x = self.nonlin(U.dense(x, hd, 'l{}'.format(ilayer),
                                    U.normc_initializer(1.0)))
    else:
        raise NotImplementedError(self.connection_type)
    '''

    # Map to action
    adim, ahigh, alow = 1, self.ac_space.n - 1, 0
    assert isinstance(self.ac_bins, str)
    ac_bin_mode, ac_bin_arg = self.ac_bins.split(':')

    if ac_bin_mode == 'uniform':
        # Uniformly spaced bins, from ac_space.low to ac_space.high
        num_ac_bins = int(ac_bin_arg)
        aidx_na = bins(x, adim, num_ac_bins, 'out')  # 0 ... num_ac_bins-1
        #aidx_na = tf.Print(aidx_na, [aidx_na], message='aidx_na: ')
        a = tf.nn.softmax(aidx_na)  #tf.sigmoid(aidx_na)
        #ac_range_1a = (ahigh - alow)[None, :]
        #a = 1. / (num_ac_bins - 1.) * tf.to_float(aidx_na) * ac_range_1a + alow[None, :]
    elif ac_bin_mode == 'custom':
        # Custom bins specified as a list of values from -1 to 1
        # The bins are rescaled to ac_space.low to ac_space.high
        acvals_k = np.array(list(map(float, ac_bin_arg.split(','))),
                            dtype=np.float32)
        logger.info('Custom action values: ' +
                    ' '.join('{:.3f}'.format(x) for x in acvals_k))
        assert acvals_k.ndim == 1 and acvals_k[0] == -1 and acvals_k[-1] == 1
        acvals_ak = ((ahigh - alow)[:, None] / (acvals_k[-1] - acvals_k[0]) *
                     (acvals_k - acvals_k[0])[None, :] + alow[:, None])
        aidx_na = bins(x, adim, len(acvals_k), 'out')  # values in [0, k-1]
        a = tf.gather_nd(
            acvals_ak,
            tf.concat([
                tf.tile(np.arange(adim)[None, :, None],
                        [tf.shape(aidx_na)[0], 1, 1]),
                tf.expand_dims(aidx_na, -1)
            ], 2)  # (n, a, 2)
        )  # (n, a)
    elif ac_bin_mode == 'continuous':
        a = U.dense(x, adim, 'out', U.normc_initializer(0.01))
    else:
        raise NotImplementedError(ac_bin_mode)
    return a
def bins(x, dim, num_bins, name):
    scores = U.dense(x, dim * num_bins, name, U.normc_initializer(0.01))
    #scores = tf.Print(scores, [scores], message='scores: ')
    scores_nab = tf.reshape(scores, [-1, dim, num_bins])
    #scores_nab = tf.Print(scores_nab, [scores_nab], message='scores_nab: ')
    return scores_nab  #tf.argmax(scores_nab, 2)  # 0 ... num_bins-1
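# Note (editor's comment): this variant returns the raw (n, dim, num_bins)
# scores rather than argmax indices; it pairs with the uniform branch of the
# second _make_net above, which applies tf.nn.softmax to the scores to produce
# action probabilities over a discrete action space.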
def bins(x, dim, num_bins, name):
    scores = U.dense(x, dim * num_bins, name, U.normc_initializer(0.01))
    scores_nab = tf.reshape(scores, [-1, dim, num_bins])
    return tf.argmax(scores_nab, 2)
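# Usage sketch (editor's addition): with the argmax variant above, a feature
# batch x of shape (n, feat) yields integer bin indices of shape (n, adim);
# the 'uniform' branch of the first _make_net then rescales them into
# [alow, ahigh] per action dimension:
#
#   aidx_na = bins(x, adim, num_ac_bins, 'out')  # ints in [0, num_ac_bins - 1]
#   a = alow + (ahigh - alow) * tf.to_float(aidx_na) / (num_ac_bins - 1.)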