def update(self, batch, update_actor=True): """Updates parameters of TD3 actor and critic given samples from the batch. Args: batch: A list of timesteps from environment. update_actor: a boolean variable, whether to perform a policy update. """ obs = contrib_eager_python_tfe.Variable( np.stack(batch.obs).astype('float32')) action = contrib_eager_python_tfe.Variable( np.stack(batch.action).astype('float32')) next_obs = contrib_eager_python_tfe.Variable( np.stack(batch.next_obs).astype('float32')) mask = contrib_eager_python_tfe.Variable( np.stack(batch.mask).astype('float32')) if self.get_reward is not None: reward = self.get_reward(obs, action, next_obs) else: reward = contrib_eager_python_tfe.Variable( np.stack(batch.reward).astype('float32')) if self.use_td3: self._update_critic_td3(obs, action, next_obs, reward, mask) else: self._update_critic_ddpg(obs, action, next_obs, reward, mask) if self.critic_step.numpy() % self.policy_update_freq == 0: if update_actor: self._update_actor(obs, mask) soft_update(self.actor.variables, self.actor_target.variables, self.tau) soft_update(self.critic.variables, self.critic_target.variables, self.tau)
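# The update() above calls a soft_update() helper that is not shown in this
# snippet. A minimal sketch of what such a Polyak-averaging update typically
# looks like; the signature, the default tau=1.0 (full copy), and the
# element-wise assign are assumptions, not the original implementation:
def soft_update(source_variables, target_variables, tau=1.0):
  """Moves target variables towards source variables by a factor of tau."""
  for v_source, v_target in zip(source_variables, target_variables):
    # tau=1.0 copies the source exactly; a small tau gives a slowly moving
    # target network.
    v_target.assign((1.0 - tau) * v_target + tau * v_source)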
def __init__(self, input_dim, subsampling_rate, lambd=10.0, gail_loss='airl'): """Initializes actor, critic, target networks and optimizers. Args: input_dim: size of the observation space. subsampling_rate: subsampling rate that was used for expert trajectories. lambd: gradient penalty coefficient for wgan. gail_loss: gail loss to use. """ self.subsampling_rate = subsampling_rate self.lambd = lambd self.gail_loss = gail_loss with tf.variable_scope('discriminator'): self.disc_step = contrib_eager_python_tfe.Variable(0, dtype=tf.int64, name='step') self.discriminator = Discriminator(input_dim) self.discriminator_optimizer = tf.train.AdamOptimizer() self.discriminator_optimizer._create_slots( self.discriminator.variables) # pylint: disable=protected-access
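# Discriminator(input_dim) is constructed above but not defined in this
# snippet. A minimal sketch of a GAIL-style discriminator, assuming a small
# MLP built from tf.keras layers; the layer sizes, activations, and the
# single unnormalized logit output are assumptions:
class Discriminator(tf.keras.Model):

  def __init__(self, input_dim):
    super(Discriminator, self).__init__()
    self.main = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='tanh',
                              input_shape=(input_dim,)),
        tf.keras.layers.Dense(256, activation='tanh'),
        tf.keras.layers.Dense(1)  # unnormalized logit consumed by the GAN loss
    ])

  def call(self, inputs):
    return self.main(inputs)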
def scatter_update(ref: tfe.Variable, indices, updates):
  # Cast the bool variable to an int32 copy, scatter into the copy, then cast
  # the scattered result back to bool.
  _ref = tfe.Variable(tf.cast(ref, tf.int32), trainable=False, name='hoge')
  del ref
  _updates = tf.cast(updates, tf.int32)
  x = tf.scatter_update(_ref, indices, _updates)
  update = tf.cast(x, tf.bool)
  return update
def main(_): """Run td3/ddpg evaluation.""" contrib_eager_python_tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() tf.gfile.MakeDirs(FLAGS.log_dir) summary_writer = contrib_summary.create_file_writer( FLAGS.log_dir, flush_millis=10000) env = gym.make(FLAGS.env) if FLAGS.wrap_for_absorbing: env = lfd_envs.AbsorbingWrapper(env) obs_shape = env.observation_space.shape act_shape = env.action_space.shape with tf.variable_scope('actor'): actor = Actor(obs_shape[0], act_shape[0]) random_reward, _ = do_rollout( env, actor, None, num_trajectories=10, sample_random=True) reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale') saver = contrib_eager_python_tfe.Saver(actor.variables + [reward_scale]) last_checkpoint = tf.train.latest_checkpoint(FLAGS.load_dir) with summary_writer.as_default(): while True: last_checkpoint = wait_for_next_checkpoint(FLAGS.load_dir, last_checkpoint) total_numsteps = int(last_checkpoint.split('-')[-1]) saver.restore(last_checkpoint) average_reward, average_length = do_rollout( env, actor, None, noise_scale=0.0, num_trajectories=FLAGS.num_trials) logging.info( 'Evaluation: average episode length %d, average episode reward %f', average_length, average_reward) print('Evaluation: average episode length {}, average episode reward {}'. format(average_length, average_reward)) with contrib_summary.always_record_summaries(): if reward_scale.numpy() != 1.0: contrib_summary.scalar( 'reward/scaled', (average_reward - random_reward) / (reward_scale.numpy() - random_reward), step=total_numsteps) contrib_summary.scalar('reward', average_reward, step=total_numsteps) contrib_summary.scalar('length', average_length, step=total_numsteps)
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False): global final_loss, W_flat tf.set_random_seed(seed) np.random.seed(seed) images = tf.constant(u.get_mnist_images(batch_size).T) images = images[:batch_size] if cuda: images = images.gpu() data = images if cuda: device = '/gpu:0' else: device = '' device_ctx = tf.device(device) device_ctx.__enter__() visible_size = 28 * 28 hidden_size = 196 initial_val = tf.zeros([visible_size * hidden_size]) if W_flat is None: W_flat = tfe.Variable(initial_val, name='W_flat') W_flat.assign(initial_val) def loss_fn(w_flat): w = tf.reshape(w_flat, [visible_size, hidden_size]) x = tf.matmul(data, w) x = tf.sigmoid(x) x = tf.matmul(x, w, transpose_b=True) x = tf.sigmoid(x) return tf.reduce_mean(tf.square(x - data)) value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn) def opfunc(x): # returns (value, gradient) value, grads = value_and_gradients_fn(x) return value, grads[0] # initialize weights W_flat.assign(u.ng_init(visible_size, hidden_size).flatten()) state = Struct() config = Struct() config.maxIter = iters config.verbose = True x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose) if verbose: u.summarize_time() return final_loss
def main(_): tf.enable_eager_execution() envs = [ 'HalfCheetah-v1', 'Hopper-v1', 'Ant-v1', 'Walker2d-v1', 'Reacher-v1' ] for ienv, env in enumerate(envs): print('Processing environment %d of %d: %s' % (ienv + 1, len(envs), env)) h5_filename = os.path.join(FLAGS.src_data_dir, '%s.h5' % env) trajectories = h5py.File(h5_filename, 'r') if (set(trajectories.keys()) != set( ['a_B_T_Da', 'len_B', 'obs_B_T_Do', 'r_B_T'])): raise ValueError('Unexpected key set in file %s' % h5_filename) replay_buffer = ReplayBuffer() if env.find('Reacher') > -1: max_len = 50 else: max_len = 1000 for i in range(50): print(' Processing trajectory %d of 50 (len = %d)' % (i + 1, trajectories['len_B'][i])) for j in range(trajectories['len_B'][i]): mask = 1 if j + 1 == trajectories['len_B'][i]: if trajectories['len_B'][i] == max_len: mask = 1 else: mask = 0 replay_buffer.push_back( trajectories['obs_B_T_Do'][i][j], trajectories['a_B_T_Da'][i][j], trajectories['obs_B_T_Do'][i][(j + 1) % trajectories['len_B'][i]], [trajectories['r_B_T'][i][j]], [mask], j == trajectories['len_B'][i] - 1) replay_buffer_var = contrib_eager_python_tfe.Variable( '', name='expert_replay_buffer') saver = contrib_eager_python_tfe.Saver([replay_buffer_var]) odir = os.path.join(FLAGS.dst_data_dir, env) print('Saving results to checkpoint in directory: %s' % odir) tf.gfile.MakeDirs(odir) replay_buffer_var.assign(pickle.dumps(replay_buffer)) saver.save(os.path.join(odir, 'expert_replay_buffer'))
def __init__(self, model, learning_rate, training_iters, batch_size):
  self.model = model
  self.learning_rate = tfe.Variable(learning_rate)
  self.training_iters = training_iters
  self.batch_size = batch_size
  self.checkpoint = None
  self.lr_step = 0

  def get_lr():
    # Decay the learning rate by 0.5% once every training_iters steps.
    if self.lr_step > 0 and self.lr_step % training_iters == 0:
      self.learning_rate.assign_sub(self.learning_rate * 0.005)
    self.lr_step += 1
    return self.learning_rate

  self.optimizer = tf.train.AdamOptimizer(get_lr)
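# Note on the pattern above: the apparent intent is that, under eager
# execution, the optimizer re-evaluates a callable learning rate on each
# apply_gradients() call, so passing get_lr (rather than a fixed float) lets
# the decayed value stored in self.learning_rate be picked up step by step.
# This relies on the optimizer supporting callable hyperparameters.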
def __init__(self, emoji_img, **kwargs):
  super(EmojiCNN, self).__init__()
  # parameters
  self.batch_size = kwargs.get("batch_size", 128)
  self.output_size = kwargs.get("output_size", 4)
  # model
  self.img_emb = tfe.Variable(emoji_img, name="imgs", trainable=False)
  self.conv_1 = Conv2D(32, (5, 5), activation="relu")
  self.max_pool_1 = MaxPool2D((2, 2))
  self.conv_2 = Conv2D(32, (5, 5), activation="relu")
  self.max_pool_2 = MaxPool2D((4, 4))
  self.conv_3 = Conv2D(32, (5, 5), activation="relu")
  self.max_pool_3 = MaxPool2D((4, 4))
  self.flatten = Flatten()
  self.output_layer = Dense(self.output_size, activation="softmax")
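# The forward pass of EmojiCNN is not included above. A minimal sketch of a
# call() method that chains the layers defined in __init__; treating img_emb
# as a lookup table of emoji images indexed by emoji_ids is an assumption.
def call(self, emoji_ids):
  x = tf.gather(self.img_emb, emoji_ids)  # [batch, height, width, channels]
  x = self.max_pool_1(self.conv_1(x))
  x = self.max_pool_2(self.conv_2(x))
  x = self.max_pool_3(self.conv_3(x))
  x = self.flatten(x)
  return self.output_layer(x)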
def main(_): tf.enable_eager_execution() if not FLAGS.data_path: raise ValueError("Must specify --data-path") corpus = Datasets(FLAGS.data_path) train_data = _divide_into_batches(corpus.train, FLAGS.batch_size) eval_data = _divide_into_batches(corpus.valid, 10) have_gpu = tfe.num_gpus() > 0 use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu with tf.device("/device:GPU:0" if have_gpu else None): # Make learning_rate a Variable so it can be included in the checkpoint # and we can resume training with the last saved learning_rate. learning_rate = tfe.Variable(20.0, name="learning_rate") model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim, FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout, use_cudnn_rnn) optimizer = tf.train.GradientDescentOptimizer(learning_rate) checkpoint = tfe.Checkpoint( learning_rate=learning_rate, model=model, # GradientDescentOptimizer has no state to checkpoint, but noting it # here lets us swap in an optimizer that does. optimizer=optimizer) # Restore existing variables now (learning_rate), and restore new variables # on creation if a checkpoint exists. checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir)) sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy()) best_loss = None for _ in range(FLAGS.epoch): train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip) eval_loss = evaluate(model, eval_data) if not best_loss or eval_loss < best_loss: if FLAGS.logdir: checkpoint.save(os.path.join(FLAGS.logdir, "ckpt")) best_loss = eval_loss else: learning_rate.assign(learning_rate / 4.0) sys.stderr.write( "eval_loss did not reduce in this epoch, " "changing learning rate to %f for the next epoch\n" % learning_rate.numpy())
def main(_): tfe.enable_eager_execution() if not FLAGS.data_path: raise ValueError("Must specify --data_path") corpus = Corpus(FLAGS.data_path) # TODO(ashankar): Remove _batchify and _get_batch and use the Datasets API # instead. train_data = _batchify(corpus.train, FLAGS.batch_size) eval_data = _batchify(corpus.valid, 10) have_gpu = tfe.num_gpus() > 0 use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu with tfe.restore_variables_on_create( tf.train.latest_checkpoint(FLAGS.logdir)): with tf.device("/device:GPU:0" if have_gpu else None): # Make learning_rate a Variable so it can be included in the checkpoint # and we can resume training with the last saved learning_rate. learning_rate = tfe.Variable(20.0, name="learning_rate") sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy()) model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim, FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout, use_cudnn_rnn) optimizer = tf.train.GradientDescentOptimizer(learning_rate) best_loss = None for _ in range(FLAGS.epoch): train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip) eval_loss = evaluate(model, eval_data) if not best_loss or eval_loss < best_loss: if FLAGS.logdir: tfe.Saver(model.trainable_weights + [learning_rate]).save( os.path.join(FLAGS.logdir, "ckpt")) best_loss = eval_loss else: learning_rate.assign(learning_rate / 4.0) sys.stderr.write( "eval_loss did not reduce in this epoch, " "changing learning rate to %f for the next epoch\n" % learning_rate.numpy())
def __init__(self, input_dim, obs_dim, ac_dim, goal_dim, subsampling_rate, lambd=10.0, gail_loss='airl', use_s_p=False, only_s=False): """Initializes actor, critic, target networks and optimizers. Args: input_dim: size of the observation space. subsampling_rate: subsampling rate that was used for expert trajectories. lambd: gradient penalty coefficient for wgan. gail_loss: gail loss to use. use_s_p: if (s, s', g) is used instead of (s, a, g) """ self.subsampling_rate = subsampling_rate self.lambd = lambd self.gail_loss = gail_loss self.use_s_p = use_s_p self.only_s = only_s with tf.variable_scope('discriminator'): self.disc_step = tfe.Variable(0, dtype=tf.int64, name='step') self.discriminator = Discriminator(input_dim) self.discriminator_optimizer = tf.train.AdamOptimizer() self.discriminator_optimizer._create_slots(self.discriminator.variables) # pylint: disable=protected-access obs = self.obs = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) expert_obs = self.expert_obs = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) goal = self.goal = tf.placeholder(shape=(None, goal_dim), dtype=tf.float32) expert_goal = self.expert_goal = tf.placeholder(shape=(None, goal_dim), dtype=tf.float32) # expert_mask = tfe.Variable(np.stack(expert_batch.mask).astype('float32')) # Since expert trajectories were resampled but no absorbing state, # statistics of the states changes, we need to adjust weights accordingly. # expert_mask = tf.maximum(0, -expert_mask) # expert_weight = expert_mask / self.subsampling_rate + (1 - expert_mask) action = self.action = tf.placeholder(shape=(None, ac_dim), dtype=tf.float32) expert_action = self.expert_action = tf.placeholder(shape=(None, ac_dim), dtype=tf.float32) if self.only_s: inputs = tf.concat([obs, goal], -1) expert_inputs = tf.concat([expert_obs, expert_goal], -1) else: inputs = tf.concat([obs, goal, action], -1) expert_inputs = tf.concat([expert_obs, expert_goal, expert_action], -1) # Avoid using tensorflow random functions since it's impossible to get # the state of the random number generator used by TensorFlow. 
alpha = self.alpha = tf.placeholder(shape=(None, 1), dtype=tf.float32) # alpha = tfe.Variable(alpha.astype('float32')) inter = alpha * inputs + (1 - alpha) * expert_inputs # with tf.GradientTape() as tape: output = self.discriminator(inputs) expert_output = self.discriminator(expert_inputs) with tf.contrib.summary.record_summaries_every_n_global_steps( 100, self.disc_step): gan_loss = tfgan_losses.modified_discriminator_loss( expert_output, output, label_smoothing=0.0, # real_weights=expert_weight ) tf.contrib.summary.scalar( 'discriminator/expert_output', tf.reduce_mean(expert_output), step=self.disc_step) tf.contrib.summary.scalar( 'discriminator/policy_output', tf.reduce_mean(output), step=self.disc_step) # with tf.GradientTape() as tape2: # tape2.watch(inter) output = self.discriminator(inter) grad = tf.gradients(output, [inter])[0] grad_penalty = tf.reduce_mean(tf.pow(tf.norm(grad, axis=-1) - 1, 2)) loss = gan_loss + self.lambd * grad_penalty with tf.contrib.summary.record_summaries_every_n_global_steps( 100, self.disc_step): tf.contrib.summary.scalar( 'discriminator/grad_penalty', grad_penalty, step=self.disc_step) with tf.contrib.summary.record_summaries_every_n_global_steps( 100, self.disc_step): tf.contrib.summary.scalar( 'discriminator/loss', gan_loss, step=self.disc_step) grads = tf.gradients(loss, self.discriminator.variables) self.update_ops = self.discriminator_optimizer.apply_gradients( zip(grads, self.discriminator.variables), global_step=self.disc_step) self.airl_rew = self.discriminator(inputs) self.gail_rew = -tf.log(1 - tf.nn.sigmoid(self.discriminator(inputs)) + 1e-8) self.normalized_rew = tf.nn.sigmoid(self.discriminator(inputs)) self.negative_rew = tf.log(tf.nn.sigmoid(self.discriminator(inputs)) + 1e-8)
def set_images(self, content_path, style_path):
  self.content = self.read_image(content_path)
  self.style = self.read_image(
      style_path, output_size=self.content.get_shape().as_list()[1:3])
  # The generated image is a trainable variable initialized with random noise.
  self.output = tfe.Variable(
      tf.random_normal(self.content.get_shape(), dtype=tf.float32))
def get_session():
  cfg = tf.ConfigProto()
  cfg.gpu_options.allow_growth = True
  # cfg.gpu_options.per_process_gpu_memory_fraction = 0.1
  return tf.Session(config=cfg)


get_session()
tfe.enable_eager_execution()
tfe.executing_eagerly()  # => True

if __name__ == '__main__':
  x = np.zeros((2, 4))
  x[0, 2:] = 1
  x[0, 0:2] = 0.5
  var_x = tfe.Variable(x)
  a = tf.not_equal(var_x, 0.5)
  y = np.zeros((2, 4))
  y[0, 0] = 1
  y[0, 1] = 2
  y[0, 2] = 3
  y[0, 3] = 4
  indices = tf.where(a)
  print(var_x)
  print(a)
  y_a = tf.gather_nd(y, indices)
  print(indices)
  print(y_a)
  if y_pred is not None:
    plt.plot(X.numpy(), y_pred.numpy(), c='r', linewidth=5)
  plt.show()
  del fig


if __name__ == '__main__':
  num_samples, data_W, data_b = 1000, 3, 2
  # Load training dataset.
  X, y = load_data(n=num_samples, W=data_W, b=data_b)
  # Model variables.
  W = tfe.Variable(tf.zeros(shape=()), name="weights")
  b = tfe.Variable(tf.zeros(shape=()), name="biases")
  epochs = 500
  learning_rate = 1e-2
  for epoch in range(epochs):
    dW, db = grad(X, y, W, b)
    # Update W & b.
    W.assign_sub(learning_rate * dW)
    b.assign_sub(learning_rate * db)
    loss = loss_func(prediction(X, W, b), y)
    print(('\rEpoch {:,}\tLoss {:3f}\tW = {:.2f}'
           '\tb={:.2f}').format(epoch + 1, loss.numpy(), W.numpy(),
                                b.numpy()))
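# load_data, prediction, loss_func and grad are referenced above but not
# defined in this snippet. A minimal sketch of the three model helpers for
# this scalar linear regression; the names match the calls above, but the
# bodies are assumptions:
def prediction(X, W, b):
  return X * W + b


def loss_func(y_pred, y):
  return tf.reduce_mean(tf.square(y_pred - y))


def grad(X, y, W, b):
  with tf.GradientTape() as tape:
    loss = loss_func(prediction(X, W, b), y)
  return tape.gradient(loss, [W, b])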
def main(_):
  ref = tfe.Variable([False, False, False], trainable=False, name='hoge')
  indices = tf.range(3)
  updates = tf.constant([True, True, True])
  print(scatter_update(ref, indices, updates))
tfe.enable_eager_execution()


@tfe.custom_gradient
def my_matmul(x, y):
  result = x @ y

  def grad(dr):
    return [dr @ tf.transpose(y), tf.transpose(x) @ dr]

  return result, grad


lr = 0.25
n = 2
x = tfe.Variable(tf.ones((n, n)), name="x")
y = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)


def loss_fn(x):
  return tf.reduce_sum(my_matmul(x, y))


loss_grads_fn = tfe.value_and_gradients_function(loss_fn)

for step in range(5):
  loss, grads = loss_grads_fn(x)
  print("loss =", loss.numpy())
  x.assign_sub(lr * grads[0])

assert loss.numpy() == -96
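# A quick check of the assertion above: with x initialized to all ones,
# loss = sum(ones @ y) = 20, and the gradient dr @ y^T = [[3, 7], [3, 7]] is
# constant because the loss is linear in x. Each gradient step therefore
# lowers the loss by lr * sum(grad**2) = 0.25 * 116 = 29, so the loss printed
# on the final iteration (after 4 updates) is 20 - 4 * 29 = -96.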
import tensorflow as tf
from tensorflow.contrib.eager.python import tfe

tf.enable_eager_execution()

x_data = tf.random_normal([1000])
x_noise = tf.random_normal([1000])
y_label = 3 * x_data + x_noise

w = tfe.Variable(5.)
b = tfe.Variable(10.)


def mse(label, predict):
  loss = tf.losses.mean_squared_error(label, predict)
  return loss


optimizer = tf.train.GradientDescentOptimizer(0.003)

for step in range(3000):
  with tf.GradientTape(persistent=True) as tape:
    y_predict = w * x_data + b
    l = mse(y_label, y_predict)
  w_grad, b_grad = tape.gradient(l, [w, b])
  # Apply the gradients so that w and b are actually updated each step.
  optimizer.apply_gradients(zip([w_grad, b_grad], [w, b]))
def main(): np.random.seed(1) tf.set_random_seed(2) dtype = np.float32 lambda_ = 3e-3 lr = 0.2 dsize = 2 def t(mat): return tf.transpose(mat) def regularized_inverse(mat): n = int(mat.shape[0]) return tf.linalg.inv(mat + lambda_ * tf.eye(n, dtype=dtype)) train_images = np.asarray([[0, 1], [2, 3]]) X = tf.constant(train_images[:, :dsize].astype(dtype)) W1_0 = np.asarray([[0., 1], [2, 3]]).astype(dtype) / 10 W2_0 = np.asarray([[4., 5], [6, 7]]).astype(dtype) / 10 W1 = tfe.Variable(W1_0, name='W1') W2 = tfe.Variable(W2_0, name='W2') forward = [] backward = [] forward_inv = [] backward_inv = [] @tfe.custom_gradient def capturing_matmul(W, A): forward.append(A) def grad(B): backward.append(B) return [B @ tf.transpose(A), tf.transpose(W) @ B] return W @ A, grad @tfe.custom_gradient def kfac_matmul(W, A): def grad(B): kfac_A = forward_inv.pop() @ A kfac_B = backward_inv.pop() @ B return [kfac_B @ tf.transpose(kfac_A), tf.transpose(W) @ B] return W @ A, grad matmul = tf.matmul def loss_fn(synthetic=False): x = tf.nn.sigmoid(matmul(W1, X)) x = tf.nn.sigmoid(matmul(W2, x)) if synthetic: noise = tf.random_normal(X.shape) target = tf.constant((x + noise).numpy()) else: target = X err = target - x loss = tf.reduce_sum(err * err) / 2 / dsize return loss loss_and_grads = tfe.implicit_value_and_gradients(loss_fn) optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr) for step in range(10): del backward[:] del forward[:] del forward_inv[:] del backward_inv[:] matmul = capturing_matmul loss, grads_and_vars = loss_and_grads(True) backward.reverse() for i in range(len(backward)): backward[i] = backward[i] * dsize def cov(X): return X @ t(X) / dsize def invcov(X): return regularized_inverse(cov(X)) for i in range(2): forward_inv.append(invcov(forward[i])) backward_inv.append(invcov(backward[i])) matmul = kfac_matmul loss, grads_and_vars = loss_and_grads() print("Step %3d loss %10.9f" % (step, loss.numpy())) optimizer.apply_gradients(grads_and_vars) target = 1.251444697 # with proper random sampling assert abs(loss.numpy() - target) < 1e-9, abs(loss.numpy() - target)
def _update_critic_td3(self, obs, action, next_obs, reward, mask): """Updates parameters of td3 critic given samples from the batch. Args: obs: A tfe.Variable with a batch of observations. action: A tfe.Variable with a batch of actions. next_obs: A tfe.Variable with a batch of next observations. reward: A tfe.Variable with a batch of rewards. mask: A tfe.Variable with a batch of masks. """ # Avoid using tensorflow random functions since it's impossible to get # the state of the random number generator used by TensorFlow. target_action_noise = np.random.normal( size=action.get_shape(), scale=self.policy_noise).astype('float32') target_action_noise = contrib_eager_python_tfe.Variable( target_action_noise) target_action_noise = tf.clip_by_value(target_action_noise, -self.policy_noise_clip, self.policy_noise_clip) noisy_action_targets = self.actor_target( next_obs) + target_action_noise clipped_noisy_action_targets = tf.clip_by_value( noisy_action_targets, -1, 1) if self.use_absorbing_state: # Starting from the goal state we can execute only non-actions. a_mask = tf.maximum(0, mask) q_next1, q_next2 = self.critic_target( next_obs, clipped_noisy_action_targets * a_mask) q_next = tf.reduce_min(tf.concat([q_next1, q_next2], -1), -1, keepdims=True) q_target = reward + self.discount * q_next else: q_next1, q_next2 = self.critic_target( next_obs, clipped_noisy_action_targets) q_next = tf.reduce_min(tf.concat([q_next1, q_next2], -1), -1, keepdims=True) q_target = reward + self.discount * mask * q_next with tf.GradientTape() as tape: q_pred1, q_pred2 = self.critic(obs, action) critic_loss = tf.losses.mean_squared_error( q_target, q_pred1) + tf.losses.mean_squared_error( q_target, q_pred2) grads = tape.gradient(critic_loss, self.critic.variables) self.critic_optimizer.apply_gradients(zip(grads, self.critic.variables), global_step=self.critic_step) if self.use_absorbing_state: with contrib_summary.record_summaries_every_n_global_steps( 100, self.critic_step): a_mask = tf.maximum(0, -mask) if tf.reduce_sum(a_mask).numpy() > 0: contrib_summary.scalar('critic/absorbing_reward', tf.reduce_sum(reward * a_mask) / tf.reduce_sum(a_mask), step=self.critic_step) with contrib_summary.record_summaries_every_n_global_steps( 100, self.critic_step): contrib_summary.scalar('critic/loss', critic_loss, step=self.critic_step)
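# In summary, the critic target computed above is
#   a~ = clip(pi'(s') + clip(eps, -c, c), -1, 1),  eps ~ N(0, policy_noise),
#   y  = reward + discount * [mask *] min(Q1'(s', a~), Q2'(s', a~)),
# i.e. TD3's clipped target-policy smoothing plus the minimum over the two
# target critics. In the absorbing-state branch the target action is zeroed
# via a_mask and the bootstrap term is not multiplied by the mask.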
def main(_): """Run td3/ddpg training.""" contrib_eager_python_tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() tf.gfile.MakeDirs(FLAGS.log_dir) summary_writer = contrib_summary.create_file_writer( FLAGS.log_dir, flush_millis=10000) tf.set_random_seed(FLAGS.seed) np.random.seed(FLAGS.seed) random.seed(FLAGS.seed) env = gym.make(FLAGS.env) env.seed(FLAGS.seed) if FLAGS.learn_absorbing: env = lfd_envs.AbsorbingWrapper(env) if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']: rand_actions = int(1e4) else: rand_actions = int(1e3) obs_shape = env.observation_space.shape act_shape = env.action_space.shape subsampling_rate = env._max_episode_steps // FLAGS.trajectory_size # pylint: disable=protected-access lfd = gail.GAIL( obs_shape[0] + act_shape[0], subsampling_rate=subsampling_rate, gail_loss=FLAGS.gail_loss) if FLAGS.algo == 'td3': model = ddpg_td3.DDPG( obs_shape[0], act_shape[0], use_td3=True, policy_update_freq=2, actor_lr=FLAGS.actor_lr, get_reward=lfd.get_reward, use_absorbing_state=FLAGS.learn_absorbing) else: model = ddpg_td3.DDPG( obs_shape[0], act_shape[0], use_td3=False, policy_update_freq=1, actor_lr=FLAGS.actor_lr, get_reward=lfd.get_reward, use_absorbing_state=FLAGS.learn_absorbing) random_reward, _ = do_rollout( env, model.actor, None, num_trajectories=10, sample_random=True) replay_buffer_var = contrib_eager_python_tfe.Variable( '', name='replay_buffer') expert_replay_buffer_var = contrib_eager_python_tfe.Variable( '', name='expert_replay_buffer') # Save and restore random states of gym/numpy/python. # If the job is preempted, it guarantees that it won't affect the results. # And the results will be deterministic (on CPU) and reproducible. gym_random_state_var = contrib_eager_python_tfe.Variable( '', name='gym_random_state') np_random_state_var = contrib_eager_python_tfe.Variable( '', name='np_random_state') py_random_state_var = contrib_eager_python_tfe.Variable( '', name='py_random_state') reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale') saver = contrib_eager_python_tfe.Saver( model.variables + lfd.variables + [replay_buffer_var, expert_replay_buffer_var, reward_scale] + [gym_random_state_var, np_random_state_var, py_random_state_var]) tf.gfile.MakeDirs(FLAGS.save_dir) eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables + [reward_scale]) tf.gfile.MakeDirs(FLAGS.eval_save_dir) last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir) if last_checkpoint is None: expert_saver = contrib_eager_python_tfe.Saver([expert_replay_buffer_var]) last_checkpoint = os.path.join(FLAGS.expert_dir, 'expert_replay_buffer') expert_saver.restore(last_checkpoint) expert_replay_buffer = pickle.loads(expert_replay_buffer_var.numpy()) expert_reward = expert_replay_buffer.get_average_reward() logging.info('Expert reward %f', expert_reward) print('Expert reward {}'.format(expert_reward)) reward_scale.assign(expert_reward) expert_replay_buffer.subsample_trajectories(FLAGS.num_expert_trajectories) if FLAGS.learn_absorbing: expert_replay_buffer.add_absorbing_states(env) # Subsample after adding absorbing states, because otherwise we can lose # final states. 
print('Original dataset size {}'.format(len(expert_replay_buffer))) expert_replay_buffer.subsample_transitions(subsampling_rate) print('Subsampled dataset size {}'.format(len(expert_replay_buffer))) replay_buffer = ReplayBuffer() total_numsteps = 0 prev_save_timestep = 0 prev_eval_save_timestep = 0 else: saver.restore(last_checkpoint) replay_buffer = pickle.loads(zlib.decompress(replay_buffer_var.numpy())) expert_replay_buffer = pickle.loads( zlib.decompress(expert_replay_buffer_var.numpy())) total_numsteps = int(last_checkpoint.split('-')[-1]) prev_save_timestep = total_numsteps prev_eval_save_timestep = total_numsteps env.unwrapped.np_random.set_state( pickle.loads(gym_random_state_var.numpy())) np.random.set_state(pickle.loads(np_random_state_var.numpy())) random.setstate(pickle.loads(py_random_state_var.numpy())) with summary_writer.as_default(): while total_numsteps < FLAGS.training_steps: # Decay helps to make the model more stable. # TODO(agrawalk): Use tf.train.exponential_decay model.actor_lr.assign( model.initial_actor_lr * pow(0.5, total_numsteps // 100000)) logging.info('Learning rate %f', model.actor_lr.numpy()) rollout_reward, rollout_timesteps = do_rollout( env, model.actor, replay_buffer, noise_scale=FLAGS.exploration_noise, rand_actions=rand_actions, sample_random=(model.actor_step.numpy() == 0), add_absorbing_state=FLAGS.learn_absorbing) total_numsteps += rollout_timesteps logging.info('Training: total timesteps %d, episode reward %f', total_numsteps, rollout_reward) print('Training: total timesteps {}, episode reward {}'.format( total_numsteps, rollout_reward)) with contrib_summary.always_record_summaries(): contrib_summary.scalar( 'reward/scaled', (rollout_reward - random_reward) / (reward_scale.numpy() - random_reward), step=total_numsteps) contrib_summary.scalar('reward', rollout_reward, step=total_numsteps) contrib_summary.scalar('length', rollout_timesteps, step=total_numsteps) if len(replay_buffer) >= FLAGS.min_samples_to_start: for _ in range(rollout_timesteps): time_step = replay_buffer.sample(batch_size=FLAGS.batch_size) batch = TimeStep(*zip(*time_step)) time_step = expert_replay_buffer.sample(batch_size=FLAGS.batch_size) expert_batch = TimeStep(*zip(*time_step)) lfd.update(batch, expert_batch) for _ in range(FLAGS.updates_per_step * rollout_timesteps): time_step = replay_buffer.sample(batch_size=FLAGS.batch_size) batch = TimeStep(*zip(*time_step)) model.update( batch, update_actor=model.critic_step.numpy() >= FLAGS.policy_updates_delay) if total_numsteps - prev_save_timestep >= FLAGS.save_interval: replay_buffer_var.assign(zlib.compress(pickle.dumps(replay_buffer))) expert_replay_buffer_var.assign( zlib.compress(pickle.dumps(expert_replay_buffer))) gym_random_state_var.assign( pickle.dumps(env.unwrapped.np_random.get_state())) np_random_state_var.assign(pickle.dumps(np.random.get_state())) py_random_state_var.assign(pickle.dumps(random.getstate())) saver.save( os.path.join(FLAGS.save_dir, 'checkpoint'), global_step=total_numsteps) prev_save_timestep = total_numsteps if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval: eval_saver.save( os.path.join(FLAGS.eval_save_dir, 'checkpoint'), global_step=total_numsteps) prev_eval_save_timestep = total_numsteps
def __init__(self, input_dim, action_dim, discount=0.99, tau=0.005, actor_lr=1e-3, critic_lr=1e-3, use_td3=True, policy_noise=0.2, policy_noise_clip=0.5, policy_update_freq=2, get_reward=None, use_absorbing_state=False): """Initializes actor, critic, target networks and optimizers. The class handles absorbing state properly. Absorbing state corresponds to a state which a policy gets in after reaching a goal state and stays there forever. For most RL problems, we can just assign 0 to all reward after the goal. But for GAIL, we need to have an actual absorbing state. Args: input_dim: size of the observation space. action_dim: size of the action space. discount: reward discount. tau: target networks update coefficient. actor_lr: actor learning rate. critic_lr: critic learning rate. use_td3: whether to use standard ddpg or td3. policy_noise: std of gaussian added to critic action input. policy_noise_clip: clip added gaussian noise. policy_update_freq: perform policy update once per n steps. get_reward: a function that given (s,a,s') returns a reward. use_absorbing_state: whether to use an absorbing state or not. """ self.discount = discount self.tau = tau self.use_td3 = use_td3 self.policy_noise = policy_noise self.policy_noise_clip = policy_noise_clip self.policy_update_freq = policy_update_freq self.get_reward = get_reward self.use_absorbing_state = use_absorbing_state with tf.variable_scope('actor'): self.actor = Actor(input_dim, action_dim) with tf.variable_scope('target'): self.actor_target = Actor(input_dim, action_dim) self.initial_actor_lr = actor_lr self.actor_lr = contrib_eager_python_tfe.Variable(actor_lr, name='lr') self.actor_step = contrib_eager_python_tfe.Variable(0, dtype=tf.int64, name='step') self.actor_optimizer = tf.train.AdamOptimizer( learning_rate=self.actor_lr) self.actor_optimizer._create_slots(self.actor.variables) # pylint: disable=protected-access soft_update(self.actor.variables, self.actor_target.variables) with tf.variable_scope('critic'): if self.use_td3: self.critic = CriticTD3(input_dim + action_dim) with tf.variable_scope('target'): self.critic_target = CriticTD3(input_dim + action_dim) else: self.critic = CriticDDPG(input_dim + action_dim) with tf.variable_scope('target'): self.critic_target = CriticDDPG(input_dim + action_dim) self.critic_step = contrib_eager_python_tfe.Variable( 0, dtype=tf.int64, name='step') self.critic_optimizer = tf.train.AdamOptimizer( learning_rate=critic_lr) self.critic_optimizer._create_slots(self.critic.variables) # pylint: disable=protected-access soft_update(self.critic.variables, self.critic_target.variables)
def do_rollout(env, actor, replay_buffer, noise_scale=0.1, num_trajectories=1, rand_actions=0, sample_random=False, add_absorbing_state=False): """Do N rollout. Args: env: environment to train on. actor: policy to take actions. replay_buffer: replay buffer to collect samples. noise_scale: std of gaussian noise added to a policy output. num_trajectories: number of trajectories to collect. rand_actions: number of random actions before using policy. sample_random: whether to sample a random trajectory or not. add_absorbing_state: whether to add an absorbing state. Returns: An episode reward and a number of episode steps. """ total_reward = 0 total_timesteps = 0 for _ in range(num_trajectories): obs = env.reset() episode_timesteps = 0 while True: if (replay_buffer is not None and len(replay_buffer) < rand_actions) or sample_random: action = env.action_space.sample() else: tfe_obs = contrib_eager_python_tfe.Variable([obs.astype('float32')]) action = actor(tfe_obs).numpy()[0] if noise_scale > 0: action += np.random.normal(size=action.shape) * noise_scale action = action.clip(-1, 1) next_obs, reward, done, _ = env.step(action) # Extremely important, otherwise Q function is not stationary! # Taken from: https://github.com/sfujim/TD3/blob/master/main.py#L123 if not done or episode_timesteps + 1 == env._max_episode_steps: # pylint: disable=protected-access done_mask = Mask.NOT_DONE.value else: done_mask = Mask.DONE.value total_reward += reward episode_timesteps += 1 total_timesteps += 1 if replay_buffer is not None: if (add_absorbing_state and done and episode_timesteps < env._max_episode_steps): # pylint: disable=protected-access next_obs = env.get_absorbing_state() replay_buffer.push_back(obs, action, next_obs, [reward], [done_mask], done) if done: break obs = next_obs # Add an absorbing state that is extremely important for GAIL. if add_absorbing_state and (replay_buffer is not None and episode_timesteps < env._max_episode_steps): # pylint: disable=protected-access action = np.zeros(env.action_space.shape) absorbing_state = env.get_absorbing_state() # done=False is set to the absorbing state because it corresponds to # a state where gym environments stopped an episode. replay_buffer.push_back(absorbing_state, action, absorbing_state, [0.0], [Mask.ABSORBING.value], False) return total_reward / num_trajectories, total_timesteps // num_trajectories
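# The Mask enum used above is not defined in this snippet. The values below
# are inferred from how the masks are consumed elsewhere (a_mask =
# tf.maximum(0, -mask) selects absorbing transitions, and the critic target
# uses reward + discount * mask * q_next), so treat them as assumptions:
from enum import Enum


class Mask(Enum):
  ABSORBING = -1.0  # transition into or within the absorbing state
  DONE = 0.0        # true terminal state: do not bootstrap from s'
  NOT_DONE = 1.0    # non-terminal (including time-limit) state: bootstrap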
import tensorflow as tf
from tensorflow.contrib.eager.python import tfe

tf.enable_eager_execution()

x = tfe.Variable(initial_value=tf.random_uniform([1], -1., 1.), name='x')


def loss(input):
  return tf.sigmoid(input)


grad_vars = tfe.implicit_gradients(loss)
opt = tf.train.GradientDescentOptimizer(learning_rate=1)

for i in range(1000):
  for j in range(50):
    opt.apply_gradients(grad_vars(x))
  if i % 50 == 0:
    loss_val = loss(x)
    print(i, "Optimal Value : ", loss_val.numpy(), "Val (X) : ", x.numpy())
def update(self, batch, expert_batch): """Updates the WGAN potential function or GAN discriminator. Args: batch: A batch from training policy. expert_batch: A batch from the expert. """ obs = contrib_eager_python_tfe.Variable( np.stack(batch.obs).astype('float32')) expert_obs = contrib_eager_python_tfe.Variable( np.stack(expert_batch.obs).astype('float32')) expert_mask = contrib_eager_python_tfe.Variable( np.stack(expert_batch.mask).astype('float32')) # Since expert trajectories were resampled but no absorbing state, # statistics of the states changes, we need to adjust weights accordingly. expert_mask = tf.maximum(0, -expert_mask) expert_weight = expert_mask / self.subsampling_rate + (1 - expert_mask) action = contrib_eager_python_tfe.Variable( np.stack(batch.action).astype('float32')) expert_action = contrib_eager_python_tfe.Variable( np.stack(expert_batch.action).astype('float32')) inputs = tf.concat([obs, action], -1) expert_inputs = tf.concat([expert_obs, expert_action], -1) # Avoid using tensorflow random functions since it's impossible to get # the state of the random number generator used by TensorFlow. alpha = np.random.uniform(size=(inputs.get_shape()[0], 1)) alpha = contrib_eager_python_tfe.Variable(alpha.astype('float32')) inter = alpha * inputs + (1 - alpha) * expert_inputs with tf.GradientTape() as tape: output = self.discriminator(inputs) expert_output = self.discriminator(expert_inputs) with contrib_summary.record_summaries_every_n_global_steps( 100, self.disc_step): gan_loss = contrib_gan_python_losses_python_losses_impl.modified_discriminator_loss( expert_output, output, label_smoothing=0.0, real_weights=expert_weight) contrib_summary.scalar('discriminator/expert_output', tf.reduce_mean(expert_output), step=self.disc_step) contrib_summary.scalar('discriminator/policy_output', tf.reduce_mean(output), step=self.disc_step) with tf.GradientTape() as tape2: tape2.watch(inter) output = self.discriminator(inter) grad = tape2.gradient(output, [inter])[0] grad_penalty = tf.reduce_mean(tf.pow( tf.norm(grad, axis=-1) - 1, 2)) loss = gan_loss + self.lambd * grad_penalty with contrib_summary.record_summaries_every_n_global_steps( 100, self.disc_step): contrib_summary.scalar('discriminator/grad_penalty', grad_penalty, step=self.disc_step) with contrib_summary.record_summaries_every_n_global_steps( 100, self.disc_step): contrib_summary.scalar('discriminator/loss', gan_loss, step=self.disc_step) grads = tape.gradient(loss, self.discriminator.variables) self.discriminator_optimizer.apply_gradients( zip(grads, self.discriminator.variables), global_step=self.disc_step)
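# The second tape above implements a WGAN-style gradient penalty on points
# interpolated between policy and expert inputs:
#   penalty = E[ (|| grad_x D(alpha * x_policy + (1 - alpha) * x_expert) ||_2 - 1)^2 ],
# which is added to the GAN discriminator loss with weight lambd to keep the
# discriminator approximately 1-Lipschitz around the data distribution.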
def main(_): """Run td3/ddpg training.""" tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() if FLAGS.expert_dir.find(FLAGS.env) == -1: raise ValueError('Expert directory must contain the environment name') tf.set_random_seed(FLAGS.seed) np.random.seed(FLAGS.seed) random.seed(FLAGS.seed) env = gym.make(FLAGS.env) env.seed(FLAGS.seed) obs_shape = env.observation_space.shape act_shape = env.action_space.shape expert_replay_buffer_var = tfe.Variable('', name='expert_replay_buffer') saver = tfe.Saver([expert_replay_buffer_var]) tf.gfile.MakeDirs(FLAGS.save_dir) with tf.variable_scope('actor'): actor = Actor(obs_shape[0], act_shape[0]) expert_saver = tfe.Saver(actor.variables) best_checkpoint = None best_reward = float('-inf') checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir) for checkpoint in checkpoint_state.all_model_checkpoint_paths: expert_saver.restore(checkpoint) expert_reward, _ = do_rollout(env, actor, replay_buffer=None, noise_scale=0.0, num_trajectories=10) if expert_reward > best_reward: best_reward = expert_reward best_checkpoint = checkpoint expert_saver.restore(best_checkpoint) expert_replay_buffer = ReplayBuffer() expert_reward, _ = do_rollout( env, actor, replay_buffer=expert_replay_buffer, noise_scale=0.0, num_trajectories=FLAGS.num_expert_trajectories) logging.info('Expert reward %f', expert_reward) print('Expert reward {}'.format(expert_reward)) expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer)) saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))
def main(_): """Run td3/ddpg training.""" contrib_eager_python_tfe.enable_eager_execution() if FLAGS.use_gpu: tf.device('/device:GPU:0').__enter__() tf.gfile.MakeDirs(FLAGS.log_dir) summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir, flush_millis=10000) tf.set_random_seed(FLAGS.seed) np.random.seed(FLAGS.seed) random.seed(FLAGS.seed) env = gym.make(FLAGS.env) env.seed(FLAGS.seed) if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']: rand_actions = int(1e4) else: rand_actions = int(1e3) obs_shape = env.observation_space.shape act_shape = env.action_space.shape if FLAGS.algo == 'td3': model = ddpg_td3.DDPG(obs_shape[0], act_shape[0], use_td3=True, policy_update_freq=2, actor_lr=1e-3) else: model = ddpg_td3.DDPG(obs_shape[0], act_shape[0], use_td3=False, policy_update_freq=1, actor_lr=1e-4) replay_buffer_var = contrib_eager_python_tfe.Variable('', name='replay_buffer') gym_random_state_var = contrib_eager_python_tfe.Variable( '', name='gym_random_state') np_random_state_var = contrib_eager_python_tfe.Variable( '', name='np_random_state') py_random_state_var = contrib_eager_python_tfe.Variable( '', name='py_random_state') saver = contrib_eager_python_tfe.Saver( model.variables + [replay_buffer_var] + [gym_random_state_var, np_random_state_var, py_random_state_var]) tf.gfile.MakeDirs(FLAGS.save_dir) reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale') eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables + [reward_scale]) tf.gfile.MakeDirs(FLAGS.eval_save_dir) last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir) if last_checkpoint is None: replay_buffer = ReplayBuffer() total_numsteps = 0 prev_save_timestep = 0 prev_eval_save_timestep = 0 else: saver.restore(last_checkpoint) replay_buffer = pickle.loads(zlib.decompress( replay_buffer_var.numpy())) total_numsteps = int(last_checkpoint.split('-')[-1]) assert len(replay_buffer) == total_numsteps prev_save_timestep = total_numsteps prev_eval_save_timestep = total_numsteps env.unwrapped.np_random.set_state( pickle.loads(gym_random_state_var.numpy())) np.random.set_state(pickle.loads(np_random_state_var.numpy())) random.setstate(pickle.loads(py_random_state_var.numpy())) with summary_writer.as_default(): while total_numsteps < FLAGS.training_steps: rollout_reward, rollout_timesteps = do_rollout( env, model.actor, replay_buffer, noise_scale=FLAGS.exploration_noise, rand_actions=rand_actions) total_numsteps += rollout_timesteps logging.info('Training: total timesteps %d, episode reward %f', total_numsteps, rollout_reward) print('Training: total timesteps {}, episode reward {}'.format( total_numsteps, rollout_reward)) with contrib_summary.always_record_summaries(): contrib_summary.scalar('reward', rollout_reward, step=total_numsteps) contrib_summary.scalar('length', rollout_timesteps, step=total_numsteps) if len(replay_buffer) >= FLAGS.min_samples_to_start: for _ in range(rollout_timesteps): time_step = replay_buffer.sample( batch_size=FLAGS.batch_size) batch = TimeStep(*zip(*time_step)) model.update(batch) if total_numsteps - prev_save_timestep >= FLAGS.save_interval: replay_buffer_var.assign( zlib.compress(pickle.dumps(replay_buffer))) gym_random_state_var.assign( pickle.dumps(env.unwrapped.np_random.get_state())) np_random_state_var.assign( pickle.dumps(np.random.get_state())) py_random_state_var.assign(pickle.dumps(random.getstate())) saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'), global_step=total_numsteps) prev_save_timestep = total_numsteps if total_numsteps - prev_eval_save_timestep >= 
FLAGS.eval_save_interval: eval_saver.save(os.path.join(FLAGS.eval_save_dir, 'checkpoint'), global_step=total_numsteps) prev_eval_save_timestep = total_numsteps
plt.figure(dpi=300, figsize=(20, 20)) plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0) plt.imshow(a) #plt.show() plt.savefig('temp.png') Y, X = np.mgrid[-1.3:1.3:0.001, -2:1:0.001] Z = X + 1j * Y num_gpus = tfe.num_gpus() if num_gpus > 0: with tf.device('gpu:0'): xs = tf.constant(Z.astype(np.complex64)) zs = tfe.Variable(xs) ns = tfe.Variable(tf.zeros_like(xs, tf.float32)) else: with tf.device('/cpu:0'): xs = tf.constant(Z.astype(np.complex64)) zs = tfe.Variable(xs) ns = tfe.Variable(tf.zeros_like(xs, tf.float32)) # Operation to update the zs and the iteration count. # # Note: We keep computing zs after they diverge! This # is very wasteful! There are better, if a little # less simple, ways to do this. def compute(zs, ns):
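# The body of compute() is cut off above. A minimal sketch of the standard
# Mandelbrot iteration it presumably performs on zs and the per-pixel
# iteration counts ns (the escape radius of 4 is an assumption):
def compute(zs, ns):
  zs_new = zs * zs + xs                # z <- z^2 + c
  not_diverged = tf.abs(zs_new) < 4    # points still inside the escape radius
  ns_new = ns + tf.cast(not_diverged, tf.float32)
  return zs_new, ns_new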