def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Trust Region Policy Optimization (with support for Natural Policy Gradient) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k,v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # TRPO losses ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy() # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
p = r.copy() r_dot_old = np.dot(r,r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r,r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval inputs = {k:v for k,v in zip(all_phs, buf.get())} Hx = lambda x : mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2*delta/(np.dot(x, Hx(x))+EPS)) old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo=='npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo=='trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log('Accepting new params at step %d of line search.'%j) logger.store(BacktrackIters=j) break if j==backtrack_iters-1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:] o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run(v, feed_dict={x_ph: o.reshape(1,-1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo=='trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
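# A minimal launch sketch for trpo() above. This is an assumption-laden example:
# the environment id, the hidden_sizes kwarg, and the output directory are
# illustrative placeholders, and mpi_fork is the usual Spinning Up MPI helper
# (not imported in the snippet above).
if __name__ == '__main__':
    import gym
    from spinup.utils.mpi_tools import mpi_fork  # assumed import path

    mpi_fork(2)  # optional: run two parallel workers
    trpo(lambda: gym.make('HalfCheetah-v2'),
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(hidden_sizes=(64, 64)),
         gamma=0.99, delta=0.01, steps_per_epoch=4000, epochs=50,
         logger_kwargs=dict(output_dir='/tmp/trpo_demo', exp_name='trpo_demo'),
         algo='trpo')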
def sac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()): # env params thresh = rl_params['thresh'] # control params seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] save_freq = rl_params['save_freq'] render = rl_params['render'] # rl params gamma = rl_params['gamma'] polyak = rl_params['polyak'] lr = rl_params['lr'] grad_clip_val = rl_params['grad_clip_val'] # entropy params alpha = rl_params['alpha'] target_entropy = rl_params['target_entropy'] logger = EpochLogger(**logger_kwargs) if save_freq is not None: logger.save_config(locals()) train_env, test_env = env_fn(), env_fn() obs = train_env.observation_space act = train_env.action_space tf.set_random_seed(seed) np.random.seed(seed) train_env.seed(seed) train_env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) # get the size after resize obs_dim = network_params['input_dims'] act_dim = act.shape[0] # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=obs_dim[2]) test_state_buffer = StateBuffer(m=obs_dim[2]) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1_a, q2_a = build_models(x_ph, a_ph, act, act_dim, network_params) with tf.variable_scope('main', reuse=True): # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi = build_models(x_ph, pi, act, act_dim, network_params) # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = build_models(x2_ph, a_ph, act, act_dim, network_params) # Target value network with tf.variable_scope('target'): _, _, _, q1_pi_targ, q2_pi_targ = build_models(x2_ph, pi_next, act, act_dim, network_params) # alpha Params if target_entropy == 'auto': target_entropy = tf.cast(-act_dim, tf.float32) else: target_entropy = tf.cast(target_entropy, tf.float32) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Count variables var_counts = tuple(count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print("""\nNumber of other parameters: alpha: %d, pi: %d, q1: %d, q2: %d, total: %d\n"""%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*(min_q_pi_targ - alpha*logp_pi_next)) # critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) value_loss = q1_loss + q2_loss # Soft actor losses pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi) # alpha loss for temperature parameter alpha_backup = tf.stop_gradient(logp_pi + target_entropy) alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup) # Policy train op pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) if grad_clip_val is not None: gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi')) capped_gvs = [(ClipIfNotNone(grad, 
grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q')) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving if save_freq is not None: logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1_a': q1_a, 'q2_a': q2_a}) def get_action(state, deterministic=False): state = state.astype('float32') / 255. act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: [state]})[0] def reset(env, state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o = process_image_observation(o, obs_dim, thresh) state = state_buffer.init_state(init_obs=o) return o, r, d, ep_ret, ep_len, state def test_agent(n=10, render=True): for j in range(n): o, r, d, ep_ret, ep_len, test_state = reset(test_env, test_state_buffer) if render: test_env.render() while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(test_state, True)) o = process_image_observation(o, obs_dim, thresh) test_state = test_state_buffer.append_state(o) ep_ret += r ep_len += 1 if render: test_env.render() if render: test_env.close() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer) total_steps = steps_per_epoch * epochs save_iter = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. 
""" if t > start_steps: a = get_action(state) else: a = train_env.action_space.sample() # Step the env o2, r, d, _ = train_env.step(a) o2 = process_image_observation(o2, obs_dim, thresh) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(state, a, r, next_state, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 state = next_state if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if save_freq is not None: if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': train_env}, itr=save_iter) save_iter+=1 # Test the performance of the deterministic version of the agent. test_agent(n=2, render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
def bcq_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5_env_0_1000K",
              batch_size=100, eval_freq=int(1e2), max_timesteps=float(2e6), lr=1e-3,
              save_freq=int(1e2), logger_kwargs=dict()):

    eval_freq = save_freq if eval_freq == "save_freq" else eval_freq
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    # Set up logger
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    file_name = "BCQbatchpolicy_%s_%s" % (env_set, seed)
    print("---------------------------------------")
    print("Task: " + file_name)
    print("Evaluate Policy every", eval_freq * batch_size / 1e6,
          'epochs; Total', max_timesteps * batch_size / 1e6, 'epochs')
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = BCQ_batchpolicy.BCQ(state_dim, action_dim, max_action, lr=lr)

    # Load buffer
    if 'sac' in buffer_type:
        replay_buffer = utils.BEAR_ReplayBuffer()
        desire_stop_dict = {'Hopper-v2': 1000, 'Walker2d-v2': 500,
                            'HalfCheetah-v2': 4000, 'Ant-v2': 750}
        buffer_name = buffer_type.replace('env', env_set).replace('crt', str(desire_stop_dict[env_set]))
        replay_buffer.load(buffer_name)
        buffer_name += '_1000K'
        #setting_name = setting_name.replace('crt', str(desire_stop_dict[env_set]))
    elif 'Final' in buffer_type or 'sigma' in buffer_type:
        replay_buffer = utils.ReplayBuffer()
        buffer_name = buffer_type.replace('env', env_set)
        replay_buffer.load(buffer_name)
    elif 'optimal' in buffer_type:
        buffer_name = buffer_type.replace('env', env_set)
        replay_buffer = utils.ReplayBuffer()
        replay_buffer.load(buffer_name)
    else:
        raise FileNotFoundError('! Unknown type of dataset %s' % buffer_type)

    training_iters, epoch = 0, 0
    while training_iters < max_timesteps:
        epoch += eval_freq * batch_size / 1e6

        bcq_state_dict = policy.train(replay_buffer, iterations=int(eval_freq),
                                      batch_size=batch_size, logger=logger)
        if (training_iters % save_freq == 0):
            logger.save_state(bcq_state_dict, training_iters)

        avgtest_reward = evaluate_policy(policy, test_env)
        training_iters += eval_freq

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('AverageTestEpRet', avgtest_reward)
        logger.log_tabular('TotalSteps', training_iters)
        logger.log_tabular('QLoss', average_only=True)
        logger.log_tabular('Q1Vals', with_min_and_max=True)
        logger.log_tabular('Q2Vals', with_min_and_max=True)
        logger.log_tabular('ActLoss', with_min_and_max=True)
        logger.dump_tabular()
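# Example call for bcq_learn(), under the assumption that the offline buffer
# named by buffer_type has already been saved to disk in the format that
# utils.ReplayBuffer.load() expects; the output directory is a placeholder.
if __name__ == '__main__':
    bcq_learn(env_set='Hopper-v2', seed=0,
              buffer_type='FinalSigma0.5_env_0_1000K',
              batch_size=100, eval_freq=int(1e2), max_timesteps=float(2e6),
              lr=1e-3, save_freq=int(1e2),
              logger_kwargs=dict(output_dir='/tmp/bcq_hopper', exp_name='bcq_hopper'))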
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) def update(): # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data[ 'adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
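# Minimal sketch of launching the PyTorch ppo() above. The environment id,
# network sizes, process count, and output directory are illustrative;
# mpi_fork is the usual Spinning Up helper (an assumption, not shown above).
if __name__ == '__main__':
    import gym
    from spinup.utils.mpi_tools import mpi_fork  # assumed import path

    mpi_fork(4)  # optional: split steps_per_epoch across four processes
    ppo(lambda: gym.make('Walker2d-v2'),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        gamma=0.99, clip_ratio=0.2, target_kl=0.01,
        steps_per_epoch=4000, epochs=50,
        logger_kwargs=dict(output_dir='/tmp/ppo_walker', exp_name='ppo_walker'))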
def vpg_linesearch_penalty(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-3, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, backtrack_iters=500, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers ##train_pi is not used train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) ##op for taking a gradient step ## use AdamOptimizer as it is adjusts the learning rate opt = tf.train.AdamOptimizer(learning_rate=pi_lr) pi_name = "pi" scope_variable = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=pi_name) grads_and_vars = opt.compute_gradients(pi_loss, scope_variable) pi_grad_step_op = opt.apply_gradients(grads_and_vars) pi_grad_norm = tf.global_norm([item[0] for item in grads_and_vars]) #policy params - need to get and save policy params pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph = tf.placeholder(tf.float32, shape=gradient.shape) get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def batch_run(inputs): penalty = 0 aa, v_t, logp_t = sess.run(get_action_ops, feed_dict=inputs) for aindex in range(len(aa)): o, r, d, _ = env.step(aa[aindex]) penalty = penalty + env.penalty_sa(o, aa[aindex]) if d: o = env.reset() break #print("batch_run, size {}, Penaty {} ".format(aindex,env.penalty_sa(o,aa[aindex]))) return penalty def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step #sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) ##linesearch - backtracking iteration using penalty save_params = sess.run(get_pi_params) for j in range(backtrack_iters): old_penalty = batch_run(inputs) pi_l_old = sess.run([pi_loss], feed_dict=inputs) sess.run(pi_grad_step_op, feed_dict=inputs) new_penalty = batch_run(inputs) pi_l_new = sess.run([pi_loss], feed_dict=inputs) if new_penalty < 
old_penalty: #if pi_l_new <= pi_l_old: #print("Accepting params at iter {} pi_l_new={} pi_l_old={}".format(j, pi_l_new, pi_l_old)) #print("Accepting params at iter {} new_penalty={} old_penalty={}".format(j, new_penalty, old_penalty)) save_params = sess.run(get_pi_params) sess.run(set_pi_params, feed_dict={v_ph: save_params}) #break sess.run(set_pi_params, feed_dict={v_ph: save_params}) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! update() # Log info about epoch #logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', average_only=True) #logger.log_tabular('EpLen', average_only=True) #logger.log_tabular('VVals', with_min_and_max=True) #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) #logger.log_tabular('LossPi', average_only=True) #logger.log_tabular('LossV', average_only=True) #logger.log_tabular('DeltaLossPi', average_only=True) #logger.log_tabular('DeltaLossV', average_only=True) #logger.log_tabular('Entropy', average_only=True) #logger.log_tabular('KL', average_only=True) #logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
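# Hypothetical usage sketch for vpg_linesearch_penalty(). batch_run() inside the
# update calls env.penalty_sa(o, a), so the environment factory must return an
# env exposing that method. PenaltyWrapper below is a toy stand-in for such an
# environment, not part of the original code.
if __name__ == '__main__':
    import gym

    class PenaltyWrapper(gym.Wrapper):
        """Toy wrapper adding the penalty_sa(obs, act) hook the algorithm expects."""
        def penalty_sa(self, obs, act):
            return float(np.sum(np.square(act)))  # illustrative quadratic action penalty

    vpg_linesearch_penalty(lambda: PenaltyWrapper(gym.make('HalfCheetah-v2')),
                           actor_critic=core.mlp_actor_critic,
                           ac_kwargs=dict(hidden_sizes=(64, 64)),
                           pi_lr=3e-3, vf_lr=1e-3, backtrack_iters=500,
                           steps_per_epoch=4000, epochs=50,
                           logger_kwargs=dict(output_dir='/tmp/vpg_penalty'))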
def klucb_bs_sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # mu = tf.squeeze(mu,axis=1) # pi = tf.squeeze(pi,axis=1) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) print(mu.shape, pi.shape, logp_pi.shape, q1.shape, q2.shape, q1_pi.shape, q2_pi.shape, v.shape, tf.expand_dims(d_ph, 1).shape, tf.expand_dims(d_ph, 1).shape, v_targ.shape) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient( tf.expand_dims(r_ph, 1) + gamma * (1 - tf.expand_dims(d_ph, 1)) * v_targ) # q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def get_action(o, head, deterministic=False): # act_op = mu[:,p_head,:] if deterministic else pi[:,p_head,:] act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0, head, :] def test_agent(n, head): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 # head = np.random.randint(num_heads, size = 1)[0] while not (d or 
(ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, head, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs policy = LM_DSEE(ac_kwargs['num_heads'], rho=0.49, lower=-50, amplitude=4450) policy.startGame() returns = [] choices = [] head = policy.choice() # print ('Total number of heads', ac_kwargs['num_heads']) # Main loop: collect experience in env and update/log each epoch train_end = start_time for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o, head) else: a = env.action_space.sample() # a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ train_start = time.time() # print (t//steps_per_epoch, "Playing time", train_start - train_end) policy.getReward(head, ep_ret) returns.append(ep_ret) choices.append(head) head = policy.choice() for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # tic = time.time() outs = sess.run(step_ops, feed_dict) # toc = time.time() # print (toc-tic) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 train_end = time.time() # print (t//steps_per_epoch, "Training time", train_end - train_start) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: test_start = time.time() epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
head = policy.choice() test_agent(n=10, head=head) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() test_end = time.time() # print (t//steps_per_epoch, "Testing time", test_end - test_start) # print ("*"*30) print(returns, choices)
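# Hedged launch sketch for the bandit-over-heads SAC variant above. The
# actor_critic must produce multi-head outputs indexed as mu[:, head, :], and
# ac_kwargs must carry 'num_heads' (read when constructing LM_DSEE); every value
# here is a placeholder, and the plain core.mlp_actor_critic default may not
# satisfy the multi-head contract.
if __name__ == '__main__':
    import gym

    klucb_bs_sac(lambda: gym.make('Hopper-v2'),
                 ac_kwargs=dict(hidden_sizes=(256, 256), num_heads=4),
                 steps_per_epoch=5000, epochs=100, alpha=0.2,
                 logger_kwargs=dict(output_dir='/tmp/klucb_bs_sac'))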
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q = ac.q(o, a) assert len(q.shape) == 1 and q.shape[ 0] == batch_size, 'Expected shape (%d,), got %s' % (batch_size, str(q.shape)) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). 
if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
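# Minimal sketch for the PyTorch ddpg() above; the environment id, noise level,
# and output directory are illustrative placeholders.
if __name__ == '__main__':
    import gym

    ddpg(lambda: gym.make('Pendulum-v0'),
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(hidden_sizes=(256, 256)),
         gamma=0.99, polyak=0.995, act_noise=0.1,
         steps_per_epoch=4000, epochs=100,
         logger_kwargs=dict(output_dir='/tmp/ddpg_pendulum', exp_name='ddpg_pendulum'))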
def sad(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=100, demo_file=''): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac).to(device) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Expert replay buffer demo_buffer = DemoBuffer() demo_buffer.load(demo_file) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] o, a, r, o2, d = Variable(o), Variable(a), Variable(r), Variable( o2), Variable(d) q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.cpu().detach().numpy(), Q2Vals=q2.cpu().detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] o = Variable(o) pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.cpu().detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. 
pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) # all_obs, all_action = torch.tensor(demo_buffer.obs_buf, device=device), demo_buffer.act_buf # stack = [] # global counter # counter = 0 def get_action(o, deterministic=False): o = torch.as_tensor(o, device=device, dtype=torch.float32) # norm = torch.norm(all_obs - o, dim=1) # idx_min = torch.argmin(norm) # if norm[idx_min] < 0.3: # action = all_action[idx_min.item()] # stack.append(1) # else: action = ac.act(o, deterministic).cpu().numpy() return action def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 writer.add_scalar(tag='test_reward', scalar_value=ep_ret, global_step=t) test_reward_buffer.append((t, ep_ret)) logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 for i in range(update_every * 10): batch = demo_buffer.sample_batch(batch_size) update(batch) # if i % update_every == 1: # logger.log_tabular('LossQ', average_only=True) # logger.dump_tabular() # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) writer.add_scalar(tag='train_reward', scalar_value=ep_ret, global_step=t) train_reward_buffer.append((t, ep_ret)) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) # batch = core.merge_batch(batch1, batch2) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) output_dir = logger_kwargs['output_dir'] + '/' test_rewards = np.array(test_reward_buffer) train_rewards = np.array(train_reward_buffer) train_file_name = os.path.join( output_dir, '{}_train_rewards.npy'.format(seed)) test_file_name = os.path.join( output_dir, '{}_test_rewards.npy'.format(seed)) np.save(train_file_name, train_rewards) np.save(test_file_name, test_rewards) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) # logger.log_tabular('Demo action', len(stack)) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() writer.close()
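# --- Illustrative sketch (assumed helper, not part of the SAC code above) ---
# Minimal restatement of the entropy-regularized, clipped double-Q target built in
# compute_loss_q(): y = r + gamma * (1 - d) * (min(Q1', Q2')(s', a') - alpha * log pi(a'|s')),
# where a' is sampled from the *current* policy at the next state.
import torch

def sac_backup(r, d, q1_pi_targ, q2_pi_targ, logp_a2, gamma=0.99, alpha=0.2):
    """Bellman target for a batch of transitions; all arguments are 1-D tensors."""
    q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)              # clipped double-Q
    return r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)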
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, n_test_episodes=100): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. n_test_episodes (int): Number of episodes for test agent evaluation at the end of each epoch. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
#setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() def space_dim(space): if isinstance(space, Box): return space.shape elif isinstance(space, Discrete): return space.n else: raise ValueError obs_dim = space_dim(env.observation_space) act_dim = space_dim(env.action_space) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # if torch.cuda.device_count() > 1: # ac.pi = nn.DataParallel(ac.pi) # ac.v = nn.DataParallel(ac.v) ac.to(device) # Sync params across processes #sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = steps_per_epoch #local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() #mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() #mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) def test_agent(): test_env = env_fn() o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0 num_episodes = 0 while num_episodes < n_test_episodes: a, _, _ = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) o2, r, d, _ = env.step(a) test_ep_ret += r test_ep_len += 1 o = o2 timeout = ep_len == max_ep_len terminal = d or timeout if timeout or terminal: logger.store(TestEpRet=test_ep_ret) num_episodes += 1 o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0 # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) 
logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! update() test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) wandb.log(logger.log_current_row, step=epoch) logger.dump_tabular()
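# --- Illustrative sketch (assumption: VPGBuffer.finish_path uses standard GAE-lambda) ---
# A scipy-free version of the advantage estimate the buffer is expected to compute.
# `rews` and `vals` are 1-D float arrays with the bootstrap value appended as the last element.
import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    """Return GAE-lambda advantages A_t = sum_l (gamma*lam)^l * delta_{t+l}."""
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # one-step TD residuals
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv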
def ppo(workload_file, model_path, ac_kwargs=dict(), seed=0, traj_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, pre_trained=0, trained_model=None, attn=False, shuffle=False, backfil=False, skip=False, score_type=0, batch_job_slice=0, sched_algo=4): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = HPCEnvSkip(shuffle=shuffle, backfil=backfil, skip=skip, job_score_type=score_type, batch_job_slice=batch_job_slice, build_sjf=False, sched_algo=sched_algo) env.seed(seed) env.my_init(workload_file=workload_file, sched_file=model_path) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['attn'] = attn # Inputs to computation graph buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE, gamma, lam) if pre_trained: sess = tf.Session() model = restore_tf_graph(sess, trained_model) logger.log('load pre-trained model') # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) x_ph = model['x'] a_ph = model['a'] mask_ph = model['mask'] adv_ph = model['adv'] ret_ph = model['ret'] logp_old_ph = model['logp_old_ph'] pi = model['pi'] v = model['v'] # logits = model['logits'] out = model['out'] logp = model['logp'] logp_pi = model['logp_pi'] pi_loss = model['pi_loss'] v_loss = model['v_loss'] approx_ent = model['approx_ent'] approx_kl = model['approx_kl'] clipfrac = model['clipfrac'] clipped = model['clipped'] # Optimizers # graph = tf.get_default_graph() # op = sess.graph.get_operations() # [print(m.values()) for m in op] # train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0') # train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0') train_pi = tf.get_collection("train_pi")[0] train_v = tf.get_collection("train_v")[0] # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad') # train_pi = train_pi_optimizer.minimize(pi_loss) # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad') # train_v = train_v_optimizer.minimize(v_loss) # sess.run(tf.variables_initializer(train_pi_optimizer.variables())) # sess.run(tf.variables_initializer(train_v_optimizer.variables())) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, out] else: x_ph, a_ph = placeholders_from_spaces(env.observation_space, env.action_space) # y_ph = placeholder(JOB_SEQUENCE_SIZE*3) # 3 is the number of sequence features mask_ph = placeholder(env.action_space.n) adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, out] # Experience buffer # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # 
pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = tf.train.AdamOptimizer( learning_rate=pi_lr).minimize(pi_loss) train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) tf.add_to_collection("train_pi", train_pi) tf.add_to_collection("train_v", train_v) # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v}) logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph, 'adv': adv_ph, 'mask': mask_ph, 'ret': ret_ph, 'logp_old_ph': logp_old_ph }, outputs={ 'pi': pi, 'v': v, 'out': out, 'pi_loss': pi_loss, 'logp': logp, 'logp_pi': logp_pi, 'v_loss': v_loss, 'approx_ent': approx_ent, 'approx_kl': approx_kl, 'clipped': clipped, 'clipfrac': clipfrac }) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset( ), 0, False, 0, 0, 0, 0, 0, 0 # Main loop: collect experience in env and update/log each epoch start_time = time.time() for epoch in range(epochs): t = 0 discard_times = 0 while True: # [no_skip, skip] lst = [1, 1] #for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES): # job = o[i:i + JOB_FEATURES] # # the skip time of will_skip job exceeds MAX_SKIP_TIME # if job[-2] == 1.0: # lst = [1,0] a, v_t, logp_t, output = sess.run(get_action_ops, feed_dict={ x_ph: o.reshape(1, -1), mask_ph: np.array(lst).reshape(1, -1) }) # print(a, end=" ") ''' action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs) log_action_prob = np.log(action_probs[action]) ''' if buf.ptr - buf.path_start_idx >= 10 * JOB_SEQUENCE_SIZE or buf.ptr >= buf.max_size: discard_times += 1 buf.ptr = buf.path_start_idx [ o, co ], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset( ), 0, False, 0, 0, 0, 0, 0, 0 continue # save and log buf.store(o, None, a, np.array(lst), r, v_t, logp_t) logger.store(VVals=v_t) if a[0] == 1: skip_count += 1 o, r, d, r2, sjf_t, f1_t = env.step(a[0]) ep_ret += r ep_len += 1 show_ret += r2 sjf += sjf_t f1 += f1_t if d: t += 1 buf.finish_path(r) logger.store(EpRet=ep_ret, EpLen=ep_len, ShowRet=show_ret, SJF=sjf, F1=f1, SkipRatio=skip_count / ep_len) [ o, co ], r, d, ep_ret, ep_len, show_ret, sjf, 
f1, skip_count = env.reset( ), 0, False, 0, 0, 0, 0, 0, 0 if t >= traj_per_epoch: # print ("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action) break # print("Sample time:", (time.time()-start_time)/num_total, num_total) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! # start_time = time.time() update() # print("Train time:", time.time()-start_time) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('ShowRet', average_only=True) logger.log_tabular('SJF', average_only=True) logger.log_tabular('F1', average_only=True) logger.log_tabular('SkipRatio', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
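# --- Illustrative sketch (numpy restatement, not the TF graph above) ---
# The surrogate built with tf.where is equivalent to the usual clipped PPO objective:
#     L = -E[ min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) ],  ratio = exp(logp - logp_old)
import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    """Clipped PPO policy loss for batches of log-probs and advantages (1-D arrays)."""
    ratio = np.exp(logp - logp_old)
    clipped_adv = np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    return -np.mean(np.minimum(ratio * adv, clipped_adv))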
def ude_td3_ConcreteD_batchP( env_fn, render_env=False, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, reward_scale=5, without_start_steps=True, batch_size=100, # TODO: change it back to 10000 start_steps=10000, #start_steps=10000, without_delay_train=False, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, n_post_action=10, uncertainty_method='dropout', sample_obs_std=1, uncertainty_driven_exploration=False, uncertainty_policy_delay=5000, dropout_rate=0.1, concentration_factor=0.1, minimum_exploration_level=0): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" # TODO: Test no start steps if without_start_steps: start_steps = batch_size logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) print('Creating networks ...') # Main outputs from computation graph with tf.variable_scope('main'): pi, _, pi_dropout_mask_generator, pi_dropout_mask_phs, \ q1, _, q1_dropout_mask_generator, q1_dropout_mask_phs, q1_pi, _, \ q2, _, q2_dropout_mask_generator, q2_dropout_mask_phs = actor_critic(x_ph, a_ph, **ac_kwargs, dropout_rate=0) # Random Network Distillation with tf.variable_scope('random_net_distill'): # RND Target and Predictor Network rnd_lr = 1e-3 rnd_targ_act, \ rnd_pred_act, rnd_pred_act_reg, rnd_pred_act_dropout_mask_generator, rnd_pred_act_dropout_mask_phs, \ rnd_targ_cri, \ rnd_pred_cri, rnd_pred_cri_reg, rnd_pred_cri_dropout_mask_generator, rnd_pred_cri_dropout_mask_phs = core.random_net_distill(x_ph, a_ph, **ac_kwargs, dropout_rate=0) # TODO: add environment model learning transition dynamics # TODO: Calculate Uncertainty of Q-value function # Initialize uncertainty module obs_set_size = 10 track_obs_set_unc_frequency = 100 # every 100 steps pi_unc_module = DropoutUncertaintyModule( act_dim, obs_dim, n_post_action, obs_set_size, track_obs_set_unc_frequency, x_ph, a_ph, ac_kwargs, dropout_rate, logger_kwargs, tf_var_scope_main='main', tf_var_scope_target='target', tf_var_scope_rnd='random_net_distill') # Target policy network with tf.variable_scope('target'): pi_targ, _, pi_dropout_mask_generator_targ, pi_dropout_mask_phs_targ, \ _, _, _, _, _, _, \ _, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs, dropout_rate=dropout_rate) pi_targ = pi_targ[0] # Target Q networks with tf.variable_scope('target', reuse=True): # TODO: add with_out_policy_smoothing # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, _, _, _, \ q1_targ, _, q1_dropout_mask_generator_targ, q1_dropout_mask_phs_targ, _, _, \ q2_targ, _, q2_dropout_mask_generator_targ, q2_dropout_mask_phs_targ = actor_critic(x2_ph, a2, **ac_kwargs, dropout_rate=dropout_rate) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, logger_fname='experiences_log.txt', **logger_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # TODO: use conservative estimation of Q # Bellman backup for Q functions, using Clipped Double-Q targets def post_sample_q1_and_q2(feed_dictionary, batch_size): dropout_masks_set_q1 = q1_dropout_mask_generator_targ.generate_dropout_mask( n_post_action) dropout_masks_set_q2 = q2_dropout_mask_generator_targ.generate_dropout_mask( n_post_action) q1_targ_post = np.zeros((n_post_action, 
batch_size)) q2_targ_post = np.zeros((n_post_action, batch_size)) for mask_i in range(len(q1_dropout_mask_phs_targ)): feed_dictionary[q1_dropout_mask_phs_targ[ mask_i]] = dropout_masks_set_q1[mask_i] feed_dictionary[q2_dropout_mask_phs_targ[ mask_i]] = dropout_masks_set_q2[mask_i] q1_targ_post = sess.run(q1_targ, feed_dict=feed_dictionary) q2_targ_post = sess.run(q2_targ, feed_dict=feed_dictionary) min_q_targ = np.minimum(q1_targ_post.mean(axis=1), q2_targ_post.mean(axis=1)) return min_q_targ # min_q_targ = tf.placeholder(dtype=tf.float32) # backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*min_q_targ) min_q_targ = tf.minimum(q1_targ[0], q2_targ[0]) backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi[0]) q1_loss = tf.reduce_mean((q1[0] - backup)**2) q2_loss = tf.reduce_mean((q2[0] - backup)**2) q_loss = q1_loss + q2_loss # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # RND losses and train ops rnd_loss_act = tf.reduce_mean( (rnd_pred_act[0] - rnd_targ_act)**2) + rnd_pred_act_reg / batch_size rnd_optimizer_act = tf.train.AdamOptimizer(learning_rate=rnd_lr) train_rnd_op_act = rnd_optimizer_act.minimize( rnd_loss_act, var_list=get_vars('random_net_distill/rnd_pred_act')) rnd_loss_cri = tf.reduce_mean( (rnd_pred_cri[0] - rnd_targ_cri)**2) + rnd_pred_cri_reg / batch_size rnd_optimizer_cri = tf.train.AdamOptimizer(learning_rate=rnd_lr) train_rnd_op_cri = rnd_optimizer_cri.minimize( rnd_loss_cri, var_list=get_vars('random_net_distill/rnd_pred_cri')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def set_dropout_mask_and_post_number_to_one(feed_dictionary, *argv): """Set all dropout masks and post sample number in argv to one.""" for dropout_mask_ph in argv: for mask_i in range(len(dropout_mask_ph)): feed_dictionary[dropout_mask_ph[mask_i]] = np.ones( [1, dropout_mask_ph[mask_i].shape.as_list()[1]]) return feed_dictionary def set_dropout_mask_randomly_and_post_nuber_to_one( feed_dictionary, mask_phs, mask_generators): if len(mask_phs) != len(mask_generators): raise ValueError('mask_phs and mask_generators do not match.') else: for i in range(len(mask_phs)): dropout_mask_ph = mask_phs[i] dropout_masks = mask_generators[i].generate_dropout_mask( post_size=1) for mask_i in range(len(dropout_mask_ph)): feed_dictionary[ dropout_mask_ph[mask_i]] = dropout_masks[mask_i] return feed_dictionary def get_action_train(o, noise_scale, pi_unc_module, step_index): feed_dictionary = {x_ph: o.reshape(1, -1)} # Set dropout masks to one feed_dictionary = set_dropout_mask_and_post_number_to_one( feed_dictionary, rnd_pred_act_dropout_mask_phs, rnd_pred_cri_dropout_mask_phs, pi_dropout_mask_phs, q1_dropout_mask_phs, q2_dropout_mask_phs) # RND actor rnd_t_act, rnd_p_act = sess.run([rnd_targ_act, rnd_pred_act], 
feed_dict=feed_dictionary) rnd_t_act = rnd_t_act[0] rnd_p_act = rnd_p_act[0] rnd_e_act = np.sqrt(np.sum(rnd_p_act - rnd_t_act)**2) # Generate action if uncertainty_driven_exploration: # 1. Generate action Prediction a_prediction = sess.run(pi, feed_dict=feed_dictionary)[0][0] # 2. Generate post sampled actions # TODO: get covariance based on online and target policy respectively and calculate the difference a_post = pi_unc_module.get_post_samples_act(o, sess, step_index) rnd_a_post = pi_unc_module.get_post_samples_rnd_act( o, sess, step_index) # 3. Generate uncertainty-driven exploratory action a = np.zeros((act_dim, )) if act_dim > 1: # TODO: compute correlation rather than covariance a_cov = np.cov(a_post, rowvar=False) a_cov_shaped = concentration_factor * a_cov rnd_a_cov = np.cov(rnd_a_post, rowvar=False) a = np.random.multivariate_normal(a_prediction, a_cov_shaped, 1)[0] unc_a = a_cov unc_rnd_a = rnd_a_cov else: a_std = np.std(a_post, axis=0) a_std_shaped = concentration_factor * a_std + minimum_exploration_level * np.ones( a_std.shape) rnd_a_cov = np.std(rnd_a_post, axis=0) # TODO: only keep one a = np.random.normal(a_prediction, a_std_shaped, 1)[0] unc_a = a_std unc_rnd_a = rnd_a_cov else: for mask_i in range(len(pi_dropout_mask_phs)): feed_dictionary[pi_dropout_mask_phs[mask_i]] = np.ones( [1, pi_dropout_mask_phs[mask_i].shape.as_list()[1]]) a = sess.run(pi, feed_dict=feed_dictionary)[0][0] a += noise_scale * np.random.randn(act_dim) unc_a = 0 unc_rnd_a = 0 a = np.clip(a, -act_limit, act_limit) # TODO: use uncertainty as intrinsic reward unc_based_reward = np.mean(np.abs(unc_a)) # TODO: should the a_ph be a or a_prediction?? feed_dictionary[a_ph] = a.reshape(1, -1) # Generate post sampled q values q1_post, q2_post = pi_unc_module.get_post_samples_q( o, a, sess, step_index) rnd_q_post = pi_unc_module.get_post_samples_rnd_cri( o, a, sess, step_index) unc_q1 = np.std(q1_post, axis=0) unc_q2 = np.std(q2_post, axis=0) unc_rnd_q = np.std(rnd_q_post, axis=0) q1_pred = sess.run(q1, feed_dict=feed_dictionary)[0][0] q2_pred = sess.run(q2, feed_dict=feed_dictionary)[0][0] # RND critic rnd_t_cri, rnd_p_cri = sess.run([rnd_targ_cri, rnd_pred_cri], feed_dict=feed_dictionary) rnd_t_cri = rnd_t_cri[0] rnd_p_cri = rnd_p_cri[0] rnd_e_cri = np.sqrt(np.sum(rnd_p_cri - rnd_t_cri)**2) return a, \ q1_pred, q2_pred, q1_post, q2_post,\ unc_a, unc_based_reward, unc_q1, unc_q2,\ unc_rnd_a, unc_rnd_q,\ rnd_t_act, rnd_p_act, rnd_e_act,\ rnd_t_cri, rnd_p_cri, rnd_e_cri def get_action_test(o): """Get deterministic action without exploration.""" feed_dictionary = {x_ph: o.reshape(1, -1)} for mask_i in range(len(pi_dropout_mask_phs)): feed_dictionary[pi_dropout_mask_phs[mask_i]] = np.ones( [1, pi_dropout_mask_phs[mask_i].shape.as_list()[1]]) a = sess.run(pi, feed_dict=feed_dictionary)[0][0] return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action_test(o)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ep_q1_var, ep_q2_var, ep_unc_a, ep_unc_q1, ep_unc_q2, \ ep_unc_rnd_a, ep_unc_rnd_q, ep_rnd_e_act, ep_rnd_e_cri = 0, 0, 0, 0, 0, 0, 0, 0, 0 total_steps = steps_per_epoch * epochs # No dropout and no post sample for training phase: set all dropout masks to 1 and post_size to 1 feed_dict_train = {} 
feed_dict_train = set_dropout_mask_and_post_number_to_one( feed_dict_train, pi_dropout_mask_phs, q1_dropout_mask_phs, q2_dropout_mask_phs, pi_dropout_mask_phs_targ, q1_dropout_mask_phs_targ, q2_dropout_mask_phs_targ, rnd_pred_act_dropout_mask_phs, rnd_pred_cri_dropout_mask_phs) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: # import pdb; pdb.set_trace() a, \ q1_pred, q2_pred, q1_post, q2_post,\ unc_a, unc_based_reward, unc_q1, unc_q2, \ unc_rnd_a, unc_rnd_q, \ rnd_t_act, rnd_p_act, rnd_e_act,\ rnd_t_cri, rnd_p_cri, rnd_e_cri = get_action_train(o, act_noise, pi_unc_module, step_index=t) else: a = env.action_space.sample() # TODO:keep the same dimension with real covariance if uncertainty_driven_exploration: unc_a = np.zeros((act_dim, act_dim)) unc_rnd_a = np.zeros((act_dim, act_dim)) else: unc_a = 0 unc_rnd_a = 0 q1_pred, q2_pred = 0, 0 q1_post = np.zeros((n_post_action, )) q2_post = np.zeros((n_post_action, )) unc_q1, unc_q2, unc_rnd_q = 0, 0, 0 unc_based_reward = 0 rnd_t_act, rnd_p_act, rnd_e_act, rnd_t_cri, rnd_p_cri, rnd_e_cri = 0, 0, 0, 0, 0, 0 # Sample an observation set to track their uncertainty trajectories if t > start_steps: if pi_unc_module.obs_set_is_empty: pi_unc_module.sample_obs_set_from_replay_buffer(replay_buffer) if t % pi_unc_module.track_obs_set_unc_frequency == 0: pi_unc_module.calculate_obs_set_uncertainty( sess, t // steps_per_epoch, t) # TODO: try more frequent update to avoid bad dropout masks (perhaps not necessary because we can get # larger sample size now.) # Update uncertainty policy to current policy if t % uncertainty_policy_delay == 0: # pi_unc_module.uncertainty_policy_update(sess) pi_unc_module.update_weights_of_main_unc(sess) # Step the env if render_env: env.render() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 ep_q1_var += np.var(q1_post) ep_q2_var += np.var(q2_post) # TODO: we cannot use this as uncertainty, because if the policy learns some dimension is correlated then the # corresponding element will be 1 in covariance matrix. ep_unc_a += np.sum(unc_a) ep_unc_rnd_a += np.sum(unc_rnd_a) ep_unc_q1 += unc_q1 ep_unc_q2 += unc_q2 ep_unc_rnd_q += unc_rnd_q ep_rnd_e_act += rnd_e_act ep_rnd_e_cri += rnd_e_cri # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, reward_scale * r, o2, d, t, steps_per_epoch, start_time, unc_a=unc_a, unc_rnd_a=unc_rnd_a, unc_q1=unc_q1, unc_q2=unc_q2, unc_rnd_q=unc_rnd_q, q1_pred=q1_pred, q2_pred=q2_pred, q1_post=q1_post, q2_post=q2_post, rnd_e_act=rnd_e_act, rnd_e_cri=rnd_e_cri) # replay_buffer.store(o, a, r + unc_based_reward, o2, d, uncertainty, t, steps_per_epoch, start_time) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 if without_delay_train: batch = replay_buffer.sample_batch(batch_size) feed_dict_train[x_ph] = batch['obs1'] feed_dict_train[x2_ph] = batch['obs2'] feed_dict_train[a_ph] = batch['acts'] feed_dict_train[r_ph] = batch['rews'] feed_dict_train[d_ph] = batch['done'] # Train Random Net Distillation rnd_step_ops_act = [ rnd_loss_act, rnd_targ_act, rnd_pred_act, train_rnd_op_act ] rnd_outs_act = sess.run(rnd_step_ops_act, feed_dict_train) logger.store(LossRnd=rnd_outs_act[0]) # Train q q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict_train) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict_train) logger.store(LossPi=outs[0]) if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ if not without_delay_train: for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict_train[x_ph] = batch['obs1'] feed_dict_train[x2_ph] = batch['obs2'] feed_dict_train[a_ph] = batch['acts'] feed_dict_train[r_ph] = batch['rews'] feed_dict_train[d_ph] = batch['done'] # Train Random Net Distillation # change dropout masks every training step mask_phs = [ rnd_pred_act_dropout_mask_phs, rnd_pred_cri_dropout_mask_phs ] mask_generators = [ rnd_pred_act_dropout_mask_generator, rnd_pred_cri_dropout_mask_generator ] feed_dict_train = set_dropout_mask_randomly_and_post_nuber_to_one( feed_dict_train, mask_phs, mask_generators) rnd_step_ops_act = [ rnd_loss_act, rnd_targ_act, rnd_pred_act, train_rnd_op_act ] rnd_outs_act = sess.run(rnd_step_ops_act, feed_dict_train) logger.store(LossRndAct=rnd_outs_act[0]) rnd_step_ops_cri = [ rnd_loss_cri, rnd_targ_cri, rnd_pred_cri, train_rnd_op_cri ] rnd_outs_cri = sess.run(rnd_step_ops_cri, feed_dict_train) logger.store(LossRndCri=rnd_outs_cri[0]) # Train Q-value function # feed_dict_train[min_q_targ] = post_sample_q1_and_q2(feed_dict_train, batch_size) q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict_train) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict_train) logger.store(LossPi=outs[0]) # No weight update delay pi_unc_module.update_weights_of_rnd_unc(sess) logger.store(EpRet=ep_ret, EpLen=ep_len, EpQ1Var=ep_q1_var, EpQ2Var=ep_q2_var, EpUncAct=ep_unc_a, EpUncRndAct=ep_unc_rnd_a, EpUncQ1=ep_unc_q1, EpUncQ2=ep_unc_q2, EpUncRndQ=ep_unc_rnd_q, EpRndErrorAct=ep_rnd_e_act, EpRndErrorCri=ep_rnd_e_cri) o, r, d, ep_ret, ep_len, ep_q1_var, ep_q2_var,\ ep_unc_a, ep_unc_q1, ep_unc_q2,\ ep_unc_rnd_a, ep_unc_rnd_q,\ ep_rnd_e_act, ep_rnd_e_cri = env.reset(), 0, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossRndAct', average_only=True) logger.log_tabular('LossRndCri', average_only=True) logger.log_tabular('EpQ1Var', with_min_and_max=True) logger.log_tabular('EpQ2Var', with_min_and_max=True) logger.log_tabular('EpUncAct', with_min_and_max=True) logger.log_tabular('EpUncRndAct', with_min_and_max=True) logger.log_tabular('EpUncQ1', with_min_and_max=True) logger.log_tabular('EpUncQ2', with_min_and_max=True) logger.log_tabular('EpUncRndQ', with_min_and_max=True) logger.log_tabular('EpRndErrorAct', with_min_and_max=True) logger.log_tabular('EpRndErrorCri', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
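# --- Illustrative sketch (numpy restatement of the target-policy smoothing above) ---
# TD3 perturbs the target policy's action with clipped Gaussian noise before forming
# the Bellman target, then clips back into the valid action range.
import numpy as np

def smoothed_target_action(pi_targ, act_limit, target_noise=0.2, noise_clip=0.5):
    """Return a2 = clip(pi_targ + clip(eps, -noise_clip, noise_clip), -act_limit, act_limit)."""
    eps = np.clip(target_noise * np.random.randn(*np.shape(pi_targ)), -noise_clip, noise_clip)
    return np.clip(pi_targ + eps, -act_limit, act_limit)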
def oac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # env params thresh = rl_params['thresh'] # control params seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] max_noop = rl_params['max_noop'] save_freq = rl_params['save_freq'] render = rl_params['render'] # rl params gamma = rl_params['gamma'] polyak = rl_params['polyak'] lr = rl_params['lr'] grad_clip_val = rl_params['grad_clip_val'] # entropy params alpha = rl_params['alpha'] target_entropy_start = rl_params['target_entropy_start'] target_entropy_stop = rl_params['target_entropy_stop'] target_entropy_steps = rl_params['target_entropy_steps'] # optimistic exploration params use_opt = rl_params['use_opt'] beta_UB = rl_params['beta_UB'] beta_LB = rl_params['beta_LB'] delta = rl_params['delta'] opt_lr = rl_params['opt_lr'] max_opt_steps = rl_params['max_opt_steps'] train_env, test_env = env_fn(), env_fn() obs_space = env.observation_space act_space = env.action_space # get the size after resize obs_dim = network_params['input_dims'] act_dim = act_space.n # set the seed tf.set_random_seed(seed) np.random.seed(seed) train_env.seed(seed) train_env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=obs_dim[2]) test_state_buffer = StateBuffer(m=obs_dim[2]) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None) # alpha and entropy setup max_target_entropy = tf.log(tf.cast(act_dim, tf.float32)) target_entropy_prop_ph = tf.placeholder(dtype=tf.float32, shape=()) target_entropy = max_target_entropy * target_entropy_prop_ph log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, action_probs, log_action_probs, action_logits, q1_logits, q2_logits = build_models( x_ph, a_ph, act_dim, network_params) with tf.variable_scope('main', reuse=True): _, _, action_probs_next, log_action_probs_next, _, _, _ = build_models( x2_ph, a_ph, act_dim, network_params) # Target value network with tf.variable_scope('target'): _, _, _, _, _, q1_logits_targ, q2_logits_targ = build_models( x2_ph, a_ph, act_dim, network_params) # Count variables var_counts = tuple( count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print("""\nNumber of parameters: alpha: %d, pi: %d, q1: %d, q2: %d, total: %d\n""" % var_counts) if use_opt: # Optimistic Exploration mu_Q = (q1_logits + q2_logits) / 2.0 sigma_Q = tf.math.abs(q1_logits - q2_logits) / 2.0 Q_UB = mu_Q + beta_UB * sigma_Q Q_LB = mu_Q + beta_LB * sigma_Q Q_UB_sm = tf.nn.softmax( Q_UB, axis=-1 ) # needed to make EV and penalty proportional for optimisation R = tf.get_variable('R', dtype=tf.float32, shape=[1, act_dim], initializer=tf.random_normal_initializer( mean=0.0, stddev=0.01)) assign_R = R.assign( action_logits ) # initialises P as the same "pessimistic" action 
distribution P = tf.nn.softmax(R, axis=-1) expected_value = tf.reduce_sum(tf.multiply(P, Q_UB_sm)) KL_P_PT = tf.reduce_sum( tf.multiply(P, tf.log(tf.divide(P, action_probs)))) penalty = KL_P_PT - delta relu_penalty = tf.nn.relu(penalty) penalised_opt_function = -expected_value + relu_penalty optpi_optimizer = tf.train.AdamOptimizer(learning_rate=opt_lr) train_optpi_op = optpi_optimizer.minimize(penalised_opt_function, var_list=get_vars('R')) optimistic_policy_dist = tf.distributions.Categorical(probs=P) optimistic_pi = optimistic_policy_dist.sample() else: optimistic_pi = pi # use standard SAC policy Q_LB = tf.minimum(q1_logits, q2_logits) # Min Double-Q: min_q_logits_targ = tf.minimum(q1_logits_targ, q2_logits_targ) # Targets for Q regression q_backup = r_ph + gamma * (1 - d_ph) * tf.stop_gradient( tf.reduce_sum(action_probs_next * (min_q_logits_targ - alpha * log_action_probs_next), axis=-1)) # critic losses q1_a = tf.reduce_sum(tf.multiply(q1_logits, a_ph), axis=1) q2_a = tf.reduce_sum(tf.multiply(q2_logits, a_ph), axis=1) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) value_loss = q1_loss + q2_loss # policy loss pi_backup = tf.reduce_sum(action_probs * (alpha * log_action_probs - Q_LB), axis=-1) pi_loss = tf.reduce_mean(pi_backup) # alpha loss for temperature parameter pi_entropy = -tf.reduce_sum(action_probs * log_action_probs, axis=-1) alpha_backup = tf.stop_gradient(target_entropy - pi_entropy) alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup) # Policy train op # (has to be separate from value train op, because q1_logits appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) if grad_clip_val is not None: gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients( value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize( value_loss, var_list=get_vars('main/q')) # Alpha train op alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize( alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, q1_a, q2_a, pi_entropy, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session(config=tf_config) 
sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1_a': q1_a, 'q2_a': q2_a }) def get_action(state, deterministic=False): state = state.astype('float32') / 255. # # record data for printing # _ = sess.run(assign_R, feed_dict={x_ph: [state]}) # ins = sess.run([action_probs, Q_UB, P, KL_P_PT], feed_dict={x_ph: [state]}) if deterministic: act_op = mu else: if use_opt: # run a few optimisation steps to set optimistic policy _ = sess.run(assign_R, feed_dict={x_ph: [state]}) for i in range(max_opt_steps): _ = sess.run([train_optpi_op], feed_dict={x_ph: [state]}) act_op = optimistic_pi # # print difference between pessimistic and optimistic policy probabilities # outs = sess.run([P, KL_P_PT], feed_dict={x_ph: [state]}) # print('ap: ', ins[0]) # print('Q: ', ins[1]) # print('P_in: ', ins[2]) # print('P_out: ', outs[0]) # print('KL_in: ', ins[3]) # print('KL_out: ', outs[1]) # print('') return sess.run(act_op, feed_dict={x_ph: [state]})[0] def reset(env, state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # fire to start game and perform no-op for some frames to randomise start o, _, _, _ = env.step(1) # Fire action to start game for _ in range(np.random.randint(1, max_noop)): o, _, _, _ = env.step(0) # Action 'NOOP' o = process_image_observation(o, obs_dim, thresh) r = process_reward(r) old_lives = env.ale.lives() state = state_buffer.init_state(init_obs=o) return o, r, d, ep_ret, ep_len, old_lives, state def test_agent(n=10, render=True): global sess, mu, pi, q1, q2 for j in range(n): o, r, d, ep_ret, ep_len, test_old_lives, test_state = reset( test_env, test_state_buffer) terminal_life_lost_test = False if render: test_env.render() while not (d or (ep_len == max_ep_len)): # start by firing if terminal_life_lost_test: a = 1 else: # Take lower variance actions at test(noise_scale=0.05) a = get_action(test_state, True) # Take deterministic actions at test time o, r, d, _ = test_env.step(a) o = process_image_observation(o, obs_dim, thresh) r = process_reward(r) test_state = test_state_buffer.append_state(o) ep_ret += r ep_len += 1 if test_env.ale.lives() < test_old_lives: test_old_lives = test_env.ale.lives() terminal_life_lost_test = True else: terminal_life_lost_test = False if render: test_env.render() if render: test_env.close() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # ================== Main training Loop ================== start_time = time.time() o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer) total_steps = steps_per_epoch * epochs target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) save_iter = 0 terminal_life_lost = False # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # press fire to start if terminal_life_lost: a = 1 else: if t > start_steps: a = get_action(state) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) o2 = process_image_observation(o2, obs_dim, thresh) r = process_reward(r) one_hot_a = process_action(a, act_dim) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 if train_env.ale.lives() < old_lives: old_lives = train_env.ale.lives() terminal_life_lost = True else: terminal_life_lost = False # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that 
isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(state, one_hot_a, r, next_state, terminal_life_lost) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 state = next_state if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], target_entropy_prop_ph: target_entropy_prop } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], PiEntropy=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len, old_lives, state = reset( train_env, train_state_buffer) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # update target entropy every epoch target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Save model if save_freq is not None: if (epoch % save_freq == 0) or (epoch == epochs - 1): print('Saving...') logger.save_state({'env': env}, itr=save_iter) save_iter += 1 # Test the performance of the deterministic version of the agent. test_agent(n=10, render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('PiEntropy', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'), show_plot=True)
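# --- Illustrative sketch (numpy restatement of the optimistic bounds above) ---
# The exploration branch treats the two critics as an epistemic mean/spread and forms
# upper/lower bounds: Q_UB = mu + beta_UB * sigma, Q_LB = mu + beta_LB * sigma.
# The default beta values below are placeholders; in the code they come from rl_params.
import numpy as np

def optimistic_q_bounds(q1, q2, beta_UB=1.0, beta_LB=-1.0):
    """Return per-action (Q_UB, Q_LB) from two Q-estimates of shape (..., act_dim)."""
    mu_Q = (np.asarray(q1) + np.asarray(q2)) / 2.0
    sigma_Q = np.abs(np.asarray(q1) - np.asarray(q2)) / 2.0
    return mu_Q + beta_UB * sigma_Q, mu_Q + beta_LB * sigma_Q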
def simple_dqn(env_fn = lambda : gym.make('CartPole-v1') , actor_critic=None , ac_kwargs=dict() , seed=0 , episodes_per_epoch=1000 , epochs=1000 , gamma=0.99 , logger_kwargs=dict() , save_freq=1000 , hidden_dim=32 , n_layers=2 , lr=1e-4 , batch_size=32 , target_update_freq=2500 , final_epsilon=0.05 , finish_decay=50000 , replay_buffer_size=25000 , steps_before_training=5000 , n_test_eps = 10 ): max_steps_per_epoch = 5000 # Global variables num_of_train_epochs = epochs # `number_of_layers` hidden layers with `hidden_dim` units each number_of_layers = n_layers learning_rate = lr discount_factor = gamma # init log logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) #make gym enviornment env = env_fn() obs_dim = env.observation_space.shape[0] number_of_actions = env.action_space.n #define evaluation network with tf.variable_scope('evaluation_network'): #input layer obs_ph = tf.placeholder(dtype=tf.float32, shape=(None,obs_dim), name='obs_ph') #mlp - #mlp (Multi Layer Perceptron) - hidden layers hidden_sizes = [hidden_dim] * number_of_layers x = obs_ph for h in hidden_sizes: x = tf.layers.dense(x, units=h, activation=tf.tanh) #output layer eval_net = tf.layers.dense(x,units=number_of_actions,activation=None) #define taget network with tf.variable_scope('target_network'): #input layer obs_target_ph = tf.placeholder(dtype=tf.float32, shape=(None,obs_dim), name='obs_target_ph') #mlp - #mlp (Multi Layer Perceptron) - hidden layers hidden_sizes = [hidden_dim] * number_of_layers x = obs_target_ph for h in hidden_sizes: x = tf.layers.dense(x, units=h, activation=tf.tanh) #output layer target_net = tf.layers.dense(x,units=number_of_actions,activation=None) #define loss function selected_action_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='selected_action_ph') reward_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='reward_ph') done_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='done_ph') actions_one_hot = tf.one_hot(selected_action_ph, number_of_actions) q_a = tf.reduce_sum(actions_one_hot * eval_net,axis=1) #use target network to approximate TD target = reward_ph + discount_factor * (1-done_ph) * tf.stop_gradient(tf.reduce_max(target_net, axis=1)) loss = tf.reduce_mean((q_a - target)**2) #init replay buffer replay_current_obs = np.zeros([replay_buffer_size, obs_dim], dtype=np.int32) replay_next_obs = np.zeros([replay_buffer_size, obs_dim], dtype=np.int32) replay_selected_action = np.zeros(replay_buffer_size, dtype=np.int32) replay_reward =np.zeros(replay_buffer_size, dtype=np.float32) replay_done = np.zeros(replay_buffer_size, dtype=np.float32) # update op for target network main_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='evaluation_network') target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_network') assign_ops = [tf.assign(target_var, main_var) for target_var, main_var in zip(target_vars, main_vars)] target_update_op = tf.group(*assign_ops) # define train optimizer_operation optimizer_operation = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) # init session session = tf.InteractiveSession() session.run(tf.global_variables_initializer()) logger.setup_tf_saver(session, inputs={'x': obs_ph}, outputs={'q': eval_net}) current_index = replay_buffer_size - 1 #reset train data epoch, step, training_finished, epsilon = 0, 0, False, 1 #reset epoch data epoch_rews, epoch_lens, epoch_losses, epoch_qs = [], [], [], [] #reset episodic data obs, reward, done, ep_rews, ep_len, episode_num, end_of_epoch 
= env.reset(), 0, False, 0, 0, 0, False last_number_steps = 0 while not training_finished: step += 1 #get action - # epsilon greedy selected_action = 0 if np.random.rand() < epsilon : #exploration selected_action = np.random.randint(number_of_actions) else: #exploitation estimated_q = session.run(eval_net, feed_dict={obs_ph: obs.reshape(1,-1)}) selected_action = np.argmax(estimated_q) # preform one step in gym enviornment # receive observation reward and whether the episode has ended obs, reward, done, _ = env.step(selected_action) #store information in replay buffer #TODO deal with first and done replay_next_obs[current_index] = obs current_index = step % replay_buffer_size replay_current_obs[current_index] = obs replay_selected_action[current_index] = selected_action replay_reward[current_index] = reward replay_done[current_index] = done ep_rews += reward ep_len += 1 if done: episode_num += 1 #save episodic data epoch_rews.append(ep_rews) epoch_lens.append(ep_len) #reset episodic data obs, reward, done, ep_rews, ep_len, end_of_epoch = env.reset(), 0, False, 0, 0, episode_num % episodes_per_epoch == 0 #first `steps_before_training` do no train - replay buffer is too small if step > steps_before_training: #single train iteration #get data from replay trained_indices = np.random.randint(min(replay_buffer_size, step), size = batch_size) trained_observation = replay_current_obs[trained_indices] trained_next_observation = replay_next_obs[trained_indices] trained_selected_action = replay_selected_action[trained_indices] trained_reward = replay_reward[trained_indices] trained_done = replay_done[trained_indices] #if (step % save_freq == 0) or (step >= total_number_of_steps - 1): # logger.save_state({'env': env}, None) # train eval network step_loss, curr_q, _ = session.run([loss, q_a, optimizer_operation], feed_dict={obs_ph: trained_observation, obs_target_ph: trained_next_observation, selected_action_ph: trained_selected_action, reward_ph: trained_reward, done_ph: trained_done}) #just for logging epoch_losses.append(step_loss) epoch_qs.append(curr_q) if end_of_epoch: logger.save_state({'env': env}, None) # update target network session.run(target_update_op) epoch += 1 training_finished = epoch >= num_of_train_epochs #test epoch ep_rets, ep_lens = [], [] for _ in range(n_test_eps): obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not(done): #env.render() estimated_q = session.run(eval_net, feed_dict={obs_ph: obs.reshape(1,-1)}) selected_action = np.argmax(estimated_q) obs, rew, done, _ = env.step(selected_action) ep_ret += rew ep_len += 1 ep_rets.append(ep_ret) ep_lens.append(ep_len) test_ep_ret = np.mean(ep_rets) test_ep_len = np.mean(ep_lens) obs, rew, done, ep_ret, ep_len, end_of_epoch = env.reset(), 0, False, 0, 0, False # log epoch results logger.log_tabular('Epoch', epoch) logger.log_tabular('TotalEnvInteracts', step - last_number_steps) logger.log_tabular('loss', np.mean(epoch_losses)) logger.log_tabular('AverageEpRet', np.mean(test_ep_ret)) logger.log_tabular('epispode mean length', np.mean(test_ep_len)) logger.dump_tabular() epoch_rews, epoch_lens, epoch_losses, epoch_qs, last_number_steps= [], [], [], [], step #adapt epsilon epsilon = 1 + (final_epsilon - 1)*min(1, step/finish_decay)
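# The replay-buffer bookkeeping in simple_dqn stores `next_obs` at the previous index before
# advancing `current_index` (see the TODO about the first/done transitions). A more conventional
# layout, sketched here purely as an illustration (not the arrays used above), keeps each
# (obs, act, rew, next_obs, done) tuple together at a single index of a circular buffer:
import numpy as np

class SimpleReplayBufferSketch:
    def __init__(self, obs_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.next_obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.act = np.zeros(size, dtype=np.int32)
        self.rew = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, o, a, r, o2, d):
        # Overwrite the oldest entry once the buffer is full (circular buffer).
        self.obs[self.ptr], self.act[self.ptr], self.rew[self.ptr] = o, a, r
        self.next_obs[self.ptr], self.done[self.ptr] = o2, float(d)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Sample only from the entries that have actually been filled.
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.obs[idx], self.act[idx], self.rew[idx],
                self.next_obs[idx], self.done[idx])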
def my_ddpg(env_fn, seed=0, steps_per_epoch=4000, epochs=100, max_ep_len=1000, hidden_sizes=[256,256], logger_kwargs=dict(), save_freq=1, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, gamma=0.99, polyak=0.995, act_noise=0.1, pi_lr=1e-3, q_lr=1e-3, buffer_size=int(1e6)): """ My DDPG implementation """ # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() test_env = env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] print("env.observation_space", env.observation_space) print("env.observation_space.shape", env.observation_space.shape) print("env.action_space", env.action_space) action_min = env.action_space.low[0] action_max = env.action_space.high[0] if isinstance(env.action_space, gym.spaces.Discrete): print("Discrete action space not supported for my-ddpg!") return # Set up experience buffer buf = ReplayBuffer(obs_dim, act_dim, buffer_size) # Instantiate models assert action_max == abs(action_min) policy = DeterministicPolicyNet(obs_dim, act_dim, hidden_sizes, action_max) policy_target = copy.deepcopy(policy) policy_optimizer = torch.optim.Adam(policy.mu_net.parameters(), lr=pi_lr) q_function = QNet(obs_dim, act_dim, hidden_sizes) q_function_target = copy.deepcopy(q_function) q_optimizer = torch.optim.Adam(q_function.q_net.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(policy) # TODO: Save value network as well # Freeze target networks with respect to optimizers (only update via polyak averaging) for p_targ in policy_target.parameters(): p_targ.requires_grad = False for q_targ in q_function_target.parameters(): q_targ.requires_grad = False # Prepare for interaction with environment num_steps = epochs * steps_per_epoch start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for step in range(num_steps): # TODO: Change to for loop over range(epochs) and range(steps_per_epoch) with torch.no_grad(): if step < start_steps: # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). a = env.action_space.sample() else: assert o.shape == (obs_dim,) a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0)) assert a.shape == (1, act_dim) a = a[0] # Remove batch dimension a = torch.clamp(a + act_noise * torch.randn(act_dim), action_min, action_max) # Add exploration noise a = a.numpy() # Convert to numpy next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d buf.store(o, a, r, next_o, d) # Update obs (critical!) 
o = next_o # Trajectory finished if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 if step >= update_after and step % update_every == 0: for _ in range(update_every): def update(): o, a, r, next_o, d = buf.sample_batch(batch_size) # Compute targets with torch.no_grad(): next_a_targ = policy_target(next_o) next_q_targ = q_function_target(next_o, next_a_targ) q_targ = r + gamma * (1 - d) * next_q_targ # Update Q function q_optimizer.zero_grad() q_loss = ((q_function(o, a) - q_targ)**2).mean() q_loss.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in q_function.parameters(): p.requires_grad = False # Policy function update policy_optimizer.zero_grad() policy_loss = -(q_function(o, policy(o))).mean() policy_loss.backward() policy_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in q_function.parameters(): p.requires_grad = True # Update target networks with polyak with torch.no_grad(): for p, p_targ in zip(policy.parameters(), policy_target.parameters()): p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) for q, q_targ in zip(q_function.parameters(), q_function_target.parameters()): q_targ.data.mul_(polyak) q_targ.data.add_((1 - polyak) * q.data) update() if (step + 1) % steps_per_epoch == 0: epoch = (step + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. def test_agent(): with torch.no_grad(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0)) assert a.shape == (1, act_dim) a = a[0] # Remove batch dimension a = a.numpy() # Convert to numpy o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', step) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
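# `DeterministicPolicyNet` and `QNet` used by my_ddpg above are assumed to be plain MLP modules
# exposing `.mu_net` / `.q_net` attributes. A rough sketch of what the policy side might look
# like, based only on how it is called above (the assert on a symmetric action range suggests a
# scaled tanh output); this is an assumption, not the actual class definition:
import torch
import torch.nn as nn

class DeterministicPolicyNetSketch(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes, act_limit):
        super().__init__()
        layers, in_dim = [], obs_dim
        for h in hidden_sizes:
            layers += [nn.Linear(in_dim, h), nn.ReLU()]
            in_dim = h
        # Final tanh squashes the action into [-1, 1] before scaling.
        layers += [nn.Linear(in_dim, act_dim), nn.Tanh()]
        self.mu_net = nn.Sequential(*layers)
        self.act_limit = act_limit

    def forward(self, obs):
        # Scale the squashed output to the environment's (symmetric) action range.
        return self.act_limit * self.mu_net(obs)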
def ppo(env_fn, expert=None, policy_path=None, actor_critic=core.mlp_actor_critic_m, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=10000, dagger_epochs=500, pretrain_epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=1e-4, dagger_noise=0.01, batch_size=64, replay_size=int(5e3), vf_lr=1e-4, train_pi_iters=80, train_v_iters=80, lam=0.999, max_ep_len=500, target_kl=0.01, logger_kwargs=dict(), save_freq=10, test_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) policy_path (str): path of pretrained policy model train from scratch if None logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) test_logger_kwargs = dict() test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test") test_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] test_logger = EpochLogger(**test_logger_kwargs) test_logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space act_high_limit = env.action_space.high act_low_limit = env.action_space.low sess = tf.Session() if policy_path is None: # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) tfa_ph = core.placeholder(act_dim) # Main outputs from computation graph mu, pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) sess.run(tf.global_variables_initializer()) else: # load pretrained model # sess, x_ph, a_ph, mu, pi, logp, logp_pi, v = load_policy(policy_path, itr='last', deterministic=False, act_high=env.action_space.high) # # get_action_2 = lambda x : sess.run(mu, feed_dict={x_ph: x[None,:]})[0] # adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save')) x_ph, a_ph, adv_ph, ret_ph, logp_old_ph = model['x_ph'], model[ 'a_ph'], model['adv_ph'], model['ret_ph'], model['logp_old_ph'] mu, pi, logp, logp_pi, v = model['mu'], model['pi'], model[ 'logp'], model['logp_pi'], model['v'] # tfa_ph = core.placeholder(act_dim) tfa_ph = model['tfa_ph'] # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) print("---------------", local_steps_per_epoch) buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam) # print(obs_dim) # print(act_dim) dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim[0], act_dim=act_dim[0], size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives if policy_path is None: ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) dagger_pi_loss = tf.reduce_mean(tf.square(mu - tfa_ph)) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) optimizer_pi = tf.train.AdamOptimizer(learning_rate=pi_lr) optimizer_v = tf.train.AdamOptimizer(learning_rate=vf_lr) train_dagger_pi_op = dagger_pi_optimizer.minimize( dagger_pi_loss, name='train_dagger_pi_op') train_pi = optimizer_pi.minimize(pi_loss, name='train_pi_op') train_v = optimizer_v.minimize(v_loss, name='train_v_op') 
sess.run(tf.variables_initializer(optimizer_pi.variables())) sess.run(tf.variables_initializer(optimizer_v.variables())) sess.run(tf.variables_initializer(dagger_pi_optimizer.variables())) else: graph = tf.get_default_graph() dagger_pi_loss = model['dagger_pi_loss'] pi_loss = model['pi_loss'] v_loss = model['v_loss'] approx_ent = model['approx_ent'] approx_kl = model['approx_kl'] clipfrac = model['clipfrac'] train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op') train_pi = graph.get_operation_by_name('train_pi_op') train_v = graph.get_operation_by_name('train_v_op') # sess = tf.Session() # sess.run(tf.global_variables_initializer()) # Sync params across processes # sess.run(sync_all_params()) tf.summary.FileWriter("log/", sess.graph) # Setup model saving logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'adv_ph': adv_ph, 'ret_ph': ret_ph, 'logp_old_ph': logp_old_ph}, \ outputs={'mu': mu, 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi, 'clipfrac': clipfrac, 'approx_kl': approx_kl, \ 'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'approx_ent': approx_ent}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) def choose_action(s, add_noise=False): s = s[np.newaxis, :] a = sess.run(mu, {x_ph: s})[0] if add_noise: noise = dagger_noise * act_high_limit * np.random.normal( size=a.shape) a = a + noise return np.clip(a, act_low_limit, act_high_limit) def test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, info = env.step(choose_action(np.array(o), 0)) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store( arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() # time.sleep(10) if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) def ref_test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a = call_ref_controller(env, expert) o, r, d, info = env.step(a) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store( arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True 
test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) ref_test_agent(test_num=-1) test_logger.log_tabular('epoch', -1) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 test_policy_epochs = 91 episode_steps = 500 total_env_t = 0 test_num = 0 print(colorize("begin dagger training", 'green', bold=True)) for epoch in range(1, dagger_epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) obs, acs, rewards = [], [], [] for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) # a = get_action_2(np.array(o)) # save and log obs.append(o) ref_action = call_ref_controller(env, expert) if (epoch < pretrain_epochs): action = ref_action else: action = choose_action(np.array(o), True) buf.store(o, action, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(action) acs.append(ref_action) rewards.append(r) ep_ret += r ep_len += 1 total_env_t += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Perform dagger and partical PPO update! 
inputs = {k: v for k, v in zip(all_phs, buf.get())} # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update max_step = len(np.array(rewards)) dagger_replay_buffer.stores(obs, acs, rewards) for _ in range(int(local_steps_per_epoch / 10)): batch = dagger_replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']} q_step_ops = [dagger_pi_loss, train_dagger_pi_op] for j in range(10): outs = sess.run(q_step_ops, feed_dict) logger.store(LossPi=outs[0]) c_v_loss = sess.run(v_loss, feed_dict=inputs) logger.store(LossV=c_v_loss, KL=0, Entropy=0, ClipFrac=0, DeltaLossPi=0, DeltaLossV=0, StopIter=0) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Main loop: collect experience in env and update/log each epoch print(colorize("begin ppo training", 'green', bold=True)) for epoch in range(1, epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs) or epoch == 1: # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) # a = a[0] # a = get_action_2(np.array(o)) # a = np.clip(a, act_low_limit, act_high_limit) # if epoch < pretrain_epochs: # a = env.action_space.sample() # a = np.clip(a, act_low_limit, act_high_limit) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Perform PPO update! 
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
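# The pi_loss in the PPO objective above uses the tf.where(...) form of the clipped surrogate.
# A small, self-contained numpy check (illustrative only) that it matches the more common
# clip(ratio, 1-eps, 1+eps) * adv formulation:
import numpy as np

def ppo_clip_objective_where(ratio, adv, clip_ratio):
    # min_adv picks the clipped bound depending on the sign of the advantage.
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return np.minimum(ratio * adv, min_adv)

def ppo_clip_objective_clamp(ratio, adv, clip_ratio):
    return np.minimum(ratio * adv, np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)

_ratio = np.array([0.5, 0.9, 1.0, 1.1, 1.5])
_adv = np.array([1.0, -1.0, 2.0, -2.0, 0.5])
assert np.allclose(ppo_clip_objective_where(_ratio, _adv, 0.2),
                   ppo_clip_objective_clamp(_ratio, _adv, 0.2))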
def sac_pytorch( env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, dont_save=True, regularization_weight=0, grad_clip=-1, logger_kwargs=dict(), ): """ Largely following OpenAI documentation But slightly different from tensorflow implementation Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. """ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("running on device:", device) """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! 
# we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) def test_agent(n=5): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs """init all networks""" # see line 1 policy_net = TanhGaussianPolicy(obs_dim, act_dim, hidden_sizes, action_limit=act_limit).to(device) value_net = Mlp(obs_dim, 1, hidden_sizes).to(device) target_value_net = Mlp(obs_dim, 1, hidden_sizes).to(device) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes).to(device) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes).to(device) # see line 2: copy parameters from value_net to target_value_net target_value_net.load_state_dict(value_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) value_optimizer = optim.Adam(value_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = policy_net.get_env_action(o, deterministic=False) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. Quoted from the original SAC paper: 'In practice, we take a single environment step followed by one or several gradient step' after a single environment step, the number of gradient steps is 1 for SAC. 
(see paper for reference) """ for j in range(ep_len): # get data from replay buffer batch = replay_buffer.sample_batch(batch_size) obs_tensor = Tensor(batch['obs1']).to(device) obs_next_tensor = Tensor(batch['obs2']).to(device) acts_tensor = Tensor(batch['acts']).to(device) # unsqueeze is to make sure rewards and done tensors are of the shape nx1, instead of n # to prevent problems later rews_tensor = Tensor(batch['rews']).unsqueeze(1).to(device) done_tensor = Tensor(batch['done']).unsqueeze(1).to(device) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document psudocode part for reference line nubmers indicate lines in psudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) """get q loss""" # see line 12: first equation v_from_target_v_net = target_value_net(obs_next_tensor) y_q = rews_tensor + gamma * (1 - done_tensor) * v_from_target_v_net # see line 13: compute loss for the 2 q networks, note that we want to detach the y_q value # since we only want to update q networks here, and don't want other gradients q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = mse_criterion(q1_prediction, y_q.detach()) q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = mse_criterion(q2_prediction, y_q.detach()) """get v loss""" # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) min_q1_q2_a_tilda = torch.min( torch.cat([q1_a_tilda, q2_a_tilda], 1), 1)[0].reshape(-1, 1) y_v = min_q1_q2_a_tilda - alpha * log_prob_a_tilda # see line 14: compute loss for value network v_prediction = value_net(obs_tensor) v_loss = mse_criterion(v_prediction, y_v.detach()) """policy loss""" # line 15: note that here we are doing gradient ascent, so we add a minus sign in the front policy_loss = -(q1_a_tilda - alpha * log_prob_a_tilda).mean() """ add policy regularization loss, this is not in openai's minimal version, but they are in the original sac code, see https://github.com/vitchyr/rlkit for reference this part is not necessary but might improve performance """ if regularization_weight > 0: policy_mean_reg_weight = regularization_weight policy_std_reg_weight = regularization_weight mean_reg_loss = policy_mean_reg_weight * (mean_a_tilda** 2).mean() std_reg_loss = policy_std_reg_weight * (log_std_a_tilda** 2).mean() policy_loss = policy_loss + mean_reg_loss + std_reg_loss """update networks""" q1_optimizer.zero_grad() q1_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip) q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip) q2_optimizer.step() value_optimizer.zero_grad() v_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(value_net.parameters(), grad_clip) value_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip) policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(target_value_net, value_net, polyak) # store diagnostic info to logger logger.store(LossPi=policy_loss.cpu().item(), LossQ1=q1_loss.cpu().item(), LossQ2=q2_loss.cpu().item(), 
LossV=v_loss.cpu().item(), Q1Vals=q1_prediction.detach().cpu().numpy(), Q2Vals=q2_prediction.detach().cpu().numpy(), VVals=v_prediction.detach().cpu().numpy(), LogPi=log_prob_a_tilda.detach().cpu().numpy()) ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if not dont_save: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'value_net': value_net.state_dict(), 'target_value_net': target_value_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'policy_opt': policy_optimizer, 'value_opt': value_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
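# `soft_update_model1_with_model2` used above is imported from elsewhere. A plausible sketch of
# such a polyak soft update (an assumption about its behaviour, not the original helper):
import torch

def soft_update_sketch(target_net, source_net, rho):
    """target <- rho * target + (1 - rho) * source, applied parameter by parameter."""
    with torch.no_grad():
        for p_targ, p in zip(target_net.parameters(), source_net.parameters()):
            p_targ.data.mul_(rho)
            p_targ.data.add_((1 - rho) * p.data)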
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. 
with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len, success, goalDist, reachDist = test_env.reset( ), False, 0, 0, False, None, None while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, info = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 if 'success' in info: success = info['success'] or success if 'goalDist' in info and info['goalDist'] is not None: goalDist = info['goalDist'] if 'reachDist' in info and info['reachDist'] is not None: reachDist = info['reachDist'] if goalDist != None: logger.store(TestGoalDist=goalDist) if reachDist != None: logger.store(TestReachDist=reachDist) logger.store(TestEpRet=ep_ret, TestEpLen=ep_len, TestSuccess=success) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) if 'TestGoalDist' in logger.epoch_dict: logger.log_tabular('TestGoalDist', with_min_and_max=True) if 'TestReachDist' in logger.epoch_dict: logger.log_tabular('TestReachDist', with_min_and_max=True) if 'TestSuccess' in logger.epoch_dict: logger.log_tabular('TestSuccess', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
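# The docstring of sac() above requires the ``pi`` module to return an action and its
# log-probability with gradients flowing back into the action. A minimal sketch of such a
# squashed-Gaussian policy (an illustration of the idea only; the actual core.MLPActorCritic
# is defined elsewhere):
import math
import torch
import torch.nn as nn
from torch.distributions.normal import Normal

class SquashedGaussianActorSketch(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden, act_limit):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        self.mu_layer = nn.Linear(hidden, act_dim)
        self.log_std_layer = nn.Linear(hidden, act_dim)
        self.act_limit = act_limit

    def forward(self, obs, deterministic=False):
        h = self.net(obs)
        mu = self.mu_layer(h)
        log_std = torch.clamp(self.log_std_layer(h), -20, 2)
        dist = Normal(mu, torch.exp(log_std))
        # rsample() uses the reparameterization trick, so gradients flow back into the action.
        u = mu if deterministic else dist.rsample()
        # Log-prob of the squashed action, with the tanh change-of-variables correction
        # written in a numerically stable form.
        logp_pi = dist.log_prob(u).sum(dim=-1)
        logp_pi -= (2 * (math.log(2) - u - nn.functional.softplus(-2 * u))).sum(dim=-1)
        a = self.act_limit * torch.tanh(u)
        return a, logp_pi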
def pretrain(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, pi_epochs=100, vf_epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, demo_file=""): setup_pytorch_for_mpi() logger = EpochLogger(**logger_kwargs) # locals() return all local variable logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # demo environment demo_env = DemoGymEnv(demo_file=demo_file, seed=seed) demo_env.check_env(env) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) sync_params(ac) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) vf_pi_optimizer = Adam(ac.v_pi.parameters(), lr=vf_lr) logger.setup_pytorch_saver(ac) def compute_loss_v(data): obs, ret = Variable(data['obs']), Variable(data['ret']) return ((ac.v(obs) - ret)**2).mean() def compute_loss_v_pi(data): obs, ret = Variable(data['obs']), Variable(data['ret']) return ((ac.v_pi(obs) - ret)**2).mean() def demo_update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() pi_info, loss_pi, loss_v = {}, 0, 0 for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: # logger.log('Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) for i in range(train_v_iters): vf_pi_optimizer.zero_grad() loss_v = compute_loss_v_pi(data) loss_v.backward() mpi_avg_grads(ac.v_pi) vf_pi_optimizer.step() print("Pi loss: {}".format(pi_l_old)) kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) def compute_loss_pi(data): obs, act, adv, logp_old = Variable(data['obs']), Variable(data['act']), Variable(data['adv']), Variable(data['logp']) # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32, device=device).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # pretraining epochs # demonstration training: main loop, for policy network o, ep_ret, ep_len = demo_env.reset(), 0, 0 start_time = time.time() for epoch in range(pi_epochs): pi_old_data = [deepcopy(p.data) for p in ac.pi.parameters()] vf_old_data = [deepcopy(p.data) for p in ac.v.parameters()] vf_pi_old_data = [deepcopy(p.data) for p in ac.v_pi.parameters()] for t in range(local_steps_per_epoch): a, v, logp_a, m, std = ac.pretrain_step(torch.as_tensor(o, dtype=torch.float32, device=device)) next_o, r, d, _ = demo_env.step(a, std) ep_ret += r ep_len += 1 buf.store(o, a, r, v, logp_a, std=std) logger.store(VVals=v) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _, _, _ = ac.pretrain_step(torch.as_tensor(o, dtype=torch.float32, device=device)) else: v = 0 if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) buf.finish_path(v) o, ep_ret, ep_len = demo_env.reset(), 0, 0 # Save model # if (epoch % save_freq == 0) or (epoch == pi_epochs-1): if (epoch in SAVE_FREQ) or (epoch == pi_epochs - 1): save_pi(logger_kwargs.get('output_dir', "model"), itr=epoch, paramenters=ac.pi) logger.save_state({'env': env}, None) demo_update() delta_v, delta_v_pi, delta_pi = 0, 0, 0 for i, param in enumerate(ac.v_pi.parameters()): delta_v_pi += torch.norm(param.data - vf_pi_old_data[i]) for i, param in enumerate(ac.v.parameters()): delta_v += torch.norm(param.data - vf_old_data[i]) for i, param in enumerate(ac.pi.parameters()): delta_pi += torch.norm(param.data - pi_old_data[i]) print("delta v_pi: {}; delta vf: {}; delta pi: {}".format(delta_v_pi, delta_v, delta_pi)) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() logger.save_state({'env': env}, pi_epochs) def update_vf(): data = buf.get() v_l_old = compute_loss_v(data).item() print("Loss for Value function: {}".format(v_l_old)) for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) vf_optimizer.step() # for the value function pre-training o, ep_ret, ep_len = demo_env.reset(), 0, 0 start_time = time.time() for epoch in range(vf_epochs): pi_old_data = [deepcopy(p.data) for p in ac.pi.parameters()] vf_old_data = [deepcopy(p.data) for p in ac.v.parameters()] vf_pi_old_data = [deepcopy(p.data) for p in ac.v_pi.parameters()] for t in range(local_steps_per_epoch): next_o, r, d, _, a = demo_env.free_step() v = ac.v(torch.as_tensor(o, dtype=torch.float32, device=device)).cpu().detach().numpy() ep_ret += r ep_len += 1 buf.store(o, a, r, v, 1) # logger.store(VVals=v) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: v = ac.v(torch.as_tensor(o, dtype=torch.float32, device=device)).cpu().detach().numpy() else: v = 0 buf.finish_path(v) o, ep_ret, ep_len = demo_env.reset(), 0, 0 print("Pretraining for value function at Epoch: {}".format(epoch)) update_vf() delta_v, delta_v_pi, delta_pi = 0, 0, 0 for i, param in enumerate(ac.v_pi.parameters()): delta_v_pi += torch.norm(param.data - vf_pi_old_data[i]) for i, param in enumerate(ac.v.parameters()): delta_v += torch.norm(param.data - vf_old_data[i]) for i, param in enumerate(ac.pi.parameters()): delta_pi += torch.norm(param.data - pi_old_data[i]) print("delta v_pi: {}; delta vf: {}; delta pi: {}".format(delta_v_pi, delta_v, delta_pi)) if (epoch in SAVE_FREQ) or (epoch == vf_epochs - 1): save_vf(logger_kwargs.get('output_dir', "model"), itr=epoch, paramenters=ac.v) logger.save_state({'env': env}, None)
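# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not part of the algorithm above):
# the kind of GAE-lambda advantage / return-to-go computation that a buffer's
# finish_path(last_val) typically performs when a trajectory terminates or is
# cut off and bootstrapped with last_val. The function name and arguments are
# hypothetical; only NumPy is assumed, and ACDFBuffer may differ in detail.
# ---------------------------------------------------------------------------
def _finish_path_sketch(rews, vals, last_val, gamma=0.99, lam=0.97):
    """Return (advantages, returns) for one finished trajectory segment."""
    import numpy as np
    rews = np.asarray(rews, dtype=np.float64)
    vals = np.append(np.asarray(vals, dtype=np.float64), last_val)
    deltas = rews + gamma * vals[1:] - vals[:-1]      # TD residuals
    adv = np.zeros_like(rews)
    ret = np.zeros_like(rews)
    running_adv, running_ret = 0.0, float(last_val)
    for t in reversed(range(len(rews))):
        running_adv = deltas[t] + gamma * lam * running_adv   # GAE-lambda
        running_ret = rews[t] + gamma * running_ret           # bootstrapped return
        adv[t], ret[t] = running_adv, running_ret
    return adv, ret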
def ddpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A reference to ActorCritic class which after instantiation takes state, ``x``, and action, ``a``, and returns: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x`` and actions in | ``a``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) # https://pytorch.org/docs/master/notes/randomness.html#cudnn torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) actor_critic_main = actor_critic(obs_dim, **ac_kwargs).to(device) # Note that the action placeholder going to targer actor_critic here # is irrelevant, because we only need q_targ(s, pi_targ(s)). 
actor_critic_target = actor_critic(obs_dim, **ac_kwargs).to(device) # Count variables var_counts = tuple( core.count_vars(model) for model in [actor_critic_main.policy, actor_critic_main.q, actor_critic_main]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Optimizers pi_optimizer = optim.Adam(actor_critic_main.policy.parameters(), lr=pi_lr) q_optimizer = optim.Adam(actor_critic_main.q.parameters(), lr=q_lr) def get_action(o, noise_scale): a = actor_critic_main(Tensor(o.reshape(1, -1)).to(device)) a = a.cpu().detach().numpy() a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) x, x2, a, r, d = [ Tensor(batch[k]).to(device) for k in ['obs1', 'obs2', 'acts', 'rews', 'done'] ] _, q, q_pi = actor_critic_main(x, a) _, _, q_pi_targ = actor_critic_target(x2, a) # Bellman backup for Q function backup = (r + gamma * (1 - d) * q_pi_targ).detach() # DDPG losses pi_loss = -q_pi.mean() q_loss = ((q - backup)**2).mean() # Q-learning update q_optimizer.zero_grad() q_loss.backward() q_optimizer.step() logger.store(LossQ=q_loss, QVals=q.cpu().detach().numpy()) # Policy update pi_optimizer.zero_grad() pi_loss.backward() pi_optimizer.step() logger.store(LossPi=pi_loss) # Polyak averaging for target variables # Credits: https://github.com/ghliu/pytorch-ddpg/blob/master/util.py params = zip(actor_critic_target.parameters(), actor_critic_main.parameters()) for ac_target, ac_main in params: ac_target.data.copy_(ac_main.data * (1.0 - polyak) + ac_target.data * polyak) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, actor_critic_main, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
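# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition): the polyak / soft target update
# theta_targ <- rho * theta_targ + (1 - rho) * theta that the DDPG loop above
# applies after each gradient step, written as a standalone helper. The
# function name is hypothetical; only PyTorch is assumed.
# ---------------------------------------------------------------------------
def _polyak_update_sketch(target_net, main_net, polyak=0.995):
    """In-place soft update of target_net towards main_net."""
    import torch
    with torch.no_grad():
        for p_targ, p_main in zip(target_net.parameters(), main_net.parameters()):
            p_targ.mul_(polyak)
            p_targ.add_((1.0 - polyak) * p_main)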
def td3(env_fn, expert=None, policy_path=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=500, epochs=1000, replay_size=int(5e3), gamma=0.99, polyak=0.995, pi_lr=1e-4, q_lr=1e-4, batch_size=64, start_epochs=500, dagger_epochs=500, pretrain_epochs=50, dagger_noise=0.02, act_noise=0.02, target_noise=0.02, noise_clip=0.5, policy_delay=2, max_ep_len=500, logger_kwargs=dict(), save_freq=50, UPDATE_STEP=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) test_logger_kwargs = dict() test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test") test_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] test_logger = EpochLogger(**test_logger_kwargs) # test_logger_kwargs = dict() # test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test") # test_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] # test_logger = EpochLogger(**test_logger_kwargs) # pretrain_logger_kwargs = dict() # pretrain_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "pretrain") # pretrain_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] # pretrain_logger = EpochLogger(**pretrain_logger_kwargs) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, do not assumes all dimensions share the same bound! act_limit = env.action_space.high / 2 act_high_limit = env.action_space.high act_low_limit = env.action_space.low act_noise_limit = act_noise * act_limit sess = tf.Session() if policy_path is None: # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders( obs_dim, act_dim, obs_dim, None, None) tfa_ph = core.placeholder(act_dim) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, act_low_limit, act_high_limit) # Target Q-values, using action from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) else: # sess = tf.Session() model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save')) x_ph, a_ph, x2_ph, r_ph, d_ph = model['x_ph'], model['a_ph'], model[ 'x2_ph'], model['r_ph'], model['d_ph'] pi, q1, q2, q1_pi = model['pi'], model['q1'], model['q2'], model[ 'q1_pi'] pi_targ, q1_targ, q2_targ = model['pi_targ'], model['q1_targ'], model[ 'q2_targ'] tfa_ph = core.placeholder(act_dim) dagger_epochs = 0 # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) if policy_path is None: # Bellman backup for Q functions, using Clipped Double-Q targets min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) # dagger loss dagger_pi_loss = tf.reduce_mean(tf.square(pi - tfa_ph)) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = tf.add(q1_loss, q2_loss) pi_loss = tf.identity(pi_loss, name="pi_loss") q1_loss = tf.identity(q1_loss, name="q1_loss") q2_loss = tf.identity(q2_loss, 
name="q2_loss") q_loss = tf.identity(q_loss, name="q_loss") # Separate train ops for pi, q dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_dagger_pi_op = dagger_pi_optimizer.minimize( dagger_pi_loss, var_list=get_vars('main/pi'), name='train_dagger_pi_op') train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'), name='train_pi_op') train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'), name='train_q_op') # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess.run(tf.global_variables_initializer()) else: graph = tf.get_default_graph() # opts = graph.get_operations() # print (opts) pi_loss = model['pi_loss'] q1_loss = model['q1_loss'] q2_loss = model['q2_loss'] q_loss = model['q_loss'] train_q_op = graph.get_operation_by_name('train_q_op') train_pi_op = graph.get_operation_by_name('train_pi_op') # target_update = graph.get_operation_by_name('target_update') # target_init = graph.get_operation_by_name('target_init') # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # sess = tf.Session() # sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph}, \ outputs={'pi': pi, 'q1': q1, 'q2': q2, 'q1_pi': q1_pi, 'pi_targ': pi_targ, 'q1_targ': q1_targ, 'q2_targ': q2_targ, \ 'pi_loss': pi_loss, 'q1_loss': q1_loss, 'q2_loss': q2_loss, 'q_loss': q_loss}) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] # todo: add act_limit scale noise a += noise_scale * np.random.randn(act_dim) return np.clip(a, act_low_limit, act_high_limit) def choose_action(s, add_noise=False): s = s[np.newaxis, :] a = sess.run(pi, {x_ph: s})[0] if add_noise: noise = dagger_noise * act_high_limit * np.random.normal( size=a.shape) a = a + noise return np.clip(a, act_low_limit, act_high_limit) def test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, info = env.step(choose_action(np.array(o), 0)) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store( arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() # time.sleep(10) if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) start_time = time.time() env.unwrapped._set_test_mode(False) o, r, d, ep_ret, ep_len = env.reset(), 
0, False, 0, 0 total_steps = steps_per_epoch * epochs test_num = 0 total_env_t = 0 print(colorize("begin dagger training", 'green', bold=True)) # Main loop for dagger pretrain for epoch in range(1, dagger_epochs + 1, 1): obs, acs, rewards = [], [], [] # number of timesteps for t in range(steps_per_epoch): # action = env.action_space.sample() # action = ppo.choose_action(np.array(observation)) obs.append(o) ref_action = call_ref_controller(env, expert) if (epoch < pretrain_epochs): action = ref_action else: action = choose_action(np.array(o), True) o2, r, d, info = env.step(action) ep_ret += r ep_len += 1 total_env_t += 1 acs.append(ref_action) rewards.append(r) # Store experience to replay buffer replay_buffer.store(o, action, r, o2, d) o = o2 if (t == steps_per_epoch - 1): # print ("reached the end") d = True if d: # collected data to replaybuffer max_step = len(np.array(rewards)) q = [ np.sum( np.power(gamma, np.arange(max_step - t)) * rewards[t:]) for t in range(max_step) ] dagger_replay_buffer.stores(obs, acs, rewards, q) # update policy for _ in range(int(max_step / 5)): batch = dagger_replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']} q_step_ops = [dagger_pi_loss, train_dagger_pi_op] for j in range(UPDATE_STEP): outs = sess.run(q_step_ops, feed_dict) logger.store(LossPi=outs[0]) # train q function for j in range(int(max_step / 5)): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] # for _ in range(UPDATE_STEP): outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed target update outs = sess.run([target_update], feed_dict) # logger.store(LossPi=outs[0]) # logger.store(LossQ=1000000, Q1Vals=1000000, Q2Vals=1000000) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 break # End of epoch wrap-up if epoch > 0 and (epoch % save_freq == 0) or (epoch == dagger_epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) # Log info about epoch test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() sess.run(target_init) print(colorize("begin td3 training", 'green', bold=True)) # Main loop: collect experience in env and update/log each epoch # total_env_t = 0 for epoch in range(1, epochs + 1, 1): # End of epoch wrap-up if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. 
test_num += 1 test_agent(test_num=test_num) # Log info about epoch test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 for t in range(steps_per_epoch): if epoch > start_epochs: a = get_action(np.array(o), act_noise_limit) else: a = env.action_space.sample() # ref_action = call_ref_controller(env, expert) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 total_env_t += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) # d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if (t == steps_per_epoch - 1): # print ("reached the end") d = True if d: """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] # for _ in range(UPDATE_STEP): outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 break
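# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition): the TD3 target construction used
# in the graph above, with its two key ingredients made explicit -- target
# policy smoothing (clipped Gaussian noise on the target action) and the
# clipped double-Q Bellman backup. All names are hypothetical; the target
# policy and Q functions are passed in as plain callables and only NumPy is
# assumed.
# ---------------------------------------------------------------------------
def _td3_backup_sketch(s2, r, done, pi_targ_fn, q1_targ_fn, q2_targ_fn,
                       act_low, act_high, target_noise, noise_clip, gamma=0.99):
    import numpy as np
    a2 = np.asarray(pi_targ_fn(s2), dtype=np.float64)
    eps = np.clip(np.random.normal(scale=target_noise, size=a2.shape),
                  -noise_clip, noise_clip)
    a2 = np.clip(a2 + eps, act_low, act_high)                    # smoothed target action
    min_q = np.minimum(q1_targ_fn(s2, a2), q2_targ_fn(s2, a2))   # clipped double-Q
    return r + gamma * (1.0 - done) * min_q                      # backup target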
def acdf(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, pi_epochs=100, vf_epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, demo_file=""): # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # demo environment demo_env = DemoGymEnv(demo_file=demo_file, seed=seed) demo_env.check_env(env) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.v, ac.v_pi]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d and v_pi: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = Variable(data['obs']), Variable( data['act']), Variable(data['adv']), Variable(data['logp']) # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32, device=device).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = Variable(data['obs']), Variable(data['ret']) return ((ac.v(obs) - ret)**2).mean() def compute_loss_v_pi(data): obs, ret = Variable(data['obs']), Variable(data['ret']) return ((ac.v_pi(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) vf_pi_optimizer = Adam(ac.v_pi.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.'
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) def demo_update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v_pi(data).item() for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: # logger.log('Early stopping at step %d due to reaching max kl.' % i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # logger.store(StopIter=i) for i in range(train_v_iters): vf_pi_optimizer.zero_grad() loss_v = compute_loss_v_pi(data) loss_v.backward() mpi_avg_grads(ac.v_pi) vf_pi_optimizer.step() print("Pi loss: {}".format(pi_l_old)) # kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] # logger.store(LossPi=pi_l_old, LossV=v_l_old, # KL=kl, Entropy=ent, ClipFrac=cf, # DeltaLossPi=(loss_pi.item() - pi_l_old), # DeltaLossV=(loss_v.item() - v_l_old)) def update_vf(): data = buf.get() v_l_old = compute_loss_v(data).item() print("Loss for Value function: {}".format(v_l_old)) for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) vf_optimizer.step() # pretraining epochs # pi_epochs, vf_epochs = 100, 50 # demonstration training: main loop, for policy network o, ep_ret, ep_len = demo_env.reset(), 0, 0 start_time = time.time() for epoch in range(pi_epochs): for t in range(local_steps_per_epoch): a, v, logp_a, m, std = ac.pretrain_step( torch.as_tensor(o, dtype=torch.float32, device=device)) next_o, r, d, _ = demo_env.step(a, std) ep_ret += r ep_len += 1 buf.store(o, a, r, v, logp_a, std=std) # logger.store(VVals=v) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _, _, _ = ac.pretrain_step( torch.as_tensor(o, dtype=torch.float32, device=device)) else: v = 0 # if terminal: # # only save EpRet / EpLen if trajectory finished # # logger.store(EpRet=ep_ret, EpLen=ep_len) buf.finish_path(v) o, ep_ret, ep_len = demo_env.reset(), 0, 0 demo_update() # # Log info about epoch # logger.log_tabular('Epoch', epoch) # logger.log_tabular('EpRet', with_min_and_max=True) # logger.log_tabular('EpLen', average_only=True) # logger.log_tabular('VVals', with_min_and_max=True) # logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) # logger.log_tabular('LossPi', average_only=True) # logger.log_tabular('LossV', average_only=True) # logger.log_tabular('DeltaLossPi', average_only=True) # logger.log_tabular('DeltaLossV', average_only=True) # logger.log_tabular('Entropy', average_only=True) # logger.log_tabular('KL', average_only=True) # logger.log_tabular('ClipFrac', average_only=True) # logger.log_tabular('StopIter', average_only=True) # logger.log_tabular('Time', time.time() - start_time) # logger.dump_tabular() # for the value function pre-training o, ep_ret, ep_len = demo_env.reset(), 0, 0 start_time = time.time() for epoch in range(vf_epochs): for t in range(local_steps_per_epoch): next_o, r, d, _, a = demo_env.free_step() v = ac.v(torch.as_tensor(o, dtype=torch.float32, device=device)).cpu().detach().numpy() ep_ret += r ep_len += 1 buf.store(o, a, r, v, 1) # logger.store(VVals=v) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: v = ac.v( torch.as_tensor(o, dtype=torch.float32, device=device)).cpu().detach().numpy() else: v = 0 buf.finish_path(v) o, ep_ret, ep_len = demo_env.reset(), 0, 0 print("Pretraining for value function at Epoch: {}".format(epoch)) update_vf() # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
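# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition): the KL-limited policy update that
# update() above performs. The approximate KL is estimated from the batch as
# E[logp_old - logp_new], and the gradient loop stops once it exceeds
# 1.5 * target_kl. compute_loss is a stand-in for compute_loss_pi; names are
# hypothetical and only a PyTorch-style optimizer interface is assumed.
# ---------------------------------------------------------------------------
def _kl_limited_update_sketch(optimizer, compute_loss, train_pi_iters=80,
                              target_kl=0.01):
    """Run up to train_pi_iters gradient steps, stopping early on large KL."""
    stop_iter = 0
    for i in range(train_pi_iters):
        optimizer.zero_grad()
        loss, info = compute_loss()          # info['kl'] ~ E[logp_old - logp_new]
        if info['kl'] > 1.5 * target_kl:     # policy drifted too far from pi_old
            break
        loss.backward()
        optimizer.step()
        stop_iter = i
    return stop_iter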
def sac(env, logger_kwargs=dict(), network_params=dict(), rl_params=dict(), resume_training=False, resume_params=dict()): logger = EpochLogger(**logger_kwargs) if not resume_training: save_vars = locals().copy() save_vars.pop('env') logger.save_config(save_vars) # ==== control params ==== seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] update_freq = rl_params['update_freq'] n_updates = rl_params['n_updates'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] num_tests = rl_params['num_tests'] save_freq = rl_params['save_freq'] # ==== rl params ==== use_HER = rl_params['use_HER'] use_prev_a = rl_params['use_prev_a'] gamma = rl_params['gamma'] polyak = rl_params['polyak'] act_lr = rl_params['act_lr'] crit_lr = rl_params['crit_lr'] alph_lr = rl_params['alph_lr'] # ==== exploration params ==== alpha = rl_params['alpha'] target_entropy = rl_params['target_entropy'] if not resume_training: sess = tf.compat.v1.Session(config=tf_config) # set seeding (still not perfectly deterministic) tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) env.action_space.np_random.seed(seed) # get required gym spaces obs = env.observation_space act = env.action_space # get the obs size after resize of raw image obs_dim = network_params['input_dims'] act_dim = env.action_space.shape[0] act_low = env.action_space.low[0] act_high = env.action_space.high[0] goal_dim = len(env.goal_list) if not resume_training: # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=obs_dim[2]) test_state_buffer = StateBuffer(m=obs_dim[2]) # Experience buffer replay_buffer = ContReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, goal_dim=goal_dim, size=replay_size) # Inputs to computation graph x_ph, a_ph, prev_a_ph, x2_ph, r_ph, d_ph, g_ph = placeholders( obs_dim, act_dim, act_dim, obs_dim, None, None, goal_dim) # alpha Params if target_entropy == 'auto': target_entropy = tf.cast(-act_dim, tf.float32) else: target_entropy = tf.cast(target_entropy, tf.float32) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1_a, q2_a = create_rl_networks( x_ph, a_ph, use_prev_a, prev_a_ph, g_ph, act_high, network_params) with tf.variable_scope('main', reuse=True): # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi = create_rl_networks(x_ph, pi, use_prev_a, prev_a_ph, g_ph, act_high, network_params) # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = create_rl_networks( x2_ph, a_ph, use_prev_a, prev_a_ph, g_ph, act_high, network_params) # Target networks with tf.variable_scope('target'): _, _, _, q1_pi_targ, q2_pi_targ = create_rl_networks( x2_ph, pi_next, use_prev_a, a_ph, g_ph, act_high, network_params) var_counts = tuple( count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print("""\nNumber of parameters: alpha: %d, pi: %d, q1: %d, q2: %d, total: %d\n""" % var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * (min_q_pi_targ - alpha * logp_pi_next)) # critic losses 
q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) value_loss = q1_loss + q2_loss # Soft actor losses pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi) # alpha loss for temperature parameter alpha_backup = tf.stop_gradient(logp_pi + target_entropy) alpha_loss = -tf.reduce_mean((log_alpha * alpha_backup)) # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=act_lr, epsilon=1e-04) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'), name='train_pi_op') # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=crit_lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize( value_loss, var_list=get_vars('main/q'), name='train_value_op') # Alpha train op alpha_optimizer = tf.train.AdamOptimizer(learning_rate=alph_lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize( alpha_loss, var_list=get_vars('log_alpha'), name='train_alpha_op') # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ], name='target_update') # Initializing targets to match main variables target_init = tf.group([ tf.compat.v1.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess.run(tf.compat.v1.global_variables_initializer()) sess.run(target_init) else: # if resuming define all the ph and outputs from saved model # inputs x_ph = resume_params['model']['x_ph'] a_ph = resume_params['model']['a_ph'] prev_a_ph = resume_params['model']['prev_a_ph'] x2_ph = resume_params['model']['x2_ph'] r_ph = resume_params['model']['r_ph'] d_ph = resume_params['model']['d_ph'] g_ph = resume_params['model']['g_ph'] # outputs mu = resume_params['model']['mu'] pi = resume_params['model']['pi'] pi_loss = resume_params['model']['pi_loss'] q1_loss = resume_params['model']['q1_loss'] q2_loss = resume_params['model']['q2_loss'] q1_a = resume_params['model']['q1_a'] q2_a = resume_params['model']['q2_a'] logp_pi = resume_params['model']['logp_pi'] target_entropy = resume_params['model']['target_entropy'] alpha_loss = resume_params['model']['alpha_loss'] alpha = resume_params['model']['alpha'] # buffers replay_buffer = resume_params['resume_state']['replay_buffer'] train_state_buffer = resume_params['resume_state'][ 'train_state_buffer'] test_state_buffer = resume_params['resume_state']['test_state_buffer'] # get needed operations from graph by name (trouble saving these) train_pi_op = tf.get_default_graph().get_operation_by_name( "train_pi_op") train_value_op = tf.get_default_graph().get_operation_by_name( "train_value_op") train_alpha_op = tf.get_default_graph().get_operation_by_name( "train_alpha_op") target_update = tf.get_default_graph().get_operation_by_name( "target_update") sess = resume_params['sess'] # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update ] # Setup model saving if save_freq is not None: logger.setup_tf_saver(sess, inputs={ 'x_ph': x_ph, 'a_ph': a_ph, 'prev_a_ph': prev_a_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph, 'g_ph': g_ph }, outputs={ 'mu': mu, 'pi': pi, 'pi_loss': pi_loss, 'q1_loss': q1_loss, 'q2_loss': q2_loss, 'q1_a': 
q1_a, 'q2_a': q2_a, 'logp_pi': logp_pi, 'target_entropy': target_entropy, 'alpha_loss': alpha_loss, 'alpha': alpha }) def get_action(state, one_hot_goal, prev_a, deterministic=False): state = state.astype('float32') / 255. act_op = mu if deterministic else pi a = sess.run(act_op, feed_dict={ x_ph: [state], g_ph: [one_hot_goal], prev_a_ph: [prev_a] })[0] return a def reset(state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o = process_image_observation(o, obs_dim) r = process_reward(r) state = state_buffer.init_state(init_obs=o) prev_a = np.zeros(act_dim) # new random goal when the env is reset goal_id = np.random.randint(goal_dim) one_hot_goal = np.eye(goal_dim)[goal_id] goal = env.goal_list[goal_id] env.goal_button = goal # print('Goal Button: {}'.format(goal)) return o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a def test_agent(n=1): print('Testing...') for j in range(n): test_o, test_r, test_d, test_ep_ret, test_ep_len, test_state, test_one_hot_goal, test_prev_a = reset( test_state_buffer) while not (test_d or (test_ep_len == max_ep_len)): test_a = get_action(test_state, test_one_hot_goal, test_prev_a, True) test_o, test_r, test_d, _ = env.step(test_a) test_o = process_image_observation(test_o, obs_dim) test_r = process_reward(test_r) test_state = test_state_buffer.append_state(test_o) test_ep_ret += test_r test_ep_len += 1 test_prev_a = test_a logger.store(TestEpRet=test_ep_ret, TestEpLen=test_ep_len) # ================== Main training Loop ================== if not resume_training: start_time = time.time() o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset( train_state_buffer) total_steps = steps_per_epoch * epochs resume_t = 0 # array for storing states used with HER if use_HER: HER_buffer = ContHERBuffer(obs_dim=obs_dim, act_dim=act_dim, goal_dim=goal_dim, size=max_ep_len) # resuming training else: start_time = time.time() total_steps = steps_per_epoch * (epochs + resume_params['additional_epochs']) HER_buffer = resume_params['resume_state']['HER_buffer'] resume_t = resume_params['resume_state']['resume_t'] o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = resume_params[ 'resume_state']['rl_state'] # reset the environment to the state set before saving env.set_env_state(resume_params['resume_state']['env_state']) # Main loop: collect experience in env and update/log each epoch for t in range(resume_t, total_steps): if t > start_steps: a = get_action(state, one_hot_goal, prev_a, False) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) o2 = process_image_observation(o2, obs_dim) # thresholding done in env r = process_reward(r) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer # if life is lost then store done as true true replay_buffer.store(state, a, prev_a, r, next_state, d, one_hot_goal) # append to HER buffer if use_HER: HER_buffer.store(state, a, prev_a, r, next_state, d, one_hot_goal) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 state = next_state prev_a = a # store additional states in replay buffer where the goal # is given by the final state, if the final state was incorrect if use_HER: if d and (ep_len != max_ep_len): # get actual goal achieved achieved_goal = np.eye(goal_dim)[env.goal_list.index( env.latest_button)] # if an incorrect goal was reached if (achieved_goal != one_hot_goal).any(): for j in range(ep_len): # pull data from HER buffer sample = HER_buffer.sample(j) # change this to calc_rew function in env if j == ep_len - 1: new_rew = env.max_rew else: new_rew = sample['rews'] # add to replay buffer replay_buffer.store(sample['obs1'], sample['acts'], sample['prev_acts'], new_rew, sample['obs2'], sample['done'], achieved_goal) # do a single update if t > 0 and t % update_freq == 0: for i in range(n_updates): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], prev_a_ph: batch['prev_acts'], r_ph: batch['rews'], d_ph: batch['done'], g_ph: batch['goal'] } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPPi=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) if d or (ep_len == max_ep_len): # store episode values logger.store(EpRet=ep_ret, EpLen=ep_len) # reset the environment o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset( train_state_buffer) if use_HER: # reset HER buffer HER_buffer.reset() # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # save everything neccessary for restarting training from current position env_state = env.get_env_state() # Save model if save_freq is not None: if (epoch % save_freq == 0) or (epoch == epochs - 1): print('Saving...') rl_state = [ o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a ] logger.save_state( state_dict={ 'env_state': env_state, 'replay_buffer': replay_buffer, 'train_state_buffer': train_state_buffer, 'test_state_buffer': test_state_buffer, 'HER_buffer': HER_buffer, 'resume_t': t + 1, 'rl_state': rl_state }) # Test the performance of the deterministic version of the agent. (resets the env) test_agent(n=num_tests) # set params for resuming training env.set_env_state(env_state) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPPi', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'), show_plot=False)
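# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition): the two SAC-specific quantities
# built in the graph above, written as plain NumPy functions. The first is the
# entropy-regularized Bellman target; the second is the value of the
# temperature loss used for automatic alpha tuning (in the graph, the
# bracketed term is held constant via stop_gradient). Names are hypothetical;
# only NumPy is assumed.
# ---------------------------------------------------------------------------
def _sac_q_backup_sketch(r, done, min_q_pi_targ, logp_pi_next, alpha, gamma=0.99):
    import numpy as np
    return np.asarray(r) + gamma * (1.0 - np.asarray(done)) * (
        np.asarray(min_q_pi_targ) - alpha * np.asarray(logp_pi_next))

def _sac_alpha_loss_sketch(log_alpha, logp_pi, target_entropy):
    # Minimizing this w.r.t. log_alpha raises alpha when the policy entropy
    # (on average, -logp_pi) drops below target_entropy, and lowers it otherwise.
    import numpy as np
    return float(np.mean(-log_alpha * (np.asarray(logp_pi) + target_entropy)))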
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def sqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" # print(max_ep_len,type(max_ep_len)) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] obs_space = env.observation_space act_dim = env.action_space.n act_space = env.action_space # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None) ###### if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.n)) # target_entropy = (np.prod(env.action_space.n))/4/10 target_entropy = 0.15 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) ###### # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, _, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, alpha, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(x2_ph, a_ph, alpha, **ac_kwargs) # Experience buffer if isinstance(act_space, Box): a_dim = act_dim elif isinstance(act_space, Discrete): a_dim = 1 replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if isinstance(alpha,tf.Tensor): alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi_ + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi_) ############################## alpha=0 q_backup = r_ph + gamma*(1-d_ph)*v_backup # Soft actor-critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss # # Policy train op # # (has to be separate from value train op, because q1_pi appears in pi_loss) # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') #with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha), train_value_op, target_update] else: step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, alpha, train_value_op, target_update, train_alpha_op] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() 
sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0] def test_agent(n=20): # n: number of tests global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # max_ep_len # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # o = env.reset() ##################### # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0 ##################### o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ # if t > start_steps and 100*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum if t > start_steps: a = get_action(o) else: a = env.action_space.sample() np.random.random() # Step the env o2, r, d, _ = env.step(a) #print(a,o2) # o2, r, _, d = env.step(a) ##################### # d = d['ale.lives'] < 5 ##################### ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): # make sure: max_ep_len < steps_per_epoch """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossQ1=outs[0], LossQ2=outs[1], Q1Vals=outs[2], Q2Vals=outs[3], LogPi=outs[4], Alpha=outs[5]) #if d: logger.store(EpRet=ep_ret, EpLen=ep_len) # o = env.reset() ##################### # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0 ##################### o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha',average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) # logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
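# A hypothetical invocation of sqn() above, assuming a discrete-action Gym environment
# and that core.mlp_actor_critic follows the interface documented in the docstring.
# The environment name and hyperparameters here are illustrative, not from the original.
if __name__ == '__main__':
    import gym
    sqn(lambda: gym.make('CartPole-v1'),
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        gamma=0.99, lr=1e-3, alpha='auto',
        steps_per_epoch=5000, epochs=50,
        logger_kwargs=dict(exp_name='sqn_cartpole'))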
def a2c(env_fn, model: IActorCritic, seed=0, num_cpu=1, device=torch.device("cpu"), epochs=1000, steps_per_epoch=100, episode_len_limit=None, gamma=0.99, use_gae=True, tau=0.95, max_grad_norm=0.5, polyak=0.995, learning_rate=1e-3, value_loss_coef=0.5, policy_loss_coef=1, entropy_loss_coef=0.1, save_every=100, log_every=10, logger_kwargs=dict(), test_every=100, num_test_episodes=5, test_episode_len_limit=None, deterministic=False, save_freq=1, solved_score=None, ): use_MPI = num_cpu > 1 if use_MPI: # Special function to avoid certain slowdowns from PyTorch + MPI combo. mpi_pytorch.setup_pytorch_for_mpi() else: torch.set_num_threads(torch.get_num_threads()) # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) config = locals() del config['env_fn'] del config['model'] del config['logger'] logger.save_config(config) test_logger_kwargs = deepcopy(logger_kwargs) test_logger_kwargs['output_dir'] = pathlib.Path(test_logger_kwargs['output_dir']) / 'evaluation' test_logger = EpochLogger(**test_logger_kwargs) # Random seed if use_MPI: seed += 10000 * mpi_tools.proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() test_env = env_fn() obs_shape = env.observation_space.shape act_dim = env.action_space.n # episode length limit if episode_len_limit is None: if env.unwrapped.spec and env.unwrapped.spec.max_episode_steps: episode_len_limit = env.spec.max_episode_steps else: raise ValueError("Episode length limit must be specified") if test_episode_len_limit is None: test_episode_len_limit = episode_len_limit # training model and target model actor_critic = model target_actor_critic = deepcopy(actor_critic) if use_MPI: # Sync params across processes mpi_pytorch.sync_params(actor_critic) mpi_pytorch.sync_params(target_actor_critic) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in target_actor_critic.parameters(): p.requires_grad = False # Utilize GPU actor_critic.to(device) target_actor_critic.to(device) # Set up optimizers for policy and q-function optimizer = Adam(actor_critic.parameters(), lr=learning_rate) # Set up model saving logger.setup_pytorch_saver(actor_critic, name='model') def update(episode_buffer): # Update if episode_buffer.dones[-1]: next_value = 0.0 else: last_obs = episode_buffer.next_observations[-1] last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0) context = actor_critic.get_context() next_value = target_actor_critic.predict_value(last_obs_tensor, context=context).cpu().item() # Super critical!! optimizer.zero_grad() # Compute value and policy losses loss, info = actor_critic.compute_loss(rewards=np.array(episode_buffer.rewards), dones=np.array(episode_buffer.dones), next_value=next_value, discount_factor=gamma, use_gae=use_gae, tau=tau, value_loss_coef=value_loss_coef, policy_loss_coef=policy_loss_coef, entropy_reg_coef=entropy_loss_coef) loss.backward() if use_MPI: mpi_pytorch.mpi_avg_grads(actor_critic) # Optimize if max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm) optimizer.step() # Log losses and info logger.store(**info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(actor_critic.parameters(), target_actor_critic.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) if use_MPI: mpi_pytorch.sync_params(target_actor_critic) # Prepare for interaction with environment start_time = time.time() # Main loop: collect experience in env and update/log each epoch total_steps = 0 # Reset env obs = env.reset() # Reset episode stats episode_return = 0 episode_length = 0 for _ in range(5): logger.store(EpRet=0, EpLen=0) for epoch in range(1, epochs + 1): actor_critic.reset_for_training() epoch_history = EpisodeHistory() for t in range(steps_per_epoch): total_steps += 1 # Get action from the model obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) action = actor_critic.step(obs_tensor) # Step the env obs2, reward, done, _ = env.step(action.detach().cpu().item()) episode_return += reward episode_length += 1 # Store transition to history epoch_history.store(observation=obs, action=action, reward=reward, done=done, next_observation=obs2) # Super critical, easy to overlook step: make sure to update # most recent observation! obs = obs2 # End of trajectory handling if done or episode_length > episode_len_limit: break update(epoch_history) # if done if epoch_history.dones[-1]: logger.store(EpRet=episode_return, EpLen=episode_length) # Reset env obs = env.reset() actor_critic.reset() # Reset episode stats episode_return = 0 episode_length = 0 # End of epoch handling if epoch % log_every == 0: total_interactions = mpi_tools.mpi_sum(total_steps) if use_MPI else total_steps # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('Value', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossEntropy', average_only=True) logger.log_tabular('TotalEnvInteracts', total_interactions) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Test agent solved = False if epoch % test_every == 0: # Test the performance of the deterministic version of the agent. context = actor_critic.get_context() actor_critic.eval() episode_info = evaluate_agent(env=test_env, agent=actor_critic, deterministic=deterministic, num_episodes=num_test_episodes, episode_len_limit=test_episode_len_limit, render=False, logger=test_logger) actor_critic.train() actor_critic.set_context(context) if solved_score is not None: solved = all(r >= solved_score for (t, r) in episode_info) # Save model if (epoch % save_every == 0) or (epoch == epochs) or solved: logger.save_state({'env': env}) # Check environment is solved if solved: plog = lambda msg: logger.log(msg, color='green') plog("=" * 40) plog(f"ENVIRONMENT SOLVED!") plog("=" * 40) plog(f' TotalEnvInteracts {total_steps}') plog(f' Time {time.time() - start_time}') plog(f' Epoch {epoch}') break
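# a2c() above delegates advantage estimation to the model's compute_loss(), with `tau`
# playing the role of the GAE lambda. For reference, a minimal NumPy sketch of
# GAE(lambda) over one rollout (illustrative helper; not called by the training loop):
def gae_advantages_example(rewards, values, next_value, dones, gamma=0.99, tau=0.95):
    """Return GAE advantages for a rollout; values[t] is the critic's estimate V(s_t)."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # Successor value: the bootstrap value for the last step, otherwise the stored
        # estimate for step t + 1; masked out at terminal transitions.
        v_next = next_value if t == len(rewards) - 1 else values[t + 1]
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * v_next * nonterminal - values[t]
        gae = delta + gamma * tau * nonterminal * gae
        advantages[t] = gae
    return advantages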
def sac(env_fn, expert=None, policy_path=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=500, epochs=100000, replay_size=int(5e3), gamma=0.99, dagger_noise=0.02, polyak=0.995, lr=1e-4, alpha=0.2, batch_size=64, dagger_epochs=200, pretrain_epochs=50, max_ep_len=500, logger_kwargs=dict(), save_freq=50, update_steps=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) test_logger_kwargs = dict() test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test") test_logger_kwargs['exp_name'] = logger_kwargs['exp_name'] test_logger = EpochLogger(**test_logger_kwargs) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] print(obs_dim) print(act_dim) # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space act_high_limit = env.action_space.high act_low_limit = env.action_space.low sess = tf.Session() if policy_path is None: # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) tfa_ph = core.placeholder(act_dim) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # sess.run(tf.global_variables_initializer()) else: # load pretrained model model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save')) x_ph, a_ph, x2_ph, r_ph, d_ph = model['x_ph'], model['a_ph'], model['x2_ph'], model['r_ph'], model['d_ph'] mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = model['mu'], model['pi'], model['logp_pi'], model['q1'], model['q2'], model['q1_pi'], model['q2_pi'], model['v'] # tfa_ph = core.placeholder(act_dim) tfa_ph = model['tfa_ph'] # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # print(obs_dim) # print(act_dim) # SAC objectives if policy_path is None: # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses dagger_pi_loss = tf.reduce_mean(tf.square(mu-tfa_ph)) pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_dagger_pi_op = dagger_pi_optimizer.minimize(dagger_pi_loss, name='train_dagger_pi_op') pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'), name='train_pi_op') # sess.run(tf.variables_initializer(pi_optimizer.variables())) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params, name='train_value_op') # sess.run(tf.variables_initializer(value_optimizer.variables())) # Polyak averaging for target variables # 
(control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess.run(tf.global_variables_initializer()) else: graph = tf.get_default_graph() dagger_pi_loss = model['dagger_pi_loss'] pi_loss = model['pi_loss'] q1_loss = model['q1_loss'] q2_loss = model['q2_loss'] v_loss = model['v_loss'] train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op') train_value_op = graph.get_operation_by_name('train_value_op') train_pi_op = graph.get_operation_by_name('train_pi_op') # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # sess = tf.Session() # sess.run(tf.global_variables_initializer()) dagger_step_ops = [q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_value_op, target_update] tf.summary.FileWriter("log/", sess.graph) # Setup model saving logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph}, \ outputs={'mu': mu, 'pi': pi, 'v': v, 'logp_pi': logp_pi, 'q1': q1, 'q2': q2, 'q1_pi': q1_pi, 'q2_pi': q2_pi, \ 'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'q1_loss': q1_loss, 'q2_loss': q2_loss}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi a = sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] return np.clip(a, act_low_limit, act_high_limit) def choose_action(s, add_noise=False): s = s[np.newaxis, :] a = sess.run(mu, {x_ph: s})[0] if add_noise: noise = dagger_noise * act_high_limit * np.random.normal(size=a.shape) a = a + noise return np.clip(a, act_low_limit, act_high_limit) def test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, info = env.step(choose_action(np.array(o), 0)) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store(arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() # time.sleep(10) if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) def ref_test_agent(n=81, test_num=1): n = env.unwrapped._set_test_mode(True) con_flag = False for j 
in range(n): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a = call_ref_controller(env, expert) o, r, d, info = env.step(a) ep_ret += r ep_len += 1 if d: test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_logger.store(arrive_des=info['arrive_des']) test_logger.store(arrive_des_appro=info['arrive_des_appro']) if not info['out_of_range']: test_logger.store(converge_dis=info['converge_dis']) con_flag = True test_logger.store(out_of_range=info['out_of_range']) # print(info) # test_logger.dump_tabular() if not con_flag: test_logger.store(converge_dis=10000) env.unwrapped._set_test_mode(False) # ref_test_agent(test_num = -1) # test_logger.log_tabular('epoch', -1) # test_logger.log_tabular('TestEpRet', average_only=True) # test_logger.log_tabular('TestEpLen', average_only=True) # test_logger.log_tabular('arrive_des', average_only=True) # test_logger.log_tabular('arrive_des_appro', average_only=True) # test_logger.log_tabular('converge_dis', average_only=True) # test_logger.log_tabular('out_of_range', average_only=True) # test_logger.dump_tabular() start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 episode_steps = 500 total_env_t = 0 test_num = 0 print(colorize("begin dagger training", 'green', bold=True)) for epoch in range(1, dagger_epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) obs, acs, rewards = [], [], [] for t in range(steps_per_epoch): obs.append(o) ref_action = call_ref_controller(env, expert) if(epoch < pretrain_epochs): action = ref_action else: action = choose_action(np.array(o), True) o2, r, d, _ = env.step(action) o = o2 acs.append(ref_action) rewards.append(r) if (t == steps_per_epoch-1): # print ("reached the end") d = True # Store experience to replay buffer replay_buffer.store(o, action, r, o2, d) ep_ret += r ep_len += 1 total_env_t += 1 if d: # Perform partical sac update! 
for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(dagger_step_ops, feed_dict) logger.store(LossQ1=outs[0], LossQ2=outs[1], LossV=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], VVals=outs[5], LogPi=outs[6]) # Perform dagger policy update dagger_replay_buffer.stores(obs, acs, rewards) for _ in range(int(ep_len/5)): batch = dagger_replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']} q_step_ops = [dagger_pi_loss, train_dagger_pi_op] for j in range(10): outs = sess.run(q_step_ops, feed_dict) logger.store(LossPi = outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 break # Main loop: collect experience in env and update/log each epoch print(colorize("begin sac training", 'green', bold=True)) for epoch in range(1, epochs + 1, 1): # test policy if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs): # Save model logger.save_state({}, None) # Test the performance of the deterministic version of the agent. test_num += 1 test_agent(test_num=test_num) test_logger.log_tabular('epoch', epoch) test_logger.log_tabular('TestEpRet', average_only=True) test_logger.log_tabular('TestEpLen', average_only=True) test_logger.log_tabular('arrive_des', average_only=True) # test_logger.log_tabular('arrive_des_appro', average_only=True) test_logger.log_tabular('converge_dis', average_only=True) test_logger.log_tabular('out_of_range', average_only=True) test_logger.dump_tabular() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) # logger.log_tabular('DeltaLossPi', average_only=True) # logger.log_tabular('DeltaLossV', average_only=True) # logger.log_tabular('Entropy', average_only=True) # logger.log_tabular('KL', average_only=True) # logger.log_tabular('ClipFrac', average_only=True) # logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() # train policy o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 env.unwrapped._set_test_mode(False) for t in range(steps_per_epoch): a = get_action(np.array(o)) o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 if (t == steps_per_epoch-1): # print ("reached the end") d = True replay_buffer.store(o, a, r, o2, d) o = o2 if d: """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
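# The DAgger phase of sac() above fits the deterministic policy mean `mu` to the
# reference controller's actions with a plain L2 regression loss (dagger_pi_loss).
# A NumPy restatement of that objective, for reference (illustrative only; the
# TensorFlow graph above is what actually gets optimized):
def dagger_bc_loss_example(policy_mean_actions, expert_actions):
    """Mean squared error between policy mean actions and expert (reference) actions."""
    diff = np.asarray(policy_mean_actions) - np.asarray(expert_actions)
    return float(np.mean(np.square(diff)))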
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, custom_h=None, do_checkpoint_eval=False, env_name=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # create logger for tensorboard tb_logdir = "{}/tb_logs/".format(logger.output_dir) tb_logger = Logger(log_dir=tb_logdir) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space if custom_h is not None: hidden_layers_str_list = custom_h.split('-') hidden_layers_int_list = [int(h) for h in hidden_layers_str_list] ac_kwargs['hidden_sizes'] = hidden_layers_int_list # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the # whole GPU memory config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) # log tf graph tf.summary.FileWriter(tb_logdir, sess.graph) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # for saving the best models and performances during train and evaluate best_eval_AverageEpRet = 0.0 best_eval_StdEpRet = 1.0e20 def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == 
local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save. logger.save_state({'env': env}, epoch) # Evaluate and save best model if do_checkpoint_eval and epoch > 0: # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999. # Doing this way, I can use test_policy and plot directly to test the best models. # saved best models includes: # 1) a copy of the env # 2) the best rl model with parameters # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch # note that 1) and 2) are spinningup defaults, and 3) is a custom save best_eval_AverageEpRet, best_eval_StdEpRet = eval_and_save_best_model( best_eval_AverageEpRet, best_eval_StdEpRet, # a new logger is created and passed in so that the new logger can leverage the directory # structure without messing up the logger in the training loop eval_logger=EpochLogger( **dict(exp_name=logger_kwargs['exp_name'], output_dir=os.path.join(logger.output_dir, "simple_save999999"))), train_logger=logger, tb_logger=tb_logger, epoch=epoch, # the env_name is passed in so that to create an env when and where it is needed. This is to # logx.save_state() error where an env pointer cannot be pickled env_name=env_name, get_action=lambda x: sess.run( pi, feed_dict={x_ph: x[None, :]})[0]) # Perform VPG update! update() # # # Log into tensorboard log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False) tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch) tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
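# The diagnostics logged by vpg() above are simple sample estimates: approx_kl is the
# mean of (logp_old - logp_new) over the batch and approx_ent is the mean of -logp_new.
# A NumPy restatement for reference (illustrative helper, not used by the TF graph):
def approx_kl_and_entropy_example(logp_old, logp_new):
    """Return (approximate KL divergence, approximate entropy) from batched log-probs."""
    logp_old = np.asarray(logp_old, dtype=np.float64)
    logp_new = np.asarray(logp_new, dtype=np.float64)
    return float(np.mean(logp_old - logp_new)), float(np.mean(-logp_new))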
def sac_multistep( env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=1000, epochs=1000, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, save_model=False, auto_alpha=True, grad_clip=-1, logger_store_freq=100, multistep_k=1, debug=False, use_single_variant=False, logger_kwargs=dict(), ): """ Largely following OpenAI documentation, but a bit different Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. save_model (bool): set to True if want to save the trained agent auto_alpha: set to True to use the adaptive alpha scheme, target entropy will be set automatically grad_clip: whether to use gradient clipping. 
< 0 means no clipping logger_store_freq: how many steps to log debugging info, typically don't need to change """ if debug: hidden_sizes = [2, 2] batch_size = 2 start_steps = 1000 multistep_k = 5 use_single_variant = True """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed + 10000) test_env.action_space.np_random.seed(seed + 10000) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! # we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer replay_buffer = MultistepReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) """ Auto tuning alpha """ if auto_alpha: target_entropy = -np.prod(env.action_space.shape).item() # H log_alpha = torch.zeros(1, requires_grad=True) alpha_optim = optim.Adam([log_alpha], lr=lr) else: target_entropy, log_alpha, alpha_optim = None, None, None def test_agent(n=1): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs """init all networks""" # see line 1 policy_net = TanhGaussianPolicySACAdapt(obs_dim, act_dim, hidden_sizes, action_limit=act_limit) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q1_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) # see line 2: copy parameters from value_net to target_value_net q1_target_net.load_state_dict(q1_net.state_dict()) q2_target_net.load_state_dict(q2_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode current_update_index = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. 
""" if t > start_steps: a = policy_net.get_env_action(o, deterministic=False) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer # the multi-step buffer (given to you) will store the data in a fashion that # they can be easily used for multi-step update replay_buffer.store(o, a, r, o2, d, ep_len, max_ep_len, multistep_k, gamma) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 """perform update""" if replay_buffer.size >= batch_size: # get data from replay buffer batch = replay_buffer.sample_batch(batch_size) obs_tensor = Tensor(batch['obs1']) # NOTE: given the multi-step buffer, obs_next_tensor now contains the observation that are # k-step away from current observation obs_next_tensor = Tensor(batch['obs2']) acts_tensor = Tensor(batch['acts']) # NOTE: given the multi-step buffer, rewards tensor now contain the sum of discounted rewards in the next # k steps (or up until termination, if terminated in less than k steps) rews_tensor = Tensor(batch['rews']).unsqueeze(1) # NOTE: given the multi-step buffer, done_tensor now shows whether the data's episode terminated in less # than k steps or not done_tensor = Tensor(batch['done']).unsqueeze(1) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document pseudocode part for reference line numbers indicate lines in pseudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) """get q loss""" with torch.no_grad(): a_tilda_next, _, _, log_prob_a_tilda_next, _, _ = policy_net.forward( obs_next_tensor) q1_next = q1_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) q2_next = q2_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) # TODO: compute the k-step Q estimate (in the form of reward + next Q), don't worry about the entropy terms if use_single_variant: # write code for computing the k-step estimate for the single Q estimate variant case y_q = rews_tensor + (gamma**multistep_k) * ( 1 - done_tensor) * q1_next else: # write code for computing the k-step estimate while using double clipped Q y_q = rews_tensor + (gamma**multistep_k) * ( 1 - done_tensor) * torch.min(q1_next, q2_next) # add the entropy, with a simplied heuristic way # NOTE: you don't need to modify the following 3 lines. 
They deal with entropy terms powers = np.arange(1, multistep_k + 1) entropy_discounted_sum = -sum(gamma**powers) * ( 1 - done_tensor) * alpha * log_prob_a_tilda_next y_q += entropy_discounted_sum # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = mse_criterion(q1_prediction, y_q) q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = mse_criterion(q2_prediction, y_q) """ get policy loss """ a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) # TODO write code here to compute policy loss correctly, for both variants. if use_single_variant: q_policy_part = q1_a_tilda else: q_policy_part = torch.min(q1_a_tilda, q2_a_tilda) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] policy_loss = (alpha * log_prob_a_tilda - q_policy_part).mean() """ alpha loss, update alpha """ if auto_alpha: alpha_loss = -( log_alpha * (log_prob_a_tilda + target_entropy).detach()).mean() alpha_optim.zero_grad() alpha_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(log_alpha, grad_clip) alpha_optim.step() alpha = log_alpha.exp().item() else: alpha_loss = 0 """update networks""" q1_optimizer.zero_grad() q1_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip) q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip) q2_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip) policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(q1_target_net, q1_net, polyak) soft_update_model1_with_model2(q2_target_net, q2_net, polyak) current_update_index += 1 if current_update_index % logger_store_freq == 0: # store diagnostic info to logger logger.store(LossPi=policy_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossAlpha=alpha_loss.item(), Q1Vals=q1_prediction.detach().numpy(), Q2Vals=q2_prediction.detach().numpy(), Alpha=alpha, LogPi=log_prob_a_tilda.detach().numpy()) if d or (ep_len == max_ep_len): """when episode terminates, log info about this episode, then reset""" ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if save_model: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'q1_target_net': q1_target_net.state_dict(), 'q2_target_net': q2_target_net.state_dict(), 'policy_opt': policy_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer, 'log_alpha': log_alpha, 'alpha_opt': alpha_optim, 'target_entropy': target_entropy } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # use joblib.load(fname) to load # Test the performance of the deterministic version of the agent. 
test_agent() # TODO write code here to estimate the bias of the Q networks # recall that we can define the Q bias to be Q value - discounted MC return # initialize another environment that is only used for provide such a bias estimate # store that to logger def estimate_bias(n=1, use_single_variant=False, k=multistep_k): """ run n episodes and calculate the mc_return and estimated_q for each appearing states drop last multistep_k data point of each episode calculate the q bias using mean(mc_return)-mean(estimated_q) return q bias and mean(estimated_q) """ state_num, mc_ret, est_q = 0, 0, 0 for _ in range(n): o, r, d, ep_len, ep_mc_ret, reward_list, q_list = bias_test_env.reset( ), 0, False, 0, 0, [], [] while not (d or (ep_len == max_ep_len)): # Take stochastic actions a = policy_net.get_env_action(o, deterministic=False) q1 = q1_net(torch.cat( [Tensor([o]), Tensor([a])], 1)).item() q2 = q2_net(torch.cat( [Tensor([o]), Tensor([a])], 1)).item() # add estimated q for each state to q_list # if use_single_variant: q_list.append(q1) # else: # q_list.append(min(q1,q2)) o, r, d, _ = bias_test_env.step(a) # store each r in reward_list reward_list.append(r) ep_len += 1 # drop last 200 terms of the reward list and q list reward_list = reward_list[:-200] q_list = q_list[:-200] # calculate the sum of all mc_returns for each state in the episode for i in range(len(reward_list)): powers = np.arange(len(reward_list) - i) ep_mc_ret += sum( (gamma**powers) * np.array(reward_list[i:])) # update mc_ret and est_q mc_ret = (state_num * mc_ret + ep_mc_ret) / (state_num + ep_len) est_q = (state_num * est_q + sum(q_list)) / (state_num + ep_len) # calculate bias return est_q - mc_ret, est_q bias_test_env = env_fn() bias_test_env.seed(seed + 10000) bias_test_env.action_space.np_random.seed(seed + 10000) bias, est_q = estimate_bias(n=1, use_single_variant=use_single_variant, k=multistep_k) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('Alpha', with_min_and_max=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # TODO after you store bias info to logger, you should also write code here to log them # so that you can later plot them logger.log_tabular('QBias', bias) logger.log_tabular('QVals', est_q) logger.log_tabular('K', multistep_k) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() sys.stdout.flush()
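# sac_multistep() above relies on MultistepReplayBuffer to return, for each sampled
# transition, the k-step discounted reward sum, the observation k steps ahead, and a
# done flag for termination within those k steps. A minimal NumPy sketch of how such
# k-step reward sums can be formed from one trajectory (illustrative only; the actual
# buffer implementation may differ):
def k_step_reward_sums_example(rewards, dones, k, gamma):
    """For each step t, return sum_i gamma^i * r_{t+i} over up to k steps,
    truncated at episode termination or the end of the trajectory."""
    T = len(rewards)
    targets = np.zeros(T, dtype=np.float32)
    for t in range(T):
        ret, discount = 0.0, 1.0
        for i in range(k):
            if t + i >= T:
                break
            ret += discount * rewards[t + i]
            discount *= gamma
            if dones[t + i]:
                break
        targets[t] = ret
    return targets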
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) #在轨迹的末尾调用finish进行切断 if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
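# A minimal NumPy sketch (not part of the PPO code above) of the clipped
# surrogate objective that `pi_loss` implements: per sample the objective is
# min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv), and the
# `min_adv = tf.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)`
# construction above is that clipped term written without an explicit clip.
# `ppo_clip_objective` is a hypothetical helper name used only for illustration.
import numpy as np

def ppo_clip_objective(ratio, adv, clip_ratio=0.2):
    """Per-sample clipped surrogate objective (to be maximized)."""
    clipped_ratio = np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio)
    return np.minimum(ratio * adv, clipped_ratio * adv)

# Example: with adv > 0, pushing the ratio above 1 + clip_ratio earns no extra
# objective, so the policy has no incentive to move far from the old policy.
ratio = np.array([0.5, 1.0, 1.5])
adv = np.array([1.0, 1.0, 1.0])
print(ppo_clip_objective(ratio, adv))  # -> [0.5, 1.0, 1.2]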
def train_mnist(steps_per_epoch=100,
                epochs=5,
                lr=1e-3,
                layers=2,
                hidden_size=64,
                logger_kwargs=dict(),
                save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Load and preprocess MNIST data
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train.reshape(-1, 28 * 28) / 255.0

    # Define inputs & main outputs from computation graph
    x_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 28 * 28))
    y_ph = tf.compat.v1.placeholder(tf.int32, shape=(None, ))
    logits = mlp(x_ph, hidden_sizes=[hidden_size] * layers + [10], activation=tf.nn.relu)
    predict = tf.argmax(input=logits, axis=1, output_type=tf.int32)

    # Define loss function, accuracy, and training op
    y = tf.one_hot(y_ph, 10)
    loss = tf.compat.v1.losses.softmax_cross_entropy(y, logits)
    acc = tf.reduce_mean(input_tensor=tf.cast(tf.equal(y_ph, predict), tf.float32))
    train_op = tf.compat.v1.train.AdamOptimizer(learning_rate=lr).minimize(loss)  # use the configured learning rate

    # Prepare session
    sess = tf.compat.v1.Session()
    sess.run(tf.compat.v1.global_variables_initializer())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph},
                          outputs={'logits': logits, 'predict': predict})

    start_time = time.time()

    # Run main training loop
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            idxs = np.random.randint(0, len(x_train), 32)
            feed_dict = {x_ph: x_train[idxs], y_ph: y_train[idxs]}
            outs = sess.run([loss, acc, train_op], feed_dict=feed_dict)
            logger.store(Loss=outs[0], Acc=outs[1])

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state(state_dict=dict(), itr=None)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('Acc', with_min_and_max=True)
        logger.log_tabular('Loss', average_only=True)
        logger.log_tabular('TotalGradientSteps', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
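# A hypothetical usage sketch (not part of the original file): train_mnist can
# be invoked directly with EpochLogger keyword args. The output directory and
# experiment name below are illustrative values only.
if __name__ == '__main__':
    train_mnist(steps_per_epoch=100,
                epochs=5,
                logger_kwargs=dict(output_dir='/tmp/mnist_logger_test',
                                   exp_name='mnist_logger_test'))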