def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """
    Proximal Policy Optimization (by clipping), with early stopping based on
    approximate KL.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states. (The actions the policy outputs.)
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``. (Log-prob of action
                                           | a in state x under the policy.)
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. (Log-prob of the sampled action.)
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!) (V value of state x.)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO. (Actor-critic architecture kwargs.)

        seed (int): Seed for random number generators. (Default 0.)

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch. (4000 per epoch.)

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform. (Default 50.)

        gamma (float): Discount factor. (Always between 0 and 1; default 0.99.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. (Default 0.2.)

        pi_lr (float): Learning rate for policy optimizer. (Default 3e-4.)

        vf_lr (float): Learning rate for value function optimizer. (Default 1e-3.)

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer to
            take fewer than this.) (Default 80.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch. (Default 80.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.) (The lambda of TD(lambda); default 0.97.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.
            (Default 1000.)

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05; default 0.01.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function. (Every 10 epochs by default.)
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()                          # create the environment
    obs_dim = env.observation_space.shape   # observation dimensions
    act_dim = env.action_space.shape        # action dimensions

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    # (takes observations and actions, returns the policy-related tensors)
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer):
    # observation, action, advantage, return, and the previous policy's log-prob
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables and log the parameter counts of policy and critic
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)      # pi(a|s) / pi_old(a|s), new policy over old policy
    # If adv_ph > 0 (advantage is positive), cap the objective at (1 + clip_ratio) * adv_ph;
    # if adv_ph < 0 (advantage is negative), cap it at (1 - clip_ratio) * adv_ph.
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    # Policy loss: the smaller of the unclipped and clipped surrogate objectives,
    # which limits how far the policy can drift in one update.
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)              # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))  # True where the ratio was clipped
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))  # cast to float and average to get the clipped fraction

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        # zip([x_ph, a_ph, adv_ph, ret_ph, logp_old_ph],
        #     [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf])
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        # feed the buffered data and compute both losses and the entropy
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break  # stop policy training early
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)  # train the value network

        # Log changes from update: recompute losses, KL, and clip fraction
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac],
                                             feed_dict=inputs)
        # log the old losses, KL, entropy, clip fraction, and the loss deltas
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0  # reset

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            # get action, value, and logp from the current observation
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            o2, r, d, _ = env.step(a[0])  # interact with the environment and collect the reward
            ep_ret += r                   # accumulate reward
            ep_len += 1                   # one more step

            # save and log: store (obs, action, reward, value, logp) in the buffer
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)  # episode ended or hit the max episode length
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
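# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a minimal NumPy sketch of the
# GAE-Lambda advantage and rewards-to-go computation that PPOBuffer.finish_path
# is assumed to perform with the `gamma` and `lam` arguments above. The helper
# names (discount_cumsum, gae_advantages) are hypothetical, for illustration only.
# ---------------------------------------------------------------------------
import numpy as np

def discount_cumsum(x, discount):
    """Discounted cumulative sum: y[t] = sum_{k >= t} discount**(k - t) * x[k]."""
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae_advantages(rews, vals, gamma=0.99, lam=0.97, last_val=0.0):
    """Compute GAE-Lambda advantages and discounted returns for one trajectory."""
    rews = np.append(rews, last_val)   # bootstrap with the last value estimate
    vals = np.append(vals, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]    # TD residuals
    adv = discount_cumsum(deltas, gamma * lam)           # GAE-Lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]              # rewards-to-go (value targets)
    return adv, ret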
def sqn(env_fn, env_init, ego_agent, opp_agent, opp_action, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-2, alpha=0.2, batch_size=100, start_steps=10000, update_after=4000, update_every=1, num_test_episodes=10, max_ep_len=4000, logger_kwargs=dict(), save_freq=1, lr_period=0.7): """ Soft Q-Network, based on SAC and clipped Double Q-learning Written by Christopher Hsu 05/2020 Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.n # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, alpha, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer if isinstance(env.action_space, Box): a_dim = act_dim elif isinstance(env.action_space, Discrete): a_dim = 1 replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy v1 = ac.q1.values(o2) v2 = ac.q2.values(o2) a2, logp_a2 = ac.pi(v1 + v2, action_mask=ego_agent.aval_paths) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) #Unsqueeze adds another dim, necessary to be column vectors backup = r.unsqueeze(1) + gamma * (1 - d).unsqueeze(1) * ( q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, lr_iter): # Update learning rate with cosine schedule lr = np.clip( 0.005 * np.cos(np.pi * lr_iter / (total_steps * lr_period)) + 0.00501, 1e-2, 1e-5) q_optimizer.param_groups[0]['lr'] = lr # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, action_mask, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), action_mask, deterministic) def test_agent(): for j in range(num_test_episodes): d, ep_ret, ep_len = False, 0, 0 init_positions = np.random.random_integers(0, 1) o = test_env.reset({ 'x': env_init['initial_x'][init_positions], 'y': env_init['initial_y'], 'theta': env_init['initial_theta'] }) #Convert o to RL obs RLobs = ego_agent.process_obs(o) Oppobs = opp_agent.process_obs(o) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = get_action(RLobs, action_mask=ego_agent.aval_paths, deterministic=True) #RL action to drive control actions ego_speed, ego_steer, a = ego_agent.plan(o, a) #Opponent decision a_opp = opp_action(Oppobs, action_mask=opp_agent.aval_paths, deterministic=True) # a_opp = 7 opp_speed, opp_steer, _ = opp_agent.plan(o, a_opp) #Action dict action = { 'ego_idx': 0, 'speed': [ego_speed, opp_speed], 'steer': [ego_steer, opp_steer] } o, r, d, _ = test_env.step(action) #Convert o to RL obs RLobs = ego_agent.process_obs(o) Oppobs = opp_agent.process_obs(o) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() init_positions = np.random.random_integers(0, 1) o, ep_ret, ep_len = env.reset({ 'x': env_init['initial_x'][init_positions], 'y': env_init['initial_y'], 'theta': env_init['initial_theta'] }), 0, 0 #Convert o to RL obs RLobs = ego_agent.process_obs(o) Oppobs = opp_agent.process_obs(o) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(RLobs, action_mask=ego_agent.aval_paths, deterministic=False) else: try: a = random.choice(tuple(ego_agent.aval_paths)) except: #happens when there are no paths available a = 15 #RL action to drive control actions ego_speed, ego_steer, a = ego_agent.plan(o, a) #Opponent decision a_opp = opp_action(Oppobs, action_mask=opp_agent.aval_paths, deterministic=True) # a_opp = 7 opp_speed, opp_steer, _ = opp_agent.plan(o, a_opp) #Action dict action = { 'ego_idx': 0, 'speed': [ego_speed, opp_speed], 'steer': [ego_steer, opp_steer] } # Step the env o2, r, d, _ = env.step(action) ep_ret += r ep_len += 1 #Convert o2 to RLobs2 RLobs2 = ego_agent.process_obs(o2) Oppobs2 = opp_agent.process_obs(o2) # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer # replay_buffer.store(o, a, r, o2, d) replay_buffer.store(RLobs, a, r, RLobs2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
RLobs = RLobs2 Oppobs = Oppobs2 o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) init_positions = np.random.random_integers(0, 1) o, ep_ret, ep_len = env.reset({ 'x': env_init['initial_x'][init_positions], 'y': env_init['initial_y'], 'theta': env_init['initial_theta'] }), 0, 0 #Convert o to RL obs RLobs = ego_agent.process_obs(o) Oppobs = opp_agent.process_obs(o) # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): #Cosine learning rate schedule if t < total_steps * (1 - lr_period): lr_iter = 0 else: lr_iter = t - total_steps * (1 - lr_period) batch = replay_buffer.sample_batch(batch_size) update(data=batch, lr_iter=lr_iter) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): if epoch == epochs: logger.save_state({'env': env}, None) else: #SpinningUp saving style logger.save_state({'env': env}, epoch) #Standard pytorch way of saving fpath = logger_kwargs['output_dir'] + '/state_dict/' os.makedirs(fpath, exist_ok=True) torch.save(ac.state_dict(), fpath + 'model%d.pt' % epoch) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
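# ---------------------------------------------------------------------------
# Illustration (not part of the original code): the cosine learning-rate
# schedule used inside sqn's update() above, as a standalone sketch. The
# original call passes np.clip(..., 1e-2, 1e-5); this sketch assumes the
# intended bounds are lr_min=1e-5 and lr_max=1e-2, since np.clip expects the
# lower bound before the upper bound.
# ---------------------------------------------------------------------------
import numpy as np

def cosine_lr(lr_iter, total_steps, lr_period=0.7, lr_min=1e-5, lr_max=1e-2):
    """Half-cosine decay over the final lr_period fraction of training;
    lr_iter counts steps since the schedule started."""
    lr = 0.005 * np.cos(np.pi * lr_iter / (total_steps * lr_period)) + 0.00501
    return float(np.clip(lr, lr_min, lr_max))

# Example: at lr_iter=0 the rate is ~1e-2; it decays towards ~1e-5 as lr_iter
# approaches total_steps * lr_period.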
def sac(env_fn, actor_critic=a_in_mlp_actor_critic, logger_kwargs=dict(), network_params=dict(), rl_params=dict()): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # control params seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] save_freq = rl_params['save_freq'] render = rl_params['render'] # rl params gamma = rl_params['gamma'] polyak = rl_params['polyak'] lr = rl_params['lr'] state_hist_n = rl_params['state_hist_n'] grad_clip_val = rl_params['grad_clip_val'] # entropy params alpha = rl_params['alpha'] target_entropy_start = rl_params['target_entropy_start'] target_entropy_stop = rl_params['target_entropy_stop'] target_entropy_steps = rl_params['target_entropy_steps'] train_env, test_env = env_fn(), env_fn() obs = train_env.observation_space act = train_env.action_space tf.set_random_seed(seed) np.random.seed(seed) train_env.seed(seed) train_env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) try: obs_dim = obs.n observation_type = 'Discrete' except AttributeError as e: obs_dim = obs.shape[0] observation_type = 'Box' obs_dim = obs_dim act_dim = act.n # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim * state_hist_n, act_dim=act_dim, size=replay_size) # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=state_hist_n) test_state_buffer = StateBuffer(m=state_hist_n) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim * state_hist_n, act_dim, obs_dim * state_hist_n, None, None) # alpha and entropy setup max_target_entropy = tf.log(tf.cast(act_dim, tf.float32)) target_entropy_prop_ph = tf.placeholder(dtype=tf.float32, shape=()) target_entropy = max_target_entropy * target_entropy_prop_ph log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi_dist, logp_pi, q1_a, q2_a = actor_critic( x_ph, a_ph, **network_params) # sample an action from the gumbel distribution pi = tf.argmax(pi_dist, axis=-1) # policy_dist = tf.distributions.Categorical(probs=pi_dist) # pi = policy_dist.sample() with tf.variable_scope('main', reuse=True): # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi = actor_critic(x_ph, pi_dist, **network_params) # get actions and log probs of actions for next states, for Q-learning _, pi_dist_next, logp_pi_next, _, _ = actor_critic( x2_ph, a_ph, **network_params) # Target value network with tf.variable_scope('target'): _, _, _, q1_pi_targ, q2_pi_targ = actor_critic(x2_ph, pi_dist_next, **network_params) # Count variables var_counts = tuple( count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t alpha: %d, \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * (min_q_pi_targ - alpha * logp_pi_next)) # critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) 
value_loss = q1_loss + q2_loss # Soft actor losses pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi) # alpha loss for temperature parameter alpha_backup = tf.stop_gradient(logp_pi + target_entropy) alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup) # Policy train op # (has to be separate from value train op, because q1_logits appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) if grad_clip_val is not None: gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients( value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize( value_loss, var_list=get_vars('main/q')) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize( alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1_a, 'q2': q2_a }) def get_action(state, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: [state]})[0] def reset(env, state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o = process_observation(o, obs_dim, observation_type) r = process_reward(r) state = state_buffer.init_state(init_obs=o) return o, r, d, ep_ret, ep_len, state def test_agent(n=10, render=True): global sess, mu, pi, q1_a, q2_a, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len, test_state = reset( test_env, test_state_buffer) if render: test_env.render() while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(test_state, True)) o = process_observation(o, obs_dim, observation_type) r = process_reward(r) test_state = test_state_buffer.append_state(o) ep_ret += r ep_len += 1 if render: test_env.render() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) if render: test_env.close() start_time = time.time() o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer) total_steps = steps_per_epoch * epochs 
target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(state) else: a = train_env.action_space.sample() # Step the env o2, r, d, _ = train_env.step(a) o2 = process_observation(o2, obs_dim, observation_type) a = process_action(a, act_dim) r = process_reward(r) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(state, a, r, next_state, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 state = next_state if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], target_entropy_prop_ph: target_entropy_prop } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # update target entropy every epoch target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': train_env}, None) # Test the performance of the deterministic version of the agent. test_agent(n=10, render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
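# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a minimal sketch of the kind
# of interpolation that `linear_anneal` above is assumed to perform for the
# target-entropy proportion (the actual implementation is defined elsewhere).
# ---------------------------------------------------------------------------
import numpy as np

def linear_anneal_sketch(current_step, start, stop, steps):
    """Linearly interpolate from `start` to `stop` over `steps` environment
    steps, then hold at `stop`."""
    frac = np.clip(current_step / float(steps), 0.0, 1.0)
    return start + frac * (stop - start)

# Example: with start=1.0 and stop=0.3, the target entropy decays from the
# maximum entropy log(act_dim) towards 0.3 * log(act_dim) over `steps` steps.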
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=200000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.005, batch_size=1000, start_steps=10000, max_ep_len=2000, logger_kwargs=dict(), save_freq=1, env_init=dict(), env_name='unknown', nb_test_episodes=50, train_freq=10, Teacher=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) hyperparams = locals() if Teacher: del hyperparams[ 'Teacher'] # remove teacher to avoid serialization error logger.save_config(hyperparams) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() # initialize environment (choose between short, default or quadrupedal walker) if len(env_init.items()) > 0: env.env.my_init(env_init) test_env.env.my_init(env_init) if Teacher: Teacher.set_env_params(env) env.reset() obs_dim = env.env.observation_space.shape[0] print(obs_dim) act_dim = env.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def 
get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): if Teacher: Teacher.set_test_env_params(test_env) o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) if Teacher: Teacher.record_test_episode(ep_ret, ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(np.ceil(ep_len / train_freq).astype('int')): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) if Teacher: Teacher.record_train_episode(ep_ret, ep_len) Teacher.set_env_params(env) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) #itr=epoch) # Test the performance of the deterministic version of the agent. test_agent(n=nb_test_episodes) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t + 1) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Pickle parameterized env data #print(logger.output_dir+'/env_params_save.pkl') if Teacher: Teacher.dump(logger.output_dir + '/env_params_save.pkl')
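# ---------------------------------------------------------------------------
# Illustration (not part of the original code): one plausible way to invoke
# the sac() function above. The environment name, hyperparameters, output
# directory, and Teacher setting here are placeholders, not values used by
# the original authors.
# ---------------------------------------------------------------------------
import gym

if __name__ == '__main__':
    env_name = 'BipedalWalker-v3'   # placeholder walker environment
    logger_kwargs = dict(output_dir='data/sac_run', exp_name='sac_run')

    sac(lambda: gym.make(env_name),
        ac_kwargs=dict(hidden_sizes=(256, 256)),
        steps_per_epoch=200000, epochs=100,
        logger_kwargs=logger_kwargs,
        env_name=env_name,
        nb_test_episodes=50,
        Teacher=None)               # pass a Teacher object to enable curriculum learning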
def ppo_pyco_multi(gym_or_pyco, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=10000, epochs=1000, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=10000, num_copy=3, target_kl=0.01, logger_kwargs=dict(), save_freq=10, tensorboard_path = '/home/clement/spinningup/tensorboard'): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
tensorboard_path: The path to the saved graphs&scalars in tensorboard """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) dict_continous_gym = ['CarRacing', 'LunarLander','Pong','AirRaid','Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon','Numberlink'] dict_discrete_gym = [] env = env_fn() dict_gym = ['CarRacing', 'LunarLander','Pong','AirRaid','Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon','Numberlink'] # This code is specific for pycolab if gym_or_pyco == 'gym': #env_dict = {} #for i in range(num_copy): # env_dict["env_{}".format(i)] = env env_list = [] for i in range(num_copy): env_list.append(env) else: env_list = [] for i in range(num_copy): env_list.append(env()) env_list[i].reset() obs_dim = env().observation_space.shape # act_dim = env.action_space.n if env().action_space == 4: act_dim = env().action_space else: act_dim = env().action_space.n # Share information about action space with policy architecture ac_kwargs['action_space'] = env().action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) if gym_or_pyco == 'pyco': #x_ph = tf.placeholder(tf.float32, shape=( num_copy, obs_dim[0], obs_dim[1], 1)) x_ph = tf.placeholder(tf.float32, shape=( 1, obs_dim[0]*num_copy, obs_dim[1], 1)) else: x_ph = tf.placeholder(tf.float32, shape=( 1, obs_dim[0]*num_copy, obs_dim[1], obs_dim[2])) # a_ph = core.placeholders_from_spaces(env.action_space) if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): a_ph = tf.placeholder(tf.uint8, shape=(num_copy)) elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box): a_ph = tf.placeholder(tf.float32, shape=(env().action_space.shape[0])) else: a_ph = tf.placeholder(tf.uint8, shape=(num_copy)) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph # pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, **ac_kwargs) # actor_critic with relational policy # pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, 
policy='relational_categorical_policy', action_space = env.action_space.n, **ac_kwargs) if gym_or_pyco == 'gym' and isinstance(env().action_space, Discrete): pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy', action_space=env().action_space.n) elif gym_or_pyco == 'gym' and isinstance(env().action_space, Box): pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, policy='relational_gaussian_policy', action_space=env().action_space.shape[0]) else: pi, logp1, logp2, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy', action_space=env().action_space.n, num_copy = num_copy ) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(gym_or_pyco, obs_dim, act_dim, local_steps_per_epoch, gamma, lam, num_copy) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = (tf.exp(logp1 - logp_old_ph) + tf.exp(logp2 - logp_old_ph))/2 # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # tensorboard test with tf.name_scope('pi_loss'): core.variable_summaries(pi_loss) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = (tf.reduce_mean(logp_old_ph - logp1)+tf.reduce_mean(logp_old_ph - logp2))/2 # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp1)+tf.reduce_mean(-logp2) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() # sess = tf_debug.LocalCLIDebugWrapperSession(sess) # tensorboard merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter(tensorboard_path + '/train', sess.graph) test_writer = tf.summary.FileWriter(tensorboard_path + '/test') sess.run(tf.global_variables_initializer()) # saver.restore(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt") # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(epoch): # inputs = {k:v for k,v in zip(all_phs, buf.get())} if gym_or_pyco == 'gym' and isinstance(env().action_space, Discrete): pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o_feed, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) if gym_or_pyco == 'gym' and isinstance(env().action_space, Box): pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o_feed, a_ph: a_multi[0], adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) else: pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o_feed, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) summary = 
tf.Summary(value=[tf.Summary.Value(tag="loss", simple_value=pi_l_old)]) test_writer.add_summary(summary, epoch) # Training for i in range(train_pi_iters): if gym_or_pyco == 'gym' and isinstance(env().action_space, Discrete): _, kl = sess.run([train_pi, approx_kl], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.' % i) break if gym_or_pyco == 'gym' and isinstance(env().action_space, Box): _, kl = sess.run([train_pi, approx_kl], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a_multi[0], adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.' % i) break else: _, kl = sess.run([train_pi, approx_kl], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o_feed, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): if gym_or_pyco == 'gym' and isinstance(env().action_space, Discrete): sess.run(train_v, feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) if gym_or_pyco == 'gym' and isinstance(env().action_space, Box): sess.run(train_v, feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a_multi[0], adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) else: sess.run(train_v, feed_dict={logp_old_ph: buf.logp_buf, x_ph: o_feed, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) # Log changes from update if gym_or_pyco == 'gym' and isinstance(env().action_space, Discrete): pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) if gym_or_pyco == 'gym' and isinstance(env().action_space, Box): pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a_multi[0], adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) else: pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict={logp_old_ph: buf.logp_buf, x_ph: o_feed, a_ph: a_multi, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf}) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len_list = env().reset(), np.zeros(num_copy), np.zeros(num_copy,dtype=bool), np.zeros(num_copy), np.zeros(num_copy) if gym_or_pyco == 'gym': o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2]) else: o = rgb_input_pyco(o, obs_dim) o = o.reshape(1, obs_dim[0], obs_dim[1], 1) o_feed = np.concatenate((o,o)) #o_feed = o_feed.reshape(1, obs_dim[0]*num_copy, obs_dim[1], 1) obs_buf = o_feed for i in range(num_copy - 2): obs_buf = np.concatenate((obs_buf, o)) obs_buf = obs_buf.reshape(1, obs_dim[0] * num_copy, obs_dim[1], 1) o_feed = obs_buf #o_feed = o_feed.reshape(num_copy, obs_dim[0], obs_dim[1], 1) # now we have several starting points # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): num_ep = 0 summary_ep = [] for t in range(local_steps_per_epoch): # a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.board.reshape(1, -1)}) a_multi, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o_feed}) a_multi = 
np.repeat(a_multi[0],num_copy) # save and log # buf.store(o.board.reshape(1,-1), a, r, v_t, logp_t) # obs, act, rew, val, logp logger.store(VVals=v_t) #o, r, d, _ = env.step(a[0]) o_feed_temp=[] for i in range(num_copy): o, r[i], d[i], _ = env_list[i].step(a_multi[i]) if gym_or_pyco == 'pyco': o = rgb_input_pyco(o, obs_dim) o = o.reshape(1, obs_dim[0], obs_dim[1], 1) o_feed_temp.append(o) else: o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2]) if r is None: ep_ret[i] += 0 r[i] = 0 else: ep_ret[i] += r[i] ep_len_list[i] += 1 terminal = d[i] or (sum(ep_len_list) == max_ep_len) if terminal : num_ep += 1 logger.store(EpRet=ep_ret[i], EpLen=ep_len_list[i]) summary_ep = summary_ep + [ep_ret[i]] o, r[i], d[i], ep_ret[i], ep_len_list[i] = env_list[i].reset(), 0, False, 0, 0 #if terminal or (t == local_steps_per_epoch - 1): if (t == local_steps_per_epoch - 1): num_ep += 2 for i in range(num_copy): if not d[i]: print('Warning: trajectory cut off by epoch at %d steps.' % ep_len_list[i]) # if trajectory didn't reach terminal state, bootstrap value target last_val = r[i] if d[i] else sess.run(v, feed_dict={x_ph: o_feed}) buf.finish_path(last_val,i) if d[i]: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=sum(ep_len_list)) summary_ep = summary_ep + [ep_ret] # summary = tf.Summary(value=[tf.Summary.Value(tag="mean_ep_ret", simple_value=summary_ep)]) # test_writer.add_summary(summary, num_ep) o, r[i], d[i], ep_ret[i], ep_len_list[i] = env_list[i].reset(), 0, False, 0, 0 if gym_or_pyco == 'gym': o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2]) else: o = rgb_input_pyco(o, obs_dim) o = o.reshape(1, obs_dim[0], obs_dim[1], 1) o_feed = np.concatenate((o, o)) # o_feed = o_feed.reshape(1, obs_dim[0]*num_copy, obs_dim[1], 1) obs_buf = o_feed for i in range(num_copy - 2): obs_buf = np.concatenate((obs_buf, o)) obs_buf = obs_buf.reshape(1, obs_dim[0] * num_copy, obs_dim[1], 1) o_feed = obs_buf # o_feed = o_feed.reshape(num_copy, obs_dim[0], obs_dim[1], 1) # now we have several starting points # now we have several starting points o_feed = np.concatenate((o_feed_temp[0], o_feed_temp[1])) obs_buf = o_feed for i in range(num_copy - 2): obs_buf = np.concatenate((obs_buf, o_feed_temp[i + 2])) obs_buf = obs_buf.reshape(1, obs_dim[0] * num_copy, obs_dim[1], 1) o_feed = obs_buf #obs_buf=np.concatenate((o_feed_temp[0],o_feed_temp[1])) #for i in range(num_copy-2): # obs_buf=np.concatenate((obs_buf,o_feed[i+2])) #obs_buf = obs_buf.reshape( 1, obs_dim[0]*num_copy, obs_dim[1], 1) buf.store(obs_buf, a_multi, r, v_t, logp_t,num_copy) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
min_summary_ep = min(summary_ep) max_summary_ep = max(summary_ep) summary_ep = np.mean(summary_ep) value_summary = np.mean(buf.val_buf) summary = tf.Summary(value=[tf.Summary.Value(tag="mean_ep_ret", simple_value=summary_ep)]) test_writer.add_summary(summary, epoch) summary = tf.Summary(value=[tf.Summary.Value(tag="min_ep_ret", simple_value=min_summary_ep)]) test_writer.add_summary(summary, epoch) summary = tf.Summary(value=[tf.Summary.Value(tag="max_ep_ret", simple_value=max_summary_ep)]) test_writer.add_summary(summary, epoch) summary = tf.Summary(value=[tf.Summary.Value(tag="mean_value", simple_value=value_summary)]) test_writer.add_summary(summary, epoch) update(epoch) #saver = tf.train.Saver() #save_path = saver.save(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt") # If you want to reload saved variables : # with tf.Session() as sess: # Restore variables from disk. # saver.restore(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt") # since I changed my sess.run i have to reset the buffer myself: buf.get() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
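# ---------------------------------------------------------------------------
# Illustration (not part of the original code): a small sketch of how
# ppo_pyco_multi above stacks the observations of `num_copy` environment
# copies along the height axis before feeding them to the single placeholder
# x_ph of shape (1, obs_dim[0]*num_copy, obs_dim[1], channels).
# ---------------------------------------------------------------------------
import numpy as np

def stack_copies(frames, obs_dim, num_copy, channels=1):
    """frames: list of num_copy observations, each shaped (1, H, W, channels).
    Concatenate along the batch axis, then fold the copies into the height
    axis so the result matches x_ph's shape (1, H*num_copy, W, channels)."""
    H, W = obs_dim[0], obs_dim[1]
    stacked = np.concatenate(frames, axis=0)               # (num_copy, H, W, C)
    return stacked.reshape(1, H * num_copy, W, channels)   # (1, H*num_copy, W, C)

# Example with hypothetical 10x12 single-channel pycolab frames:
frames = [np.zeros((1, 10, 12, 1), dtype=np.float32) for _ in range(3)]
x_feed = stack_copies(frames, obs_dim=(10, 12), num_copy=3)  # shape (1, 30, 12, 1)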
def __init__(self, env, EP_MAX=1000, EP_LEN=500, GAMMA=0.99, AR=0.0001, CR=0.0001, BATCH=32, UPDATE_STEP=10, hidden_sizes=(64, 64), activation=tf.tanh, output_activation=tf.tanh, act_noise_amount=0.01, logger_kwargs=dict()): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.EP_MAX = EP_MAX self.EP_LEN = EP_LEN self.GAMMA = GAMMA self.BATCH = BATCH self.UPDATE_STEP = UPDATE_STEP self.S_DIM = env.observation_space.shape[-1] self.A_DIM = env.action_space.shape[-1] self.act_high = env.action_space.high self.act_low = env.action_space.low self.hidden_sizes = hidden_sizes self.activation = activation self.output_activation = output_activation self.act_noise_amount = act_noise_amount # self.sess = tf.Session() self.PROJECT_ROOT = logger_kwargs['output_dir'] self.save_times = 0 # self.sess = sess self.tfs = tf.placeholder(tf.float32, [None, self.S_DIM], 'state') self.tfa = tf.placeholder(tf.float32, [None, self.A_DIM], 'action') # policy and value self.tfr = tf.placeholder(tf.float32, [ None, ], 'reward_to_go') # ac_kwargs['action_space'] = env.action_space # ac_kwargs['hidden_sizes'] = (64,64) # self.pi, self.logp, self.logp_pi, self.v = core.mlp_actor_critic(self.tfs, self.tfa, **ac_kwargs) self.pi, _ = self._build_net('pi', trainable=True) with tf.variable_scope('v'): self.v = tf.squeeze(mlp(self.tfs, list(self.hidden_sizes) + [1], self.activation, None), axis=1) # with tf.variable_scope('q1'): # q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) with tf.variable_scope('loss'): self.pi_loss = tf.reduce_mean(tf.square(self.pi - self.tfa)) self.v_loss = tf.reduce_mean((self.tfr - self.v)**2) # self.q_loss = tf.reduce_mean((self.tfr - self.v)**2) with tf.variable_scope('train'): self.train_pi = tf.train.AdamOptimizer(AR).minimize(self.pi_loss) self.train_v = tf.train.AdamOptimizer(CR).minimize(self.v_loss) pi_ref_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='main/pi') self.saver = tf.train.Saver(pi_ref_vars) self.sess = tf.Session() tf.summary.FileWriter("log/", self.sess.graph) self.sess.run(tf.global_variables_initializer()) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # # Setup model saving self.logger.setup_tf_saver(self.sess, inputs={ 'x': self.tfs, 'a': self.tfa }, outputs={ 'pi': self.pi, 'v': self.v })
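# The class above keeps act_noise_amount together with the action bounds
# (act_high / act_low), but the action-selection code is not part of this
# excerpt. Below is a minimal sketch of the usual pattern such a setup implies:
# perturb the deterministic policy output with Gaussian noise scaled to the
# action range and clip back into the action space. The helper name, and the
# choice to scale noise by (act_high - act_low), are assumptions for
# illustration, not the class's actual behaviour.
import numpy as np


def noisy_action_sketch(pi_out, act_low, act_high, noise_amount=0.01):
    """Add scaled Gaussian exploration noise to an action and clip to bounds."""
    noise = noise_amount * (act_high - act_low) * np.random.randn(*np.shape(pi_out))
    return np.clip(pi_out + noise, act_low, act_high)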
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, episodes_per_epoch=None, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, custom_h=None, eval_episodes=50, do_checkpoint_eval=False, env_name=None, eval_temp=1.0, train_starting_temp=1.0, env_version=None, env_input=None, target_arcs=None, early_stop_epochs=None, save_all_eval=False, meta_learning=False, finetune=False, finetune_model_path=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # create logger for training logger = EpochLogger(meta_learning_or_finetune=(finetune or meta_learning), **logger_kwargs) logger.save_config(locals()) # create logger for evaluation to keep track of evaluation values at each checkpoint (or save frequency) # using eval_progress.txt. It is different from the logger_eval used inside one evaluation epoch. 
logger_eval_progress = EpochLogger(output_fname='progress_eval.txt', **logger_kwargs) # create logger for evaluation and save best performance, best structure, and best model in simple_save999999 logger_eval = EpochLogger(**dict( exp_name=logger_kwargs['exp_name'], output_dir=os.path.join(logger.output_dir, "simple_save999999"))) # create logger for tensorboard tb_logdir = "{}/tb_logs/".format(logger.output_dir) tb_logger = Logger(log_dir=tb_logdir) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) logger.log('set tf and np random seed = {}'.format(seed)) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space if custom_h is not None: hidden_layers_str_list = custom_h.split('-') hidden_layers_int_list = [int(h) for h in hidden_layers_str_list] ac_kwargs['hidden_sizes'] = hidden_layers_int_list # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the # whole GPU memory config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) # log tf graph tf.summary.FileWriter(tb_logdir, sess.graph) if not finetune: # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) temperature_ph = tf.placeholder(tf.float32, shape=(), name="init") # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, temperature_ph, **ac_kwargs) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = tf.compat.v1.train.AdamOptimizer( learning_rate=pi_lr).minimize(pi_loss, name='train_pi') train_v = tf.compat.v1.train.AdamOptimizer( learning_rate=vf_lr).minimize(v_loss, name='train_v') sess.run(tf.global_variables_initializer()) else: # do finetuning -- load model from meta_model_path assert finetune_model_path is not None, "Please specify the path to the meta learnt model using --finetune_model_path" if 'simple_save' in finetune_model_path: model = restore_tf_graph(sess, fpath=finetune_model_path, meta_learning_or_finetune=finetune) else: model = restore_tf_graph(sess, fpath=finetune_model_path + '/simple_save999999', meta_learning_or_finetune=finetune) # get placeholders x_ph, a_ph, adv_ph = model['x'], model['a'], model['adv'] ret_ph, logp_old_ph, temperature_ph = model['ret'], model[ 'logp_old'], model['temperature'] # get model output pi, logp, logp_pi, v = model['pi'], model['logp'], model[ 'logp_pi'], model['v'] pi_loss, v_loss = model['pi_loss'], model['v_loss'] approx_kl, approx_ent, clipfrac = model['approx_kl'], model[ 'approx_ent'], model['clipfrac'] # get Optimizers train_pi = model['train_pi'] train_v = model['train_v'] # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, 
ret_ph, logp_old_ph, temperature_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # # log tf graph # tf.summary.FileWriter(tb_logdir, sess.graph) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph, 'adv': adv_ph, 'ret': ret_ph, 'logp_old': logp_old_ph, 'temperature': temperature_ph }, outputs={ 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi, 'pi_loss': pi_loss, 'v_loss': v_loss, 'approx_kl': approx_kl, 'approx_ent': approx_ent, 'clipfrac': clipfrac }) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_len_normalized = env.reset( ), 0, False, 0, 0, 0, [] # initialize variables for keeping track of BEST eval performance best_eval_AverageEpRet = -0.05 # a negative value so that best model is saved at least once. best_eval_StdEpRet = 1.0e30 # below are used for early-stop. We early stop if # 1) a best model has been saved, and, # 2) 50 epochs have passed without a new save saved = False early_stop_count_started = False episode_count_after_saved = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): current_temp = _get_current_temperature(epoch, epochs, train_starting_temp) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={ x_ph: o.reshape(1, -1), temperature_ph: current_temp }) # save and log buf.store(o, a, r, v_t, logp_t, current_temp) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 if env_version >= 4: ep_len_normalized.append(ep_len / env.allowed_steps) if env.action_is_dummy: # a is dummy action ep_dummy_action_count += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not terminal: print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={ x_ph: o.reshape(1, -1), temperature_ph: current_temp }) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if env_version >= 4: logger.store(EpDummyCount=ep_dummy_action_count) logger.store(EpTotalArcs=env.adjacency_matrix.sum()) assert len(ep_len_normalized) > 0 ep_len_normalized = np.asarray( ep_len_normalized, dtype=np.float32).mean() logger.store(EpDummyStepsNormalized=ep_len_normalized) o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_len_normalized = env.reset( ), 0, False, 0, 0, 0, [] # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): if meta_learning: # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save. logger.save_state({'env_name': env_name}, epoch) else: # Save a new model every save_freq and at the last epoch. Only keep one copy - the current model logger.save_state({'env_name': env_name}) # Evaluate and save best model if do_checkpoint_eval and epoch > 0: # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999. # Doing this way, I can use test_policy and plot directly to test the best models. # saved best models includes: # 1) a copy of the env_name # 2) the best rl model with parameters # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch # note that 1) and 2) are spinningup defaults, and 3) is a custom save best_eval_AverageEpRet, best_eval_StdEpRet, saved = eval_and_save_best_model( best_eval_AverageEpRet, best_eval_StdEpRet, # a new best logger is created and passed in so that the new logger can leverage the directory # structure without messing up the logger in the training loop # eval_logger=EpochLogger(**dict( # exp_name=logger_kwargs['exp_name'], # output_dir=os.path.join(logger.output_dir, "simple_save999999"))), eval_logger=logger_eval, train_logger=logger, eval_progress_logger=logger_eval_progress, tb_logger=tb_logger, epoch=epoch, # the env_name is passed in so that to create an env when and where it is needed. This is to # logx.save_state() error where an env pointer cannot be pickled env_name="F{}x{}T{}_SP{}_v{}".format( env.n_plant, env.n_product, env.target_arcs, env.n_sample, env_version) if env_version >= 3 else env_name, env_version=env_version, env_input=env_input, render= False, # change this to True if you want to visualize how arcs are added during evaluation target_arcs=env.input_target_arcs, get_action=lambda x: sess.run(pi, feed_dict={ x_ph: x[None, :], temperature_ph: eval_temp })[0], # number of samples to draw when simulate demand n_sample=5000, num_episodes=eval_episodes, seed=seed, save_all_eval=save_all_eval) # Perform PPO update! 
update() # # # Log into tensorboard log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="ClipFrac", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="StopIter", with_min_and_max=False) tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch) tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch) tb_logger.log_scalar(tag="epoch_temp", value=current_temp, step=epoch) if env_version >= 4: log_key_to_tb(tb_logger, logger, epoch, key="EpDummyCount", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="EpTotalArcs", with_min_and_max=False) if 'EpDummyStepsNormalized' in logger.epoch_dict.keys(): if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0: log_key_to_tb(tb_logger, logger, epoch, key="EpDummyStepsNormalized", with_min_and_max=False) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('EpochTemp', current_temp) if env_version >= 4: logger.log_tabular('EpDummyCount', with_min_and_max=True) if 'EpDummyStepsNormalized' in logger.epoch_dict.keys(): if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0: logger.log_tabular('EpDummyStepsNormalized', average_only=True) logger.log_tabular('EpTotalArcs', average_only=True) logger.dump_tabular() if early_stop_epochs > 0: # check for early stop if saved: # start to count the episodes elapsed after a "saved" event early_stop_count_started = True # reset the count to 0 episode_count_after_saved = 0 else: # check whether we should count this episode, i.e., whether early_stop_count_started == True if early_stop_count_started: episode_count_after_saved += 1 if episode_count_after_saved > early_stop_epochs: logger.log('Early Stopped at epoch {}.'.format(epoch), color='cyan') break
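# _get_current_temperature() is called in the training loop above but is not
# defined in this excerpt. One plausible sketch is a linear anneal from
# train_starting_temp down to 1.0 (the default eval_temp) over the epochs; the
# author's actual schedule may differ, so treat this purely as an illustration.
def _get_current_temperature_sketch(epoch, total_epochs, starting_temp, final_temp=1.0):
    """Linearly interpolate the softmax temperature from starting_temp to final_temp."""
    if total_epochs <= 1:
        return final_temp
    frac = epoch / float(total_epochs - 1)  # 0.0 at the first epoch, 1.0 at the last
    return starting_temp + frac * (final_temp - starting_temp)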
def PPO_with_bc(env_fn, traj_dir, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000, target_kl=0.01, logger_kwargs=dict(), save_freq=50, r_env_ratio=0, pretrain_bc=True, bc_itr=1e4): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size) #!add Discriminator object e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',') #!何故かカンマなしのcsv e_act = np.loadtxt(traj_dir + '/actions.csv', delimiter=',') #Demo treajectory print(e_obs.shape, e_act.shape) assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, pi_std, entropy, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) #ret_phには累積報酬のバッファが入る # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) obs_def_shape = 1 if obs_dim == () else obs_dim[0] #1次元にも対応 act_def_shape = 1 if act_dim == () else act_dim[0] e_obs_bc = e_obs.reshape(-1, obs_def_shape) e_act_bc = e_act.reshape(-1, act_def_shape) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get()) } #all_phsは各バッファーに対応するプレースホルダー辞書 pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training#ここも変える必要あり? おそらく変えなくて良い for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: #更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): #vの更新 sess.run(train_v, feed_dict=inputs) # Log changes from update(新しいロスの計算) pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs) logger.store( LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), #更新での改善量 DeltaLossV=(v_l_new - v_l_old), Std=std) start_time = time.time() if pretrain_bc: BC.learn(e_obs_bc, e_act_bc, max_itr=bc_itr) o, r, d, ep_ret_task, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ''' if epoch >45: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): ''' if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) ''' #!add discriminator train '''#終端も加えるならアリッチャあり o_reshape = o.reshape(core.combined_shape(1,obs_dim)) a_reshape = a.reshape(core.combined_shape(1,act_dim)) agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#!o を(obspace,)→(1,obspace)に変換してからアペンド agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)#終端での状態行動対も加えてDを学習 ''' agent_obs = buf.obs_buf[buf.path_slice()] agent_act = buf.act_buf[buf.path_slice()] if d: # if trajectory didn't reach terminal state, bootstrap value target last_val = r else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1) }) #v_last=...だったけどこれで良さげ #!until here buf.finish_path( last_val) #これの前にbuf.finish_add_r_vがなされていることを確認すべし if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret_task, EpLen=ep_len) o, r, d, ep_ret_task, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update! for _t in range(train_pi_iters // 3): D.train(sess, e_obs, e_act, agent_obs, agent_act) update() js_d = D.get_js_div(sess, e_obs, e_act, agent_obs, agent_act) logger.store(JS=js_d) # Log info about epoch #if epoch%10 == 0:#logger print each 10 epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('JS', average_only=True) logger.log_tabular('Std', average_only=True) logger.dump_tabular()
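# D.get_js_div() is used above to monitor how close the agent's state-action
# distribution is to the expert's, but the Discriminator class is not shown in
# this excerpt. A common way to estimate the Jensen-Shannon divergence from a
# (near-optimal) discriminator that outputs the probability a pair came from
# the expert is sketched below; this is an illustration under that assumption,
# not the author's implementation.
import numpy as np


def js_divergence_estimate_sketch(d_expert, d_agent, eps=1e-8):
    """JS(p_expert || p_agent) ~= log 2 + 0.5 * (E_expert[log D] + E_agent[log(1 - D)]).

    d_expert: discriminator outputs on expert (s, a) pairs, in (0, 1).
    d_agent:  discriminator outputs on agent  (s, a) pairs, in (0, 1).
    """
    return np.log(2.0) + 0.5 * (np.mean(np.log(d_expert + eps)) +
                                np.mean(np.log(1.0 - d_agent + eps)))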
def sac_combined( env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, dont_save=True, logger_kwargs=dict(), update_multiplier=1, hidden_activation_setting='relu', regularization_weight=1e-3, PER_alpha=0.6, PER_beta_start=0.6, use_value_policy_weight=False, update_order='old_first', eta_0=0.994, eta_final=1.0, c_min=5000, ): """ Largely following OpenAI documentation But slightly different from tensorflow implementation Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. """ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("running on device:", device) """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) total_steps = steps_per_epoch * epochs env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! 
# we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer with PER proportional priority scheme replay_buffer = PrioritizedReplayMemory(replay_size, alpha=PER_alpha, beta_start=PER_beta_start, beta_frames=total_steps) def test_agent(n=5): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 hidden_activation_dictionary = { 'relu': F.relu, 'leaky_relu': F.leaky_relu, 'selu': F.selu } hidden_activation = hidden_activation_dictionary[hidden_activation_setting] """init all networks""" # see line 1 policy_net = TanhGaussianPolicy( obs_dim, act_dim, hidden_sizes, action_limit=act_limit, hidden_activation=hidden_activation).to(device) value_net = Mlp(obs_dim, 1, hidden_sizes, hidden_activation=hidden_activation).to(device) target_value_net = Mlp(obs_dim, 1, hidden_sizes, hidden_activation=hidden_activation).to(device) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes, hidden_activation=hidden_activation).to(device) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes, hidden_activation=hidden_activation).to(device) # see line 2: copy parameters from value_net to target_value_net target_value_net.load_state_dict(value_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) value_optimizer = optim.Adam(value_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # mse_criterion_no_reduction = nn.MSELoss(reduction='none') mse_criterion_no_reduction = nn.MSELoss(reduce=False) # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = policy_net.get_env_action(o, deterministic=False) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) # when env terminates because of time limit, d is False d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer data = [o, a, r, o2, d] replay_buffer.push(data) # replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. 
This is a slight difference from the SAC specified in the original paper. Quoted from the original SAC paper: 'In practice, we take a single environment step followed by one or several gradient step' after a single environment step, the number of gradient steps is 1 for SAC. (see paper for reference) """ eta_current = compute_current_eta(eta_0, eta_final, t, total_steps) num_updates = ep_len ck_list = get_ck_list_exp(replay_size, num_updates, eta_current, update_order) for k in range(num_updates): c_k = ck_list[k] if c_k < c_min: c_k = c_min # get data from replay buffer obs_tensor, acts_tensor, rews_tensor, obs_next_tensor, done_tensor, batch_idxs, batch_weights \ = replay_buffer.get_minibatch(batch_size, c_k) obs_tensor = obs_tensor.to(device) acts_tensor = acts_tensor.to(device) rews_tensor = rews_tensor.to(device) obs_next_tensor = obs_next_tensor.to(device) done_tensor = done_tensor.to(device) batch_weights = Tensor(batch_weights).reshape(batch_size, 1).to(device) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document psudocode part for reference line nubmers indicate lines in psudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) """get q loss""" # see line 12: first equation v_from_target_v_net = target_value_net( obs_next_tensor).detach() y_q = rews_tensor + gamma * (1 - done_tensor) * v_from_target_v_net # see line 13: compute loss for the 2 q networks, note that we want to detach the y_q value # since we only want to update q networks here, and don't want other gradients # loss value of each data point is multiplied by the importance sampling weight of that data point q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = ( mse_criterion_no_reduction(q1_prediction, y_q.detach()) * batch_weights).mean() q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = ( mse_criterion_no_reduction(q2_prediction, y_q.detach()) * batch_weights).mean() """ compute absolute TD error """ ## here we compute absolute TD error to be the mean of abs TD error of 2 q networks abs_td = ((q1_prediction.detach() - y_q.detach()).abs() + (q2_prediction.detach() - y_q.detach()).abs()) / 2 """get v and policy loss""" # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) min_q1_q2_a_tilda = torch.min( torch.cat([q1_a_tilda, q2_a_tilda], 1), 1)[0].reshape(-1, 1) y_v = min_q1_q2_a_tilda - alpha * log_prob_a_tilda # see line 14: compute loss for value network v_prediction = value_net(obs_tensor) if not use_value_policy_weight: # same as vanilla v_loss = mse_criterion(v_prediction, y_v.detach()) policy_loss = -(q1_a_tilda - alpha * log_prob_a_tilda).mean() else: # with importance sampling weight v_loss = (mse_criterion_no_reduction( v_prediction, y_v.detach()) * batch_weights).mean() policy_loss = (-(q1_a_tilda - alpha * log_prob_a_tilda) * batch_weights).mean() """ add policy regularization loss, this is not in openai's minimal version, but they are in the original sac code, see https://github.com/vitchyr/rlkit for reference this part is not necessary but might improve performance """ policy_mean_reg_weight = regularization_weight policy_std_reg_weight = regularization_weight mean_reg_loss = policy_mean_reg_weight * 
(mean_a_tilda** 2).mean() std_reg_loss = policy_std_reg_weight * (log_std_a_tilda** 2).mean() policy_loss = policy_loss + mean_reg_loss + std_reg_loss """update networks""" q1_optimizer.zero_grad() q1_loss.backward() q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() q2_optimizer.step() value_optimizer.zero_grad() v_loss.backward() value_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(target_value_net, value_net, polyak) """ Here we can do the priority updates, use the average absolute TD error from 2 q networks """ abs_td = abs_td.reshape(-1).cpu().numpy() replay_buffer.update_priorities(batch_idxs, abs_td.tolist()) # store diagnostic info to logger logger.store(LossPi=policy_loss.cpu().item(), LossQ1=q1_loss.cpu().item(), LossQ2=q2_loss.cpu().item(), LossV=v_loss.cpu().item(), Q1Vals=q1_prediction.detach().cpu().numpy(), Q2Vals=q2_prediction.detach().cpu().numpy(), VVals=v_prediction.detach().cpu().numpy(), LogPi=log_prob_a_tilda.detach().cpu().numpy()) ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if not dont_save: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'value_net': value_net.state_dict(), 'target_value_net': target_value_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'policy_opt': policy_optimizer, 'value_opt': value_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
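# compute_current_eta() and get_ck_list_exp() drive the "emphasizing recent
# experience" sampling above but are not defined in this excerpt. The sketch
# below follows the ERE scheme (anneal eta over training, then shrink the
# sampled range c_k so later updates draw from more recent data); the exact
# formulas used by the author may differ, hence the _sketch suffix.
def compute_current_eta_sketch(eta_0, eta_final, t, total_steps):
    """Linearly anneal eta from eta_0 toward eta_final over the whole run."""
    return eta_0 + (eta_final - eta_0) * (t / float(total_steps))


def get_ck_list_exp_sketch(replay_size, num_updates, eta, update_order='old_first'):
    """c_k = N * eta^(k * 1000 / K): the k-th update samples from the c_k most recent points."""
    ck_list = [int(replay_size * eta ** (k * 1000.0 / num_updates))
               for k in range(num_updates)]
    if update_order == 'new_first':
        ck_list = ck_list[::-1]
    return ck_list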
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o2, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
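# The min_adv / tf.where construction in the PPO objective above is an
# equivalent rewriting of the textbook clipped surrogate. The numpy sketch
# below evaluates both forms on concrete arrays so the equivalence can be
# checked directly; it is a reference for inspection, not part of the
# training graph.
import numpy as np


def ppo_clip_loss_two_ways(logp_new, logp_old, adv, clip_ratio=0.2):
    """Return the PPO-clip policy loss, computed two equivalent ways."""
    ratio = np.exp(logp_new - logp_old)  # pi_new(a|s) / pi_old(a|s)
    # Form 1: the tf.where trick used in the graph above
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    loss_where = -np.mean(np.minimum(ratio * adv, min_adv))
    # Form 2: the textbook min(r * A, clip(r, 1 - eps, 1 + eps) * A)
    loss_clip = -np.mean(np.minimum(ratio * adv,
                                    np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv))
    assert np.allclose(loss_where, loss_clip)
    return loss_where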
def sddpg2( env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, # TODO: change it back to 5000 steps_per_epoch=1000, #steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, # TODO: change it back to 10000 start_steps=1000, #start_steps=10000, reward_scale=5, act_noise=0.1, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) x_ph, \ a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # import pdb; pdb.set_trace() # Main outputs from computation graph with tf.variable_scope('main'): pi, pi_mu, pi_sigma, pi_rho, pi_cov, pi_corr_iter_number, q, q_pi = actor_critic( x_ph, a_ph, **ac_kwargs) # pi, q, q_mu, q_sigma, q_pi, q_pi_mu, q_pi_sigma = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, pi_mu_targ, pi_sigma_targ, pi_rho_targ, pi_cov_targ, _, _, q_pi_targ = actor_critic( x2_ph, a_ph, **ac_kwargs) # pi_targ, _, _, _, q_pi_targ, q_pi_mu_targ, q_pi_sigma_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, logger_fname='experiences_log.txt', **logger_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses # TODO: add term to penalize large variance, give penalize term cofficient # # pi_loss = tf.reduce_mean(-q_pi + # (1/act_dim) * tf.norm(pi_alpha,ord=2,axis=1) + # 1/(act_dim*(act_dim-1)/2) * tf.norm(pi_beta,ord=1,axis=1)) pi_loss = tf.reduce_mean(-q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) writer = tf.summary.FileWriter( osp.join(logger_kwargs['output_dir'], 'graph'), sess.graph) writer.flush() # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi_mu': pi_mu, 'pi_sigma': pi_sigma, 'pi_beta': pi_rho, 'q': q }) def get_action(o): # import pdb; pdb.set_trace() # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # a, a_mu, a_alpha, a_beta, a_cov = sess.run([pi, pi_mu, pi_sigma, pi_rho, pi_cov], # feed_dict={x_ph: o.reshape(1,-1)}, # options=options, run_metadata=run_metadata) # # Create the Timeline object, and write it to a json file # fetched_timeline = timeline.Timeline(run_metadata.step_stats) # chrome_trace = fetched_timeline.generate_chrome_trace_format() # with open('timeline_01.json', 'w') as f: # f.write(chrome_trace) a, a_mu, a_alpha, a_beta, a_cov, a_corr_iter_number = sess.run( [pi, pi_mu, pi_sigma, pi_rho, pi_cov, pi_corr_iter_number], feed_dict={x_ph: o.reshape(1, -1)}) a, a_mu, a_alpha, 
a_beta, a_cov = a[0], a_mu[0], a_alpha[0], a_beta[ 0], a_cov[0] return a, a_mu, a_alpha, a_beta, a_cov def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) a, a_mu, a_alpha, a_beta, a_cov = get_action(o) o, r, d, _ = test_env.step(a) # o, r, d, _ = test_env.step(a_mu) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a, a_mu, a_alpha, a_beta, a_cov = get_action(o) # import pdb; pdb.set_trace() else: a = env.action_space.sample() a_mu = a a_alpha = np.zeros((act_dim, )) a_beta = np.zeros((int(act_dim * (act_dim - 1) / 2), )) a_cov = np.identity(act_dim) # Step the env env.render() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, a_mu, a_alpha, a_beta, reward_scale * r, o2, d, t, steps_per_epoch, start_time) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ train_start = time.time() for j in range(ep_len): if j % 100 == 0: train_end = time.time() print('training step={}, cost_time={}'.format( j, train_end - train_start)) train_start = train_end batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # import pdb; pdb.set_trace() # # outs = sess.run([pi_mu, pi_sigma, pi_rho], feed_dict) # Q-learning update outs = sess.run([q_loss, q, train_q_op, pi_corr_iter_number], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) if outs[0] > 10000: print('q_loss={}'.format(outs[0])) # import pdb; # pdb.set_trace() # Policy update if j % policy_delay == 0: # Delayed policy update outs = sess.run([ pi_loss, train_pi_op, target_update, pi_corr_iter_number ], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if t % 100 == 0: print('step={}'.format(t)) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
# TODO: change test number test_agent(2) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
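# --- Illustrative sketch (added for exposition, not part of the original code) ---
# The graph built above reduces to two small recipes: the stop-gradient Bellman target
# r + gamma * (1 - d) * Q_targ(s', pi_targ(s')) and the polyak-averaged target update
# theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main. A minimal NumPy
# version of both, on made-up numbers:
import numpy as np

def bellman_target(r, d, q_pi_targ, gamma=0.99):
    # Bootstrap from the target network only when the transition is not terminal.
    return r + gamma * (1.0 - d) * q_pi_targ

def polyak_update(theta_targ, theta_main, polyak=0.995):
    # Slowly track the main parameters instead of copying them outright.
    return polyak * theta_targ + (1.0 - polyak) * theta_main

assert np.isclose(bellman_target(r=1.0, d=0.0, q_pi_targ=5.0), 5.95)
assert np.allclose(polyak_update(np.ones(3), np.zeros(3)), 0.995)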
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, beta=0.1, k=5, lamb=1, privacy=False, teacher=False, teacher_directory=None): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. print("Privacy protection: ", privacy) setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #Load the teacher policy if teacher == True: teacher_policy = load_teacher(fpath=teacher_directory) def Dirichlet_mechanism(p, k): return Categorical(Dirichlet(k * p).sample()) def Phi(p, q, k, beta, lamb): alpha = lamb * np.sqrt(np.log(1.0 / beta) / (2 * (k + 1))) diff = torch.norm(p - q, p=2, dim=1) if diff.mean() > alpha: # print("\n \n Teacher activated with alpha = ", alpha) # print("\n \n Policy differnce = ", diff.mean()) return diff else: # print("\n \n Teacher NOT activated with alpha = ", alpha) # print("\n \n Policy differnce = ", diff.mean()) return diff * 0 # Set up function for computing PPO policy loss def compute_loss_pi(data, k, beta, lamb, privacy): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv if teacher == False: loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() elif privacy == True: loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() + Phi( pi.probs, Dirichlet_mechanism(teacher_policy(obs).probs, k).probs, k, beta, lamb).mean() elif privacy == False: loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() + Phi( pi.probs, teacher_policy(obs).probs, k, beta, 0).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(k, beta, lamb, privacy): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data, k, beta, lamb, privacy) pi_l_old = 
pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data, k, beta, lamb, privacy) kl = mpi_avg(pi_info['kl']) if privacy == False: if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update(k, beta, lamb, privacy) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
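# --- Illustrative sketch (added for exposition, not part of the original code) ---
# Dirichlet_mechanism above privatizes the teacher's action distribution by sampling a
# nearby probability vector from Dirichlet(k * p): a larger k concentrates the draw
# around the teacher's probabilities, a smaller k perturbs them more. A standalone
# example with a made-up 3-action teacher policy:
import torch
from torch.distributions import Dirichlet, Categorical

def dirichlet_mechanism(p, k):
    # Build a Categorical from one noisy draw around the teacher probabilities p.
    return Categorical(Dirichlet(k * p).sample())

teacher_probs = torch.tensor([[0.7, 0.2, 0.1]])   # illustrative teacher policy, batch of 1
noisy = dirichlet_mechanism(teacher_probs, k=5)
print(noisy.probs)  # perturbed distribution, still sums to 1 along the last axis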
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, explorer=None, eps=0.0, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 pretrain_steps = steps_per_epoch * pretrain_epochs total_epochs = epochs + pretrain_epochs total_steps = steps_per_epoch * total_epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. 
Only use this exploration if you aren't pre-training with the MaxEnt agent. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) elif pretrain_steps == 0: # only explore if not pretraining with MaxEnt a = env.action_space.sample() # use MaxEnt exploration if you are in a pretrain epoch or if eps-greedy pre = t < pretrain_steps during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
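# --- Illustrative sketch (added for exposition, not part of the original code) ---
# get_action above follows the standard DDPG exploration rule: take the deterministic
# policy action, add zero-mean Gaussian noise scaled by act_noise, and clip the result
# back into the valid action box. The same rule in isolation, on made-up values:
import numpy as np

def noisy_action(pi_action, act_limit, noise_scale):
    # Gaussian exploration noise followed by clipping to [-act_limit, act_limit].
    a = pi_action + noise_scale * np.random.randn(*pi_action.shape)
    return np.clip(a, -act_limit, act_limit)

a = noisy_action(np.zeros(2), act_limit=1.0, noise_scale=0.1)
assert a.shape == (2,) and np.all(np.abs(a) <= 1.0)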
def sac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=1000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2 = actor_critic(x_ph, a_ph, **ac_kwargs) with tf.variable_scope('main', reuse=True): # compose q with pi, for pi-learning _, _, _, q1_pi, q2_pi = actor_critic(x_ph, pi, **ac_kwargs) # get actions and log probs of actions for next states, for Q-learning _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): # target q values, using actions from *current* policy _, _, _, q1_targ, q2_targ = actor_critic(x2_ph, pi_next, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_targ = tf.minimum(q1_targ, q2_targ) # Entropy-regularized Bellman backup for Q functions, using Clipped Double-Q targets q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * (min_q_targ - alpha * logp_pi_next)) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, ep_ret, 
ep_len = env.reset(), 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) # Stochastic policy for learning/training else: a = env.action_space.sample( ) # Uniform-random actions for the first start_steps (default 10k) interactions # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Don't treat hitting max_ep_len as a true terminal state # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling: env is done or max_ep_len reached if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling (Learning/Training) if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) # step_ops[0:6] = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi] logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5]) # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model after each epoch: if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent # at the end of the epoch: test_agent() # Log info about epoch: logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
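# --- Illustrative sketch (added for exposition, not part of the original code) ---
# The SAC backup above combines clipped double-Q with an entropy bonus: the next-state
# value is min(Q1_targ, Q2_targ) minus alpha times the log-prob of the sampled next
# action. A scalar NumPy version with made-up inputs:
import numpy as np

def sac_q_target(r, d, q1_targ, q2_targ, logp_next, alpha=0.2, gamma=0.99):
    # Entropy-regularized next-state value, bootstrapped only for non-terminal steps.
    next_value = np.minimum(q1_targ, q2_targ) - alpha * logp_next
    return r + gamma * (1.0 - d) * next_value

print(sac_q_target(r=1.0, d=0.0, q1_targ=5.0, q2_targ=4.5, logp_next=-1.2))  # ~5.69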
def sac1(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99, reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=200, start_steps=10000, max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" if not args.is_test: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(3), env_fn(1) # for gym env obs_dim = env.observation_space.shape[0] o_shape = env.observation_space.shape # for google football # scenario_obsdim = {'academy_empty_goal':32, 'academy_empty_goal_random':32, 'academy_3_vs_1_with_keeper':44, 'academy_3_vs_1_with_keeper_random':44, 'academy_single_goal_versus_lazy':108} # scenario_obsdim['academy_single_goal_versus_lazy'] = 108 # scenario_obsdim['academy_single_goal_versus_lazy_random'] = 108 # scenario_obsdim['11_vs_11_stochastic']= 108 # scenario_obsdim['11_vs_11_stochastic_random'] = 108 # obs_dim = scenario_obsdim[args.env] # obs_space = Box(low=-1.0, high=1.0, shape=(obs_dim,), dtype=np.float32) # o_shape = obs_space.shape # act_dim = env.action_space.n act_dim = env.action_space.shape[0] act_space = env.action_space # Discrete(21) for gfootball a_shape = act_space.shape # () # Action limit for clamping: critically, assumes all dimensions share the same bound! # act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(o_shape, a_shape, o_shape, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic( x_ph, x2_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic( x2_ph, x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_shape=o_shape, act_shape=a_shape, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if alpha == 'auto': target_entropy = (-np.prod(env.action_space.shape)) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2) q_backup = r_ph + gamma * (1 - d_ph) * v_backup # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): 
target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), train_pi_op, train_value_op, target_update ] else: step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update, train_alpha_op ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) ############################## save and restore ############################ saver = tf.train.Saver() checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints' if not os.path.exists(checkpoint_path): os.makedirs(checkpoint_path) if args.is_test or args.is_restore_train: ckpt = tf.train.get_checkpoint_state(checkpoint_path) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) print("Model restored.") def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] ############################## test ############################ if args.is_test: test_env = gym.make(args.env) ave_ep_ret = 0 for j in range(10000): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not d: # (d or (ep_len == 2000)): o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 if args.test_render: test_env.render() ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1) print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret, '({}/10000)'.format(j + 1)) return ############################## train ############################ def test_agent(n=25): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len_test)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 # test_env.render() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs ep_index = 0 test_ep_ret_best = test_ep_ret = -10000.0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) # d = False if ep_len==max_ep_len_train else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len_train): ep_index += 1 print('episode: {}, reward: {}'.format(ep_index, ep_ret / reward_scale)) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], Alpha=outs[6]) logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if epoch < 1000: test_agent(25) # test_ep_ret = logger.get_stats('TestEpRet')[0] # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best) else: test_agent(25) test_ep_ret = logger.get_stats('TestEpRet')[0] # logger.epoch_dict['TestEpRet'] = [] print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best) # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('Num_Ep', ep_index) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=False) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Save model if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best: save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t) print("Model saved in path: %s" % save_path) test_ep_ret_best = test_ep_ret
def bc_reg_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5", buffer_seed=0, buffer_size='1000K', cut_buffer_size='1000K', eval_freq=float(1e3), max_timesteps=float(1e6), lr=1e-3, wd=0, logger_kwargs=dict()): device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("running on device:", device) """set up logger""" global logger logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) file_name = "BCue_%s_%s" % (env_set, seed) buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed) print ("---------------------------------------") print ("Task: " + file_name) print ("---------------------------------------") if not os.path.exists("./results"): os.makedirs("./results") env = gym.make(env_set) test_env = gym.make(env_set) # Set seeds env.seed(seed) test_env.seed(seed) env.action_space.np_random.seed(seed) test_env.action_space.np_random.seed(seed) torch.manual_seed(seed) np.random.seed(seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy policy = BC_reg.BC_reg(state_dim, action_dim, max_action, lr=lr, wd=wd) # Load buffer replay_buffer = utils.ReplayBuffer() replay_buffer.load(buffer_name + '_' + buffer_size) if buffer_size != cut_buffer_size: replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3) print(replay_buffer.get_length()) print('buffer setting:', buffer_name + '_' + cut_buffer_size) episode_num = 0 done = True training_iters, epoch = 0, 0 while training_iters < max_timesteps: epoch += 1 pol_vals = policy.train(replay_buffer, iterations=int(eval_freq), logger=logger) avgtest_reward = evaluate_policy(policy, test_env) training_iters += eval_freq logger.log_tabular('Epoch', epoch) logger.log_tabular('AverageTestEpRet', avgtest_reward) logger.log_tabular('TotalSteps', training_iters) logger.log_tabular('Loss', with_min_and_max=True) logger.dump_tabular()
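# --- Illustrative sketch (added for exposition, not part of the original code) ---
# BC_reg.train() is not shown here; a plain behavioral-cloning step of the kind it
# presumably performs regresses an actor onto logged (state, action) pairs, with lr and
# weight decay passed to the optimizer. The network shape and batch below are assumptions
# (Hopper-v2 sizes), not BC_reg's actual internals:
import torch
import torch.nn as nn

actor = nn.Sequential(nn.Linear(11, 64), nn.ReLU(), nn.Linear(64, 3), nn.Tanh())
opt = torch.optim.Adam(actor.parameters(), lr=1e-3, weight_decay=0.0)

states = torch.randn(100, 11)   # illustrative batch of logged states
actions = torch.randn(100, 3)   # illustrative batch of logged actions

bc_loss = ((actor(states) - actions) ** 2).mean()  # mean-squared imitation error
opt.zero_grad(); bc_loss.backward(); opt.step()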
def vpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The agent's main model which is composed of the policy and value function model, where the policy takes some state, ``x``, and action, ``a``, and value function takes the state ``x`` and returns a tuple of: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure | to flatten this via .item()!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic class you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Main model actor_critic = actor_critic(in_features=obs_dim[0], **ac_kwargs) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple( core.count_vars(module) for module in [actor_critic.policy, actor_critic.value_function]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Optimizers train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) train_v = torch.optim.Adam(actor_critic.value_function.parameters(), lr=vf_lr) # Sync params across processes sync_all_params(actor_critic.state_dict()) def update(): obs, act, adv, ret, logp_old = [torch.Tensor(x) for x in buf.get()] # Policy gradient step _, logp, _ = actor_critic.policy(obs, act) ent = (-logp).mean() # a sample estimate for entropy # VPG policy objective pi_loss = -(logp * adv).mean() # Policy gradient step train_pi.zero_grad() pi_loss.backward() average_gradients(train_pi.param_groups) train_pi.step() # Value function learning v = actor_critic.value_function(obs) v_l_old = F.mse_loss(v, ret) for _ in range(train_v_iters): # Output from value function graph v = actor_critic.value_function(obs) # VPG value objective v_loss = F.mse_loss(v, ret) # Value function gradient step train_v.zero_grad() v_loss.backward() average_gradients(train_v.param_groups) train_v.step() # Log changes from update _, logp, _, v = actor_critic(obs, act) pi_l_new = -(logp * adv).mean() v_l_new = F.mse_loss(v, ret) kl = (logp_old - logp).mean() # a sample estimate for KL-divergence logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_loss), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): actor_critic.eval() for t in range(local_steps_per_epoch): a, _, logp_t, v_t = actor_critic(torch.Tensor(o.reshape(1, -1))) # save and log buf.store(o, a.data.numpy(), r, v_t.item(), logp_t.data.numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.data.numpy()[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else actor_critic.value_function( torch.Tensor(o.reshape(1, -1))).item() buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, actor_critic, None) # Perform VPG update! 
actor_critic.train() update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
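# --- Illustrative sketch (added for exposition, not part of the original code) ---
# The advantages consumed by the VPG loss above come from GAE-lambda inside VPGBuffer
# (not shown here). A standalone NumPy version of that computation over one finished
# trajectory, where vals carries one extra bootstrap entry as in finish_path():
import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    deltas = rews + gamma * vals[1:] - vals[:-1]   # one-step TD errors
    adv = np.zeros_like(rews)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

print(gae_advantages(np.array([1.0, 1.0, 1.0]), np.array([0.5, 0.4, 0.3, 0.0])))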
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. 
logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] # Q-values ####################### # # # YOUR CODE HERE # # # ####################### # q1 = # q2 = # Target policy smoothing ####################### # # # YOUR CODE HERE # # # ####################### # Target Q-values ####################### # # # YOUR CODE HERE # # # ####################### # MSE loss against Bellman backup ####################### # # # YOUR CODE HERE # # # ####################### # loss_q1 = # loss_q2 = # loss_q = # Useful info for logging loss_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): ####################### # # # YOUR CODE HERE # # # ####################### # loss_pi = return loss_pi #=========================================================================# # # # All of your code goes in the space above. # # # #=========================================================================# # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(q_params, lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. 
for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
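# --- Illustrative sketch (added for exposition, not part of the original code) ---
# compute_loss_q above is intentionally left blank; one ingredient it needs is TD3's
# target policy smoothing: add clipped Gaussian noise to the target action, then clip
# the result back into the action box. A minimal PyTorch version of just that step,
# on dummy tensors:
import torch

def smoothed_target_action(pi_targ_a, act_limit, target_noise=0.2, noise_clip=0.5):
    # Clipped noise keeps the smoothing local; the outer clamp keeps the action valid.
    eps = torch.clamp(target_noise * torch.randn_like(pi_targ_a), -noise_clip, noise_clip)
    return torch.clamp(pi_targ_a + eps, -act_limit, act_limit)

a2 = smoothed_target_action(torch.zeros(4, 2), act_limit=1.0)
assert a2.shape == (4, 2) and a2.abs().max() <= 1.0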
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, explorer=None, eps=0.05, pretrain_epochs=0): tf.reset_default_graph() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_epochs = epochs + pretrain_epochs # Main loop: collect experience in env and update/log each epoch for epoch in range(total_epochs): # TODO(abbyvs): deteriorate eps somehow. factor of .99? 
        # eps = eps*0.99
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # explore if you are in a pretrain epoch or if eps-greedy
            pre = epoch < pretrain_epochs
            during = random.random() < eps
            if pre or during:
                if explorer is None:
                    raise ValueError('Trying to explore but explorer is None')
                state = env.env.state_vector()
                a = explorer.sample_action(state)

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
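# buf.finish_path(last_val) above bootstraps the value target whenever a trajectory is
# cut off by the epoch boundary instead of ending at a true terminal state. A minimal
# numpy sketch of GAE-lambda advantages and discounted returns with such a bootstrap;
# the helper name and argument layout are hypothetical, shown only to make that step explicit.
import numpy as np

def gae_and_returns(rews, vals, last_val, gamma=0.99, lam=0.97):
    """rews: r_0..r_{T-1}; vals: V(s_0)..V(s_{T-1});
    last_val: 0 for a true terminal state, V(s_T) when the path was cut off."""
    rews = np.append(np.asarray(rews, dtype=np.float32), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float32), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    ret = np.zeros(len(rews) - 1, dtype=np.float32)
    running = rews[-1]  # the bootstrap value enters the return targets here
    for t in reversed(range(len(ret))):
        running = rews[t] + gamma * running
        ret[t] = running
    return adv, ret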
def sac1_rnn(args, env_fn, actor_critic=core.mlp_actor_critic, sac1_dynamic_rnn=core.sac1_dynamic_rnn, ac_kwargs=dict(), seed=0, Lb=10, Lt=10, hc_dim=128, steps_per_epoch=3000, epochs=100, replay_size=int(1e5), gamma=0.99, reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, h0=1.0, batch_size=150, start_steps=10000, max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn('train'), env_fn('test') obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space ###################################### # Inputs to computation graph # x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # # # Main outputs from computation graph # with tf.variable_scope('main'): # mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # # # Target value network # with tf.variable_scope('target'): # _, _, logp_pi_, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, a_ph, **ac_kwargs) # ###################################### obs_ph, hc_ph = core.placeholders((Lb + Lt + 1, obs_dim), (hc_dim, )) a_ph_all, r_ph_all, d_ph_all, data01_ph = core.placeholders( (Lb + Lt, act_dim), (Lb + Lt, ), (Lb + Lt, ), (Lb + Lt, )) obs_burn = obs_ph[:, :Lb] obs_train = obs_ph[:, Lb:] obs12_train = data01_ph[:, Lb:] # obs12_train = tf.transpose(obs12_train, perm=[1, 0]) a_ph = a_ph_all[:, Lb:] r_ph = r_ph_all[:, Lb:] d_ph = d_ph_all[:, Lb:] _, state_burn_in = sac1_dynamic_rnn(obs_burn, hc_ph) state_burn_in = tf.stop_gradient(state_burn_in) * data01_ph[:, 0][..., tf.newaxis] s_outputs, _ = sac1_dynamic_rnn(obs_train, state_burn_in) s_ph = s_outputs[:, :-1] s2_ph = s_outputs[:, 1:] logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = [None, ] * Lt, [None, ] * Lt, [None, ] * Lt, \ [None, ] * Lt, [None, ] * Lt, [None, ] * Lt logp_pi_, q1_pi_, q2_pi_ = [ None, ] * Lt, [ None, ] * Lt, [ None, ] * Lt for i in range(Lt): # Main outputs from computation graph with tf.variable_scope('main', reuse=tf.AUTO_REUSE): ###################################### _, _, logp_pi[i], logp_pi2[i], q1[i], q2[i], q1_pi[i], q2_pi[ i] = actor_critic(s_ph[:, i], s2_ph[:, i], a_ph[:, i], **ac_kwargs) # Target value network with tf.variable_scope('target', reuse=tf.AUTO_REUSE): _, _, logp_pi_[i], _, _, _, q1_pi_[i], q2_pi_[i] = actor_critic( s2_ph[:, i], s2_ph[:, i], a_ph[:, i], **ac_kwargs) logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = tf.stack(logp_pi, axis=1), tf.stack(logp_pi2, axis=1), \ tf.stack(q1, axis=1), tf.stack(q2, axis=1), tf.stack(q1_pi, axis=1), tf.stack(q2_pi, axis=1) logp_pi_, q1_pi_, q2_pi_ = tf.stack(logp_pi_, axis=1), tf.stack( q1_pi_, axis=1), tf.stack(q2_pi_, axis=1) ###################################### # Experience buffer # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) replay_buffer_rnn = ReplayBuffer_RNN(Lb=Lb, Lt=Lt, hc_dim=hc_dim, obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables # var_counts = tuple(core.count_vars(scope) for scope in # ['main/pi', 'main/q1', 'main/q2', 'rnn']) # print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t rnn: %d\n') % var_counts) # print('Number of parameters: \t Total: %d\n' % sum(var_counts)) ###### if alpha == 'auto': target_entropy = (-np.prod(env.action_space.shape)) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * h0, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi_ = tf.minimum(q1_pi_, q2_pi_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi_ - alpha * logp_pi2) q_backup = r_ph + gamma * (1 - d_ph) * v_backup # Soft actor-critic losses pi_loss = tf.reduce_mean(obs12_train * (alpha * logp_pi - q1_pi)) q1_loss 
= 0.5 * tf.reduce_mean(obs12_train * (q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean(obs12_train * (q_backup - q2)**2) value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) pi_params = get_vars('main/pi') train_pi_op = pi_optimizer.minimize(pi_loss, var_list=pi_params) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('rnn') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), train_pi_op, train_value_op, target_update ] else: step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update, train_alpha_op ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Inputs to computation graph x_ph_geta, hc_ph_geta, a_ph_geta = core.placeholders((1, obs_dim), hc_dim, act_dim) s_geta, hc_geta = sac1_dynamic_rnn(x_ph_geta, hc_ph_geta) # Main outputs from computation graph with tf.variable_scope('main', reuse=tf.AUTO_REUSE): mu, pi, _, _, _, _, _, _ = actor_critic(s_geta[:, 0], s_geta[:, 0], a_ph_geta, **ac_kwargs) # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, # outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) saver = tf.train.Saver() checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints' if not os.path.exists(checkpoint_path): os.makedirs(checkpoint_path) if args.is_test or args.is_restore_train: ckpt = tf.train.get_checkpoint_state(checkpoint_path) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) print("Model restored.") # def get_action(o, deterministic=False): # act_op = mu if deterministic else pi # return sess.run(act_op, feed_dict={x_ph_geta: o.reshape(1, -1)})[0]#[0] def get_action(o, hc_0, deterministic=False): """s_t_0_ starting step for testing 1 H""" act_op = mu if deterministic else pi action, hc_1 = sess.run([act_op, hc_geta], feed_dict={ x_ph_geta: o.reshape(1, 1, obs_dim), hc_ph_geta: hc_0 }) return action[0], hc_1 ############################## test ############################ if args.is_test: test_env = gym.make(args.env) ave_ep_ret = 0 for j in range(10000): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not d: # (d or (ep_len == 2000)): o, r, d, _ = test_env.step(get_action(o)) ep_ret += r ep_len += 1 if args.test_render: test_env.render() ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1) print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret, '({}/10000)'.format(j + 1)) return ############################## train ############################ def test_agent(n=5): # print('test') global sess, mu, pi, q1, q2, 
q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 hc_run_test = np.zeros(( 1, hc_dim, ), dtype=np.float32) while not (d or (ep_len == max_ep_len_test)): # Take deterministic actions at test time a_test, hc_run_test = get_action(o, hc_run_test, True) o, r, d, _ = test_env.step(a_test) ep_ret += r ep_len += 1 # test_env.render() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) ################################## deques obs_hc_queue = deque([], maxlen=Lb + Lt + 1) a_r_d_data01_queue = deque([], maxlen=Lb + Lt) ################################## deques start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ################################## deques reset t_queue = 1 hc_run = np.zeros(( 1, hc_dim, ), dtype=np.float32) for _i in range(Lb): obs_hc_queue.append((np.zeros((obs_dim, ), dtype=np.float32), np.zeros((hc_dim, ), dtype=np.float32))) a_r_d_data01_queue.append((np.zeros( (act_dim, ), dtype=np.float32), 0.0, False, False)) obs_hc_queue.append((o, hc_run[0])) ################################## deques reset total_steps = steps_per_epoch * epochs # test_ep_ret = test_ep_ret_1 = -10000.0 test_ep_ret_best = test_ep_ret = -10000.0 # Main loop: collect experience in env and update/log each epoch start = time.time() for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a, hc_run = get_action(o, hc_run) else: _, hc_run = get_action(o, hc_run) a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) # d = False if ep_len==max_ep_len_train else d # Store experience to replay buffer # replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 #################################### deques store a_r_d_data01_queue.append((a, r, d, True)) obs_hc_queue.append((o2, hc_run[0])) if t_queue % Lt == 0: replay_buffer_rnn.store(obs_hc_queue, a_r_d_data01_queue) if (d or (ep_len == max_ep_len_train)) and t_queue % Lt != 0: for _0 in range(Lt - t_queue % Lt): a_r_d_data01_queue.append((np.zeros( (act_dim, ), dtype=np.float32), 0.0, False, False)) obs_hc_queue.append((np.zeros((obs_dim, ), dtype=np.float32), np.zeros((hc_dim, ), dtype=np.float32))) replay_buffer_rnn.store(obs_hc_queue, a_r_d_data01_queue) t_queue += 1 #################################### deques store # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len_train): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer_rnn.sample_batch(batch_size) feed_dict = { obs_ph: batch['obs'], hc_ph: batch['hc'], a_ph_all: batch['acts'], r_ph_all: batch['rews'], d_ph_all: batch['done'], data01_ph: batch['data01'] } # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3][:, 0], Q2Vals=outs[4][:, 0], LogPi=outs[5][:, 0], Alpha=outs[6]) logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len) print("ep_len", ep_len, "time", time.time() - start) start = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ################################## deques reset t_queue = 1 hc_run = np.zeros(( 1, hc_dim, ), dtype=np.float32) for _i in range(Lb): obs_hc_queue.append((np.zeros((obs_dim, ), dtype=np.float32), np.zeros((hc_dim, ), dtype=np.float32))) a_r_d_data01_queue.append((np.zeros( (act_dim, ), dtype=np.float32), 0.0, False, False)) obs_hc_queue.append((o, hc_run[0])) ################################## deques reset # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if epoch < 2000: test_agent(25) # test_ep_ret = logger.get_stats('TestEpRet')[0] # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best) else: test_agent(25) test_ep_ret = logger.get_stats('TestEpRet')[0] # logger.epoch_dict['TestEpRet'] = [] print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best) # test_agent(25) # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('Name', name) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) # test_ep_ret_1 = logger.get_stats('TestEpRet')[0] logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=False) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Save model if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best: save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t) print("Model saved in path: %s" % save_path) test_ep_ret_best = test_ep_ret
def __init__(self, observation_space, action_space, q_type, train_mode, model_path, model_name, batch_size=301, market_price_dim=10, om=False, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, logger_kwargs=dict(), save_freq=1): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.market_price_dim = market_price_dim # self.batch_size = batch_size tf.set_random_seed(seed) np.random.seed(seed) self.obs_dim = observation_space.shape[0] self.act_dim = action_space.shape[0] self.om = om self.o_dim = action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = 1 # Share information about action space with policy architecture ac_kwargs['action_space'] = action_space # Inputs to computation graph self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders( self.obs_dim, self.act_dim, self.obs_dim, None, None) self.x_m_ph, self.x2_m_ph = None, None self.a_op = None self.model_path = model_path if self.om: self.x_m_ph, self.x2_m_ph, self.a_op = core.placeholders( self.market_price_dim, self.market_price_dim, self.market_price_dim) self.q_type = q_type self.pi, self.q, self.q_pi, self.q_pi_targ = \ actor_critic(self.x_ph, self.a_ph, self.x2_ph, self.om, self.a_op, self.x_m_ph, self.x2_m_ph, self.q_type, **ac_kwargs) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size, market_dim=self.market_price_dim) # Action Noise self.ou_noise = NNoise(action_dimension=self.act_dim) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * self.q_pi_targ) # DDPG losses self.pi_loss = -tf.reduce_mean(self.q_pi) self.q_loss = tf.reduce_mean((self.q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi')) self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables self.target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.tf_phs = { 'x_ph': self.x_ph, 'a_ph': self.a_ph, 'x2_ph': self.x2_ph, 'r_ph': self.r_ph, 'd_ph': self.d_ph } if self.om: self.tf_phs.update({ 'x_m_ph': self.x_m_ph, 'x2_m_ph': self.x2_m_ph, 'a_op': self.a_op }) self.tf_outputs = { 'pi': self.pi, 'q': self.q, 'q_pi': self.q_pi, 'q_pi_targ': self.q_pi_targ } self.logger.setup_tf_saver(self.sess, inputs=self.tf_phs, outputs=self.tf_outputs) self.sess.run(target_init) self.saver = tf.train.Saver() self.train_mode = train_mode self.model_name = model_name if not self.train_mode: self.saver = tf.train.import_meta_graph(self.model_name + '.meta') self.saver.restore(self.sess, tf.train.latest_checkpoint(self.model_path))
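# The agent above draws exploration noise from an Ornstein-Uhlenbeck process
# (self.ou_noise). A minimal numpy version of such a mean-reverting, temporally
# correlated process; the class name and parameter defaults here are hypothetical and
# independent of the NNoise implementation used above.
import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dimension) * mu

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dimension)
        self.state = self.state + dx
        return self.state

# usage: noise = OrnsteinUhlenbeckNoise(act_dim); a = np.clip(pi_a + noise.sample(), -1, 1)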
def ddpg_her(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, num_additional_goals=1, goal_selection_strategy='final'): """ Deep Deterministic Policy Gradient (DDPG) with Hindsight Experience Repley (HER) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the GoalEnv OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. num_additional_goals (int): Number of additional HER goals for replay. goal_selection_strategy (final, future, episode, random): Goal selection method for HER goal generation. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.spaces["observation"].shape act_dim = env.action_space.shape[0] goal_dim = env.observation_space.spaces["desired_goal"].shape # The space of an observation concatenated with a goal low_val = np.concatenate([ env.observation_space.spaces["observation"].low, env.observation_space.spaces["desired_goal"].low]) high_val = np.concatenate([ env.observation_space.spaces["observation"].high, env.observation_space.spaces["desired_goal"].high]) og_space = gym.spaces.Box(low_val, high_val, dtype=np.float32) # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(og_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, goal_dim=goal_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): ac.eval() a = ac.act(torch.as_tensor(o, dtype=torch.float32)) ac.train() a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o_dict, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 o = o_dict["observation"] g = o_dict["desired_goal"] while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) og = np.concatenate([o, g], axis=-1) o_dict, r, d, _ = test_env.step(get_action(og, 0)) o = o_dict["observation"] g = o_dict["desired_goal"] ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def synthesize_experience(env, ep_start_ptr, ep_len, replay_buffer, num=1, selection_strategy='final'): ep = replay_buffer.get_episode(ep_start_ptr, ep_len) ep_end = len(ep['obs']) for idx in range(ep_end): obs = ep['obs'][idx] act = ep['act'][idx] obs2 = ep['obs2'][idx] agoal = ep['agoal'][idx] info = ep['info'][idx] for _ in range(num): if selection_strategy == 'final': sel_idx = -1 elif selection_strategy == 'future': # We cannot sample a goal from the future in the last step of an episode if idx == ep_end - 1: break sel_idx = np.random.choice(np.arange(idx + 1, ep_end)) elif selection_strategy == 'episode': sel_idx = np.random.choice(np.arange(ep_end)) else: raise ValueError(f"Unsupported selection_strategy: {selection_strategy}") sel_agoal = ep['agoal'][sel_idx] rew = env.compute_reward(agoal, sel_agoal, info) replay_buffer.store(obs, act, rew, obs2, False, sel_agoal, agoal, info) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o_dict, ep_ret, ep_len = env.reset(), 0, 0 ep_start_ptr = replay_buffer.ptr o = o_dict["observation"] dg = o_dict["desired_goal"] # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: # Concatenate observation with desired goal odg = np.concatenate([o, dg], axis=-1) a = get_action(odg, act_noise) else: a = env.action_space.sample() # Step the env o2_dict, r, d, i = env.step(a) ep_ret += r ep_len += 1 o2 = o2_dict["observation"] dg2 = o2_dict["desired_goal"] ag2 = o2_dict["achieved_goal"] # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d, dg, ag2, i) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 dg = dg2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) synthesize_experience(env, ep_start_ptr=ep_start_ptr, ep_len=ep_len, replay_buffer=replay_buffer, num=num_additional_goals, selection_strategy=goal_selection_strategy) ep_start_ptr = replay_buffer.ptr o_dict, ep_ret, ep_len = env.reset(), 0, 0 o = o_dict["observation"] dg = o_dict["desired_goal"] # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) og_batch = dict( obs=torch.as_tensor( np.concatenate([batch['obs'], batch['dgoal']], axis=-1), dtype=torch.float32), obs2=torch.as_tensor( np.concatenate([batch['obs2'], batch['dgoal']], axis=-1), dtype=torch.float32), act=torch.as_tensor(batch['act'], dtype=torch.float32), rew=torch.as_tensor(batch['rew'], dtype=torch.float32), done=torch.as_tensor(batch['done'], dtype=torch.float32) ) update(data=og_batch) # End of epoch handling if (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
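# synthesize_experience above relabels stored transitions with goals that were actually
# achieved later in the same episode (the 'future' strategy) and recomputes the sparse
# reward for the new goal. A compact pure-Python sketch of that relabeling step; the
# episode layout and the compute_reward callable are hypothetical stand-ins for the
# replay buffer's get_episode output and env.compute_reward.
import numpy as np

def her_relabel_future(episode, compute_reward, num_additional_goals=1):
    """episode: list of dicts with keys 'obs', 'act', 'obs2', 'agoal', 'info'."""
    relabeled = []
    T = len(episode)
    for idx, tr in enumerate(episode):
        for _ in range(num_additional_goals):
            if idx == T - 1:
                break  # no future step left to sample a goal from
            sel_idx = np.random.randint(idx + 1, T)
            new_goal = episode[sel_idx]['agoal']
            rew = compute_reward(tr['agoal'], new_goal, tr['info'])
            relabeled.append((tr['obs'], tr['act'], rew, tr['obs2'], False, new_goal))
    return relabeled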
def ppo(workload_file, model_path, ac_kwargs=dict(), seed=0, traj_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10,pre_trained=0,trained_model=None,attn=False,shuffle=False, backfil=False, skip=False, score_type=0, batch_job_slice=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = HPCEnv(shuffle=shuffle, backfil=backfil, skip=skip, job_score_type=score_type, batch_job_slice=batch_job_slice, build_sjf=False) env.seed(seed) env.my_init(workload_file=workload_file, sched_file=model_path) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['attn'] = attn # Inputs to computation graph buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE, gamma, lam) if pre_trained: sess = tf.Session() model = restore_tf_graph(sess, trained_model) logger.log('load pre-trained model') # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) x_ph = model['x'] a_ph = model['a'] mask_ph = model['mask'] adv_ph = model['adv'] ret_ph = model['ret'] logp_old_ph = model['logp_old_ph'] pi = model['pi'] v = model['v'] # logits = model['logits'] out = model['out'] logp = model['logp'] logp_pi = model['logp_pi'] pi_loss = model['pi_loss'] v_loss = model['v_loss'] approx_ent = model['approx_ent'] approx_kl = model['approx_kl'] clipfrac = model['clipfrac'] clipped = model['clipped'] # Optimizers #graph = tf.get_default_graph() #op = sess.graph.get_operations() #[print(m.values()) for m in op] #train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0') #train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0') train_pi = tf.get_collection("train_pi")[0] train_v = tf.get_collection("train_v")[0] # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad') # train_pi = train_pi_optimizer.minimize(pi_loss) # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad') # train_v = train_v_optimizer.minimize(v_loss) # sess.run(tf.variables_initializer(train_pi_optimizer.variables())) # sess.run(tf.variables_initializer(train_v_optimizer.variables())) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, out] else: x_ph, a_ph = placeholders_from_spaces(env.observation_space, env.action_space) # y_ph = placeholder(JOB_SEQUENCE_SIZE*3) # 3 is the number of sequence features mask_ph = placeholder(MAX_QUEUE_SIZE) adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, out] # Experience buffer # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 
+ clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = tf.train.AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) tf.add_to_collection("train_pi", train_pi) tf.add_to_collection("train_v", train_v) # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v}) logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a':a_ph, 'adv':adv_ph, 'mask':mask_ph, 'ret':ret_ph, 'logp_old_ph':logp_old_ph}, outputs={'pi': pi, 'v': v, 'out':out, 'pi_loss':pi_loss, 'logp': logp, 'logp_pi':logp_pi, 'v_loss':v_loss, 'approx_ent':approx_ent, 'approx_kl':approx_kl, 'clipped':clipped, 'clipfrac':clipfrac}) def update(): inputs = {k:v for k,v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1 = env.reset(), 0, False, 0, 0,0,0,0 # Main loop: collect experience in env and update/log each epoch start_time = time.time() num_total = 0 for epoch in range(epochs): t = 0 while True: lst = [] for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES): if all(o[i:i+JOB_FEATURES] == [0]+[1]*(JOB_FEATURES-2)+[0]): lst.append(0) elif all(o[i:i+JOB_FEATURES] == [1]*JOB_FEATURES): lst.append(0) else: lst.append(1) a, v_t, logp_t, output = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1), mask_ph: np.array(lst).reshape(1,-1)}) # print(a, end=" ") num_total += 1 ''' action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs) log_action_prob = np.log(action_probs[action]) ''' # save and log buf.store(o,None, a, np.array(lst), r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, r2, sjf_t, f1_t = env.step(a[0]) ep_ret += r ep_len += 1 show_ret += r2 sjf += sjf_t f1 += f1_t if d: t += 1 buf.finish_path(r) logger.store(EpRet=ep_ret, EpLen=ep_len, ShowRet=show_ret, SJF=sjf, F1=f1) [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1 = env.reset(), 0, False, 0, 0, 0, 0, 0 if t >= traj_per_epoch: # print ("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action) break # print("Sample time:", (time.time()-start_time)/num_total, num_total) # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform PPO update! 
        # start_time = time.time()
        update()
        # print("Train time:", time.time()-start_time)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('ShowRet', average_only=True)
        logger.log_tabular('SJF', average_only=True)
        logger.log_tabular('F1', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
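# The PPO objective assembled above (ratio, min_adv, pi_loss, approx_kl, clipfrac) can
# be stated in a few lines of numpy for reference. This helper is hypothetical and only
# mirrors the graph ops; the sign convention matches the code (a loss to minimize).
import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    logp = np.asarray(logp, dtype=np.float64)
    logp_old = np.asarray(logp_old, dtype=np.float64)
    adv = np.asarray(adv, dtype=np.float64)
    ratio = np.exp(logp - logp_old)                      # pi(a|s) / pi_old(a|s)
    clipped_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    pi_loss = -np.mean(np.minimum(ratio * adv, clipped_adv))
    approx_kl = np.mean(logp_old - logp)                 # sample estimate of the KL divergence
    clipfrac = np.mean((np.abs(ratio - 1.0) > clip_ratio).astype(np.float64))
    return pi_loss, approx_kl, clipfrac

# e.g. ppo_clip_loss(logp=[-1.0, -2.0], logp_old=[-1.1, -1.9], adv=[0.5, -0.3])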
def td3(env_fn, env_fn_test, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, logdir=None, nstep=None, alpha=None, beta=None, sil_weight=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ assert logdir is not None if not os.path.exists(logdir): os.makedirs(logdir) sess = tf.Session() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn_test() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) x_ph_sil, a_ph_sil, x2_ph_sil, r_ph_sil, d_ph_sil = core.placeholders( obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) with tf.variable_scope('main', reuse=True): _, q1_sil, q2_sil, _ = actor_critic(x_ph_sil, a_ph_sil, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) with tf.variable_scope('target', reuse=True): pi_targ_sil, _, _, _ = actor_critic(x2_ph_sil, a_ph_sil, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ_sil), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ_sil + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ_sil, q2_targ_sil, _ = actor_critic(x2_ph_sil, a2, **ac_kwargs) # Experience buffer replay_buffer = BaseReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Prioritized replay for expert data sil_replay_buffer = PrioritizedReplayBuffer(size=replay_size, alpha=alpha) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets backup_discount = gamma min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + backup_discount * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = q1_loss + q2_loss # sil q loss weights_ph = tf.placeholder(tf.float32, [None]) ret_ph = tf.placeholder(tf.float32, [None]) backup_sil = ret_ph # TD3 losses gains_1 = tf.nn.relu(backup_sil - q1_sil) gains_2 = tf.nn.relu(backup_sil - q2_sil) q1_loss_sil = tf.reduce_mean(weights_ph * tf.square(gains_1)) q2_loss_sil = tf.reduce_mean(weights_ph * tf.square(gains_2)) q_loss_sil = q1_loss_sil + q2_loss_sil gains = gains_1 + gains_2 # add to the q loss q_loss += sil_weight * q_loss_sil # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main 
variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): # test recorder ep_ret_list = [] # set up for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0).flatten()) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) ep_ret_list.append(ep_ret) return ep_ret_list olist, alist, rlist, o2list, dlist = [], [], [], [], [] start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # record training ep_ret_record = [] time_step_record = [] # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise).flatten() else: a = env.action_space.sample() # Step the env o2, r, d, info = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d if 'nstep_data_1' in info.keys(): info['nstep_data_1'][-1] = d if 'nstep_data_{}'.format(nstep) in info.keys(): info['nstep_data_{}'.format(nstep)][-1] = d # Store experience to replay buffer if 'nstep_data_1' in info.keys(): replay_buffer.store(*info['nstep_data_1']) if nstep == 1: try: assert info['nstep_data_1'] == [o, a, r, o2, d] except: import pdb pdb.set_trace() olist.append(o) alist.append(a) rlist.append(r) o2list.append(o2) dlist.append(d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). 
""" retlist = list(discount_with_dones(rlist, dlist, gamma)) for o, a, r, o2, d, ret in zip(olist, alist, rlist, o2list, dlist, retlist): sil_replay_buffer.store(o, a, r, o2, d, ret) for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) batch_sil, weights, batch_idxes = sil_replay_buffer.sample_batch( batch_size, beta=beta) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], x_ph_sil: batch_sil['obs1'], x2_ph_sil: batch_sil['obs2'], a_ph_sil: batch_sil['acts'], r_ph_sil: batch_sil['rews'], d_ph_sil: batch_sil['done'], ret_ph: batch_sil['ret'], weights_ph: weights } q_step_ops = [q_loss, q1, q2, train_q_op] + [gains] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # get the priorities new_priorities = outs[-1] + 1e-8 sil_replay_buffer.update_priorities(batch_idxes, new_priorities) #print_stats('new priorities', new_priorities) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 olist, alist, rlist, o2list, dlist = [], [], [], [], [] # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Test the performance of the deterministic version of the agent. ep_rets = test_agent() ep_ret_record.append(np.mean(ep_rets)) time_step_record.append(t) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # save the records np.save(logdir + '/ep_rets', ep_ret_record) np.save(logdir + '/timesteps', time_step_record)
def dqn(env_fn, action_value=core.mlp_action_value, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, lr=1e-3, batch_size=100, start_steps=10000, update_period=10, eps_start=1, eps_end=0.1, eps_step=1e-4, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_num = env.action_space.n # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, obs_dim, None, None) a_ph = tf.placeholder(tf.int32) with tf.variable_scope('main'): q = mlp_action_value(x_ph, **ac_kwargs) with tf.variable_scope('target'): q_targ = mlp_action_value(x2_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main']) print('\nNumber of parameters: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * tf.reduce_max(q_targ, axis=1)) loss = tf.reduce_mean( (tf.reduce_sum(tf.one_hot(a_ph, act_num) * q, 1) - backup)**2) # Separate train ops for q optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_q_op = optimizer.minimize(loss, var_list=get_vars('main/q')) # Update target variables target_update = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'q': q}) def get_action(o, eps): if np.random.random() < eps: return env.action_space.sample() else: return np.squeeze( sess.run(tf.argmax(q, axis=1), feed_dict={x_ph: np.expand_dims(o, 0)})) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): eps = max(eps_start - t * eps_step, eps_end) if t > 1: a = get_action(o, eps) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if t % update_period == 0: sess.run([target_update]) batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([loss, q, train_q_op], feed_dict) logger.store(Loss=outs[0], QVals=outs[1]) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 
1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('Loss', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
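# The DQN loop above anneals epsilon linearly from eps_start to eps_end and regresses
# Q(s, a) toward r + gamma * (1 - done) * max_a' Q_targ(s', a'). A numpy restatement of
# the schedule and the one-hot TD target; these helpers are hypothetical and only mirror
# the graph built with tf above.
import numpy as np

def epsilon_at(t, eps_start=1.0, eps_end=0.1, eps_step=1e-4):
    return max(eps_start - t * eps_step, eps_end)

def dqn_target_and_loss(q_main, q_targ_next, acts, rews, done, gamma=0.99):
    """q_main, q_targ_next: (batch, num_actions); acts: (batch,) ints; rews, done: (batch,)."""
    q_main = np.asarray(q_main, dtype=np.float64)
    q_targ_next = np.asarray(q_targ_next, dtype=np.float64)
    backup = np.asarray(rews) + gamma * (1.0 - np.asarray(done)) * q_targ_next.max(axis=1)
    q_taken = q_main[np.arange(len(acts)), np.asarray(acts, dtype=int)]
    return backup, np.mean((q_taken - backup) ** 2)

# epsilon_at(0) == 1.0 and epsilon_at(9000) == 0.1 under the default schedule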
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") device = torch.device('cpu') print(device) # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) print(logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] obs, act, adv, logp_old = obs.to(device), act.to(device), adv.to( device), logp_old.to(device) # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
                          % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
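# Illustrative NumPy sketch of GAE-lambda, the advantage estimator that the
# VPGBuffer used above is assumed to compute in finish_path() (the buffer
# itself is defined elsewhere). `rews` holds one trajectory's rewards and
# `vals` the value estimates with the bootstrap value appended at the end.
import numpy as np

def gae_lambda_advantages(rews, vals, gamma=0.99, lam=0.97):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    # A_t = sum_{l >= 0} (gamma * lam)^l * delta_{t+l}, accumulated backwards
    for i in reversed(range(len(deltas))):
        running = deltas[i] + gamma * lam * running
        adv[i] = running
    return adv

# Example: gae_lambda_advantages(np.ones(3), np.array([0.5, 0.5, 0.5, 0.0]))
# gives the advantages that the policy loss -(logp * adv).mean() is taken over.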
def td3(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), actor_hidden_layers=[300, 300], critic_hidden_layers=[300, 300], reward_scale=1, seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, without_start_steps=True, batch_size=100, start_steps=10000, without_delay_train=False, without_target_policy_smoothing=False, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) actor_hidden_sizes = actor_hidden_layers critic_hidden_sizes = critic_hidden_layers actor_hidden_activation = tf.keras.activations.relu actor_output_activation = tf.keras.activations.tanh critic_hidden_activation = tf.keras.activations.relu critic_output_activation = tf.keras.activations.linear # Main outputs from computation graph with tf.variable_scope('main'): actor = MLP(layer_sizes=actor_hidden_sizes + [act_dim], hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) critic1 = MLP(layer_sizes=critic_hidden_sizes + [1], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) critic2 = MLP(layer_sizes=critic_hidden_sizes + [1], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) pi = act_limit * actor(x_ph) q1 = tf.squeeze(critic1(tf.concat([x_ph, a_ph], axis=-1)), axis=1) q1_pi = tf.squeeze(critic1(tf.concat([x_ph, pi], axis=-1)), axis=1) q2 = tf.squeeze(critic2(tf.concat([x_ph, a_ph], axis=-1)), axis=1) q2_pi = tf.squeeze(critic2(tf.concat([x_ph, pi], axis=-1)), axis=1) # Target policy network with tf.variable_scope('target'): actor_targ = MLP(layer_sizes=actor_hidden_sizes + [act_dim], hidden_activation=actor_hidden_activation, output_activation=actor_output_activation) pi_targ = act_limit * actor_targ(x2_ph) if without_target_policy_smoothing: a2 = pi_targ else: # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) critic1_targ = MLP(layer_sizes=critic_hidden_sizes + [1], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) critic2_targ = MLP(layer_sizes=critic_hidden_sizes + [1], hidden_activation=critic_hidden_activation, output_activation=critic_output_activation) # Target Q-values, using action from target policy q1_targ = tf.squeeze(critic1_targ(tf.concat([x2_ph, a2], axis=-1)), axis=1) q2_targ = tf.squeeze(critic2_targ(tf.concat([x2_ph, a2], axis=-1)), axis=1) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Bellman backup for Q functions, using Clipped Double-Q targets # TD3 losses min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) # MSE q1_loss = 0.5 * tf.reduce_mean((backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((backup - q2)**2) pi_loss = -tf.reduce_mean(q1_pi) q_loss = q1_loss + q2_loss # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables) train_q_op = q_optimizer.minimize(q_loss, var_list=critic1.variables + critic2.variables) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip( actor.variables + critic1.variables + critic2.variables, actor_targ.variables + critic1_targ.variables + critic2_targ.variables) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip( actor.variables + critic1.variables + critic2.variables, 
actor_targ.variables + critic1_targ.variables + critic2_targ.variables) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, reward_scale * r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if without_delay_train: batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ if not without_delay_train: for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model # Save actor-critic model if (epoch % save_freq == 0) or (epoch == epochs - 1): model_save_dir = os.path.join(logger.output_dir, 'checkpoints') if not os.path.exists(model_save_dir): os.makedirs(model_save_dir) actor.save_weights( os.path.join(model_save_dir, 'epoch{}_actor'.format(epoch))) critic1.save_weights( os.path.join(model_save_dir, 'epoch{}_critic1'.format(epoch))) critic2.save_weights( os.path.join(model_save_dir, 'epoch{}_critic2'.format(epoch))) # Test the performance of the deterministic version of the agent. 
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
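# Illustrative NumPy sketch of the two TD3-specific pieces built into the graph
# above: target policy smoothing (clipped Gaussian noise added to the target
# action) and the clipped double-Q backup. Names are placeholders.
import numpy as np

def smoothed_target_action(pi_targ, act_limit, target_noise=0.2, noise_clip=0.5):
    # a'(s') = clip(pi_targ(s') + clip(eps, -noise_clip, noise_clip), -act_limit, act_limit)
    eps = np.clip(np.random.randn(*np.shape(pi_targ)) * target_noise,
                  -noise_clip, noise_clip)
    return np.clip(pi_targ + eps, -act_limit, act_limit)

def clipped_double_q_target(r, d, q1_targ, q2_targ, gamma=0.99):
    # y = r + gamma * (1 - done) * min(Q1_targ, Q2_targ)
    return r + gamma * (1.0 - d) * np.minimum(q1_targ, q2_targ)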
def maxsqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" # print(max_ep_len,type(max_ep_len)) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] obs_space = env.observation_space act_dim = env.action_space.n act_space = env.action_space # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None) ###### if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.n)) # target_entropy = (np.prod(env.action_space.n))/4/10 target_entropy = 0.4 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) ###### # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu = actor_critic(x_ph,x2_ph, a_ph, alpha, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _, _,q1_pi_, q2_pi_,q1_mu_, q2_mu_= actor_critic(x2_ph, x2_ph,a_ph, alpha, **ac_kwargs) # Experience buffer if isinstance(act_space, Box): a_dim = act_dim elif isinstance(act_space, Discrete): a_dim = 1 replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if isinstance(alpha,tf.Tensor): alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi_ + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: # min_q_pi = tf.minimum(q1_pi_, q2_pi_) min_q_pi = tf.minimum(q1_mu_, q2_mu_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2) ############################## alpha=0 q_backup = r_ph + gamma*(1-d_ph)*v_backup # Soft actor-critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss # # Policy train op # # (has to be separate from value train op, because q1_pi appears in pi_loss) # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') #with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha), train_value_op, target_update] else: step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, alpha, train_value_op, target_update, train_alpha_op] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, 
v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0] def test_agent(n=20): # n: number of tests global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # max_ep_len # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # o = env.reset() ##################### # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0 ##################### o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs ep_index = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ # if t > start_steps and 100*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum if t > start_steps: a = get_action(o) else: a = env.action_space.sample() np.random.random() # Step the env o2, r, d, _ = env.step(a) #print(a,o2) # o2, r, _, d = env.step(a) ##################### # d = d['ale.lives'] < 5 ##################### ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): # make sure: max_ep_len < steps_per_epoch ep_index += 1 print('episode: {}, ep_len: {}, reward: {}'.format(ep_index, ep_len, ep_ret)) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossQ1=outs[0], LossQ2=outs[1], Q1Vals=outs[2], Q2Vals=outs[3], LogPi=outs[4], Alpha=outs[5]) #if d: logger.store(EpRet=ep_ret, EpLen=ep_len) # o = env.reset() ##################### # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0 ##################### o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # # Save model # if (epoch % save_freq == 0) or (epoch == epochs-1): # logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
            test_agent(1)

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            # logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
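# Illustrative NumPy sketch of the soft (entropy-regularized) backup that
# maxsqn regresses its Q-functions toward, and of the objective the
# automatic-alpha branch minimizes with respect to log_alpha. In the graph
# above the clipped double-Q term is built from the target networks'
# greedy-action values and logp_pi2; the names below are generic placeholders,
# and target_entropy mirrors the hard-coded 0.4 used above.
import numpy as np

def soft_q_target(r, d, q1_targ, q2_targ, logp_pi, alpha=0.2, gamma=0.99):
    # y = r + gamma * (1 - done) * (min(Q1_targ, Q2_targ) - alpha * log pi(a'|s'))
    min_q = np.minimum(q1_targ, q2_targ)
    return r + gamma * (1.0 - d) * (min_q - alpha * logp_pi)

def alpha_objective(log_alpha, logp_pi, target_entropy=0.4):
    # mean(-log_alpha * (log pi + target_entropy)); logp_pi is treated as a
    # constant, mirroring the stop_gradient in the TF graph above
    return float(np.mean(-log_alpha * (logp_pi + target_entropy)))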
def rbiflow(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=1000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.2, n_explore=32, device='cuda', n_samples=100, cmin=0.25, cmax=1.75, greed=0.01, rand=0.01): """ Rerouted Behavior Improvement (rbiflow) """ device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) def max_reroute(o): b, _ = o.shape o = repeat_and_reshape(o, n_samples) with torch.no_grad(): ai, _ = ac.pi(o) q1 = ac.q1(o, ai) q2 = ac.q2(o, ai) qi = torch.min(q1, q2).unsqueeze(-1) qi = qi.view(n_samples, b, 1) ai = ai.view(n_samples, b, act_dim) rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False) w = cmin * torch.ones_like(ai) m = int((1 - cmin) * n_samples / (cmax - cmin)) w += (cmax - cmin) * (rank < m).float() w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float() w -= greed w += greed * n_samples * (rank == 0).float() w = w * (1 - rand) + rand w = w / w.sum(dim=0, keepdim=True) prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0)) a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2) return a, (ai, w.mean(-1)) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] _, (ai, w) = max_reroute(o) pi, logp_pi = ac.pi(o) log_ai = ac.pi.log_prob(ai) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - (log_ai * w).sum(dim=0)).mean() # Useful info for logging pi_info = 
dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): o = torch.as_tensor(o, dtype=torch.float32, device=device) if deterministic: a = ac.act(o, deterministic) else: o = o.unsqueeze(0) a, _ = max_reroute(o) a = a.flatten().cpu().numpy() return a def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
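# Illustrative NumPy sketch (single state, unbatched) of the rank-based
# reweighting that max_reroute applies to its n_samples candidate actions:
# every candidate keeps a floor weight of cmin, the top-ranked candidates are
# boosted to cmax, the boundary candidate absorbs the remaining mass so the
# weights sum to n, the single best candidate gets a small greedy bonus, and
# the result is mixed with a uniform distribution before normalizing.
import numpy as np

def reroute_weights(q_candidates, cmin=0.25, cmax=1.75, greed=0.01, rand=0.01):
    # q_candidates: shape (n_samples,), e.g. min(Q1, Q2) for each sampled action
    n = len(q_candidates)
    rank = np.argsort(np.argsort(-q_candidates))   # rank 0 = best candidate
    w = np.full(n, cmin)
    m = int((1 - cmin) * n / (cmax - cmin))
    w[rank < m] = cmax
    # boundary candidate takes whatever mass keeps the weights summing to n
    w[rank == m] += (1 - cmin) * n - m * (cmax - cmin)
    w = w - greed
    w[rank == 0] += greed * n                      # greedy bonus on the best
    w = w * (1 - rand) + rand                      # uniform mixing
    return w / w.sum()                             # categorical over candidates

# A candidate action is then drawn from this distribution, which is what the
# batched, tensor-shaped version inside max_reroute does per state.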