def my_td3(env_fn, seed=0, steps_per_epoch=4000, epochs=100, max_ep_len=1000,
           hidden_sizes=[256, 256], logger_kwargs=dict(), save_freq=1,
           batch_size=100, start_steps=10000, update_after=1000, update_every=50,
           num_test_episodes=10, gamma=0.99, polyak=0.995, act_noise=0.1,
           pi_lr=1e-3, q_lr=1e-3, buffer_size=int(1e6), target_noise=0.2,
           noise_clip=0.5, policy_delay=2):
    """
    My TD3 implementation
    """
    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    test_env = env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print("env.observation_space", env.observation_space)
    print("env.observation_space.shape", env.observation_space.shape)
    print("env.action_space", env.action_space)
    action_min = env.action_space.low[0]
    action_max = env.action_space.high[0]

    if isinstance(env.action_space, gym.spaces.Discrete):
        print("Discrete action space not supported for my_td3!")
        return

    # Set up experience buffer
    buf = ReplayBuffer(obs_dim, act_dim, buffer_size)

    # Instantiate models
    assert action_max == abs(action_min)
    policy = DeterministicPolicyNet(obs_dim, act_dim, hidden_sizes, action_max)
    policy_target = copy.deepcopy(policy)
    policy_optimizer = torch.optim.Adam(policy.mu_net.parameters(), lr=pi_lr)

    # Two Q-functions for Double Q Learning
    q_function_1 = QNet(obs_dim, act_dim, hidden_sizes)
    q_function_target_1 = copy.deepcopy(q_function_1)
    q_optimizer_1 = torch.optim.Adam(q_function_1.q_net.parameters(), lr=q_lr)
    q_function_2 = QNet(obs_dim, act_dim, hidden_sizes)
    q_function_target_2 = copy.deepcopy(q_function_2)
    q_optimizer_2 = torch.optim.Adam(q_function_2.q_net.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(policy)  # TODO: Save value network as well

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p_targ in policy_target.parameters():
        p_targ.requires_grad = False
    for q_targ in q_function_target_1.parameters():
        q_targ.requires_grad = False
    for q_targ in q_function_target_2.parameters():
        q_targ.requires_grad = False

    # Prepare for interaction with environment
    num_steps = epochs * steps_per_epoch
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for step in range(num_steps):  # TODO: Change to for loop over range(epochs) and range(steps_per_epoch)
        with torch.no_grad():
            if step < start_steps:
                # Until start_steps have elapsed, randomly sample actions
                # from a uniform distribution for better exploration. Afterwards,
                # use the learned policy (with some noise, via act_noise).
                a = env.action_space.sample()
            else:
                assert o.shape == (obs_dim, )
                a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0))
                assert a.shape == (1, act_dim)
                a = a[0]  # Remove batch dimension
                a = torch.clamp(a + act_noise * torch.randn(act_dim),
                                action_min, action_max)  # Add exploration noise
                a = a.numpy()  # Convert to numpy

        next_o, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        buf.store(o, a, r, next_o, d)

        # Update obs (critical!)
        o = next_o

        # Trajectory finished
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        if step >= update_after and step % update_every == 0:
            for j in range(update_every):

                def update():
                    o, a, r, next_o, d = buf.sample_batch(batch_size)

                    # Compute targets
                    with torch.no_grad():
                        next_a_targ = policy_target(next_o)
                        # TD3 modification 1: Target policy smoothing
                        eps = torch.clamp(
                            torch.randn_like(next_a_targ) * target_noise,
                            -noise_clip, noise_clip)
                        next_a_targ = torch.clamp(next_a_targ + eps,
                                                  action_min, action_max)
                        # Clipped Double Q-Learning
                        next_q_targ_1 = q_function_target_1(next_o, next_a_targ)
                        next_q_targ_2 = q_function_target_2(next_o, next_a_targ)
                        next_q_targ = torch.min(next_q_targ_1, next_q_targ_2)
                        q_targ_1 = r + gamma * (1 - d) * next_q_targ
                        q_targ_2 = r + gamma * (1 - d) * next_q_targ

                    # Update Q functions
                    q_optimizer_1.zero_grad()
                    q_loss_1 = ((q_function_1(o, a) - q_targ_1)**2).mean()
                    q_loss_1.backward()
                    q_optimizer_1.step()

                    q_optimizer_2.zero_grad()
                    q_loss_2 = ((q_function_2(o, a) - q_targ_2)**2).mean()
                    q_loss_2.backward()
                    q_optimizer_2.step()

                    # Delayed policy updates
                    if j % policy_delay == 0:
                        # Freeze Q-networks so you don't waste computational effort
                        # computing gradients for them during the policy learning step.
                        for p in q_function_1.parameters():
                            p.requires_grad = False
                        for p in q_function_2.parameters():
                            p.requires_grad = False

                        # Policy function update
                        policy_optimizer.zero_grad()
                        policy_loss = -(q_function_1(o, policy(o))).mean()
                        policy_loss.backward()
                        policy_optimizer.step()

                        # Unfreeze Q-networks so you can optimize them at the next step.
                        for p in q_function_1.parameters():
                            p.requires_grad = True
                        for p in q_function_2.parameters():
                            p.requires_grad = True

                        # Update target networks with polyak averaging
                        with torch.no_grad():
                            for p, p_targ in zip(policy.parameters(),
                                                 policy_target.parameters()):
                                p_targ.data.mul_(polyak)
                                p_targ.data.add_((1 - polyak) * p.data)
                            for q, q_targ in zip(q_function_1.parameters(),
                                                 q_function_target_1.parameters()):
                                q_targ.data.mul_(polyak)
                                q_targ.data.add_((1 - polyak) * q.data)
                            for q, q_targ in zip(q_function_2.parameters(),
                                                 q_function_target_2.parameters()):
                                q_targ.data.mul_(polyak)
                                q_targ.data.add_((1 - polyak) * q.data)

                update()

        if (step + 1) % steps_per_epoch == 0:
            epoch = (step + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            def test_agent():
                with torch.no_grad():
                    for j in range(num_test_episodes):
                        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
                        while not (d or (ep_len == max_ep_len)):
                            # Take deterministic actions at test time
                            a = policy(
                                torch.tensor(o,
                                             dtype=torch.float32).unsqueeze(0))
                            assert a.shape == (1, act_dim)
                            a = a[0]  # Remove batch dimension
                            a = a.numpy()  # Convert to numpy
                            o, r, d, _ = test_env.step(a)
                            ep_ret += r
                            ep_len += 1
                        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', step)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
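# The ReplayBuffer used by my_td3 above is not defined in this file. Below is a
# minimal sketch of the interface the call sites imply: store(o, a, r, next_o, d)
# and sample_batch(batch_size) returning float32 torch tensors. The shapes of rew
# and done must follow QNet's output convention (unsqueeze(-1) if the critic
# returns (batch, 1)); the repo's actual buffer may differ in these details.
class ReplayBufferSketch:
    """FIFO experience replay buffer for off-policy agents (illustrative only)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.act = np.zeros((size, act_dim), dtype=np.float32)
        self.rew = np.zeros(size, dtype=np.float32)
        self.next_obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, o, a, r, next_o, d):
        # Overwrite the oldest transition once the buffer is full.
        self.obs[self.ptr] = o
        self.act[self.ptr] = a
        self.rew[self.ptr] = r
        self.next_obs[self.ptr] = next_o
        self.done[self.ptr] = d
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size):
        idxs = np.random.randint(0, self.size, size=batch_size)
        as_t = lambda x: torch.as_tensor(x, dtype=torch.float32)
        return (as_t(self.obs[idxs]), as_t(self.act[idxs]), as_t(self.rew[idxs]),
                as_t(self.next_obs[idxs]), as_t(self.done[idxs]))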
def egl(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.4, n_explore=32, device='cuda', architecture='mlp', sample='on_policy'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ if architecture == 'mlp': actor_critic = core.MLPActorCritic elif architecture == 'spline': actor_critic = core.SplineActorCritic else: raise NotImplementedError device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps]) logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n' % var_counts) n_samples = 100 cmin = 0.25 cmax = 1.75 greed = 0.01 rand = 0.01 def max_reroute(o): b, _ = o.shape o = repeat_and_reshape(o, n_samples) with torch.no_grad(): ai, _ = ac.pi(o) q1 = ac.q1(o, ai) q2 = ac.q2(o, ai) qi = torch.min(q1, q2).unsqueeze(-1) qi = qi.view(n_samples, b, 1) ai = ai.view(n_samples, b, act_dim) rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False) w = cmin * torch.ones_like(ai) m = int((1 - cmin) * n_samples / (cmax - cmin)) w += (cmax - cmin) * (rank < m).float() w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float() w -= greed w += greed * n_samples * (rank == 0).float() w = w * (1 - rand) + rand w = w / w.sum(dim=0, keepdim=True) prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0)) a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2) return a, (ai, w.mean(-1)) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # # Set up function for computing EGL mean-gradient-losses # def compute_loss_g(data): # # o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # # a2 = ball_explore(a1, 
n_explore, eps) # # a2 = a2.view(n_explore * len(r), act_dim) # o_expand = repeat_and_reshape(o, n_explore) # # # Bellman backup for Q functions # with torch.no_grad(): # # q1 = ac.q1(o_expand, a2) # q2 = ac.q2(o_expand, a2) # q_dither = torch.min(q1, q2) # # # Target actions come from *current* policy # a_tag, logp_a_tag = ac.pi(o_tag) # # # Target Q-values # q1_pi_targ = ac_targ.q1(o_tag, a_tag) # q2_pi_targ = ac_targ.q2(o_tag, a_tag) # q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) # q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag) # # q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) # # geps = ac.geps(o, a1) # geps = repeat_and_reshape(geps, n_explore) # a1 = repeat_and_reshape(a1, n_explore) # # geps = (geps * (a2 - a1)).sum(-1) # # l1 loss against Bellman backup # # loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor) # # # Useful info for logging # g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) # # return loss_g, g_info # Set up function for computing EGL mean-gradient-losses def compute_loss_g(data): o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] a2 = ball_explore(a1, n_explore, eps) a2 = a2.view(n_explore * len(r), act_dim) o_expand = repeat_and_reshape(o, n_explore) # Bellman backup for Q functions with torch.no_grad(): q1 = ac.q1(o_expand, a2) q2 = ac.q2(o_expand, a2) q_dither = torch.min(q1, q2) # Target actions come from *current* policy # Target Q-values q1 = ac.q1(o, a1) q2 = ac.q2(o, a1) q_anchor = torch.min(q1, q2) q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) geps = ac.geps(o, a1) geps = repeat_and_reshape(geps, n_explore) a1 = repeat_and_reshape(a1, n_explore) geps = (geps * (a2 - a1)).sum(-1) # l1 loss against Bellman backup loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor) # Useful info for logging g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) return loss_g, g_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) geps_pi = ac.geps(o, pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean() beta = autograd.Variable(pi.detach().clone(), requires_grad=True) q1_pi = ac.q1(o, beta) q2_pi = ac.q2(o, beta) qa = torch.min(q1_pi, q2_pi).unsqueeze(-1) grad_q = autograd.grad(outputs=qa, inputs=beta, grad_outputs=torch.cuda.FloatTensor( qa.size()).fill_(1.), create_graph=False, retain_graph=False, only_inputs=True)[0] # Useful info for logging pi_info = dict( LogPi=logp_pi.detach().cpu().numpy(), GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(), GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(), GradDelta=torch.norm(geps_pi - grad_q, dim=-1).detach().cpu().numpy(), GradSim=F.cosine_similarity(geps_pi, grad_q, dim=-1).detach().cpu().numpy(), ) return loss_pi, pi_info if architecture == 'mlp': # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) g_optimizer = Adam(ac.geps.parameters(), lr=lr) elif architecture == 'spline': # Set up optimizers for policy and q-function pi_optimizer = SparseDenseAdamOptimizer(ac.pi, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2], dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) g_optimizer = SparseDenseAdamOptimizer(ac.geps, dense_args={'lr': lr}, sparse_args={'lr': 10 * lr}) else: raise NotImplementedError # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # 
First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Next run one gradient descent step for the mean-gradient g_optimizer.zero_grad() loss_g, g_info = compute_loss_g(data) loss_g.backward() g_optimizer.step() # Record things logger.store(LossG=loss_g.item(), **g_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in ac.geps.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in ac.geps.parameters(): p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action_on_policy(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def get_action_rbi(o, deterministic=False): o = torch.as_tensor(o, dtype=torch.float32, device=device) if deterministic: a = ac.act(o, deterministic) else: o = o.unsqueeze(0) a, _ = max_reroute(o) a = a.flatten().cpu().numpy() return a if sample == 'on_policy': get_action = get_action_on_policy elif sample == 'rbi': get_action = get_action_rbi else: raise NotImplementedError def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('GVals', with_min_and_max=True)
            logger.log_tabular('LossG', with_min_and_max=True)
            logger.log_tabular('GradGAmp', with_min_and_max=True)
            logger.log_tabular('GradQAmp', with_min_and_max=True)
            logger.log_tabular('GradDelta', with_min_and_max=True)
            logger.log_tabular('GradSim', with_min_and_max=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
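# egl above relies on two helpers defined elsewhere in the repo. A rough sketch of
# the behaviour the call sites imply: repeat_and_reshape tiles a (batch, dim)
# tensor n times along a leading sample axis and flattens it to (n * batch, dim),
# and ball_explore draws n_explore perturbed copies of each action inside an
# L-infinity ball of radius eps. Both are illustrative assumptions, not the repo's
# exact implementations; the clamp to [-1, 1] assumes tanh-squashed actions.
def repeat_and_reshape_sketch(x, n):
    # (batch, dim) -> (n * batch, dim), ordered so .view(n, batch, dim) recovers the groups
    return x.unsqueeze(0).expand(n, *x.shape).reshape(n * x.shape[0], *x.shape[1:])


def ball_explore_sketch(a, n_explore, eps):
    # (batch, act_dim) -> (n_explore, batch, act_dim) of uniformly perturbed actions
    noise = (torch.rand(n_explore, *a.shape, device=a.device) * 2.0 - 1.0) * eps
    return (a.unsqueeze(0) + noise).clamp(-1.0, 1.0)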
def sppo(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=200, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) ########### if args.alpha == 'auto': target_entropy = 0.35 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=tf.log(0.2)) alpha = tf.exp(log_alpha) else: alpha = args.alpha ########### # Main outputs from computation graph mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, h] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) ###### if args.alpha == 'auto': alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(-h + target_entropy) ) # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 ) alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5) train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) # For PPO # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # ### Scheme1: SPPO NO.2: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp) # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) # ### Scheme3: SPPO NO.3: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) ### Scheme2: SPPO NO.2: add entropy min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean( tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h) v_loss = tf.reduce_mean((ret_ph - v)**2) #+(ret_ph - q)**2)/2.0 # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( h) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss) # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = 
sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): if args.alpha == 'auto': sess.run(train_alpha_op, feed_dict=inputs) _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) # for _ in range(train_v_iters): # sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old), Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, h_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a}) # SPPO NO.1: add entropy # rh = r - args.alpha * logp_t if args.alpha == 'auto': rh = r + sess.run(alpha) * h_t else: rh = r + alpha * h_t # exact entropy # save and log buf.store(o, a, rh, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # d = False if ep_len == max_ep_len else d terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # # Save model # if (epoch % save_freq == 0) or (epoch == epochs-1): # logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
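# A short note on the automatic temperature (alpha) update used in sppo above:
# alpha_loss = E[-log_alpha * stop_gradient(target_entropy - h)], so
# d(alpha_loss)/d(log_alpha) = E[h] - target_entropy. A gradient step therefore
# lowers alpha when the policy entropy h exceeds target_entropy and raises it when
# entropy falls below the target, the usual SAC-style temperature adjustment.
# Tiny NumPy illustration (names and values here are illustrative only):
def alpha_gradient_step_sketch(log_alpha, entropy_batch, target_entropy, lr=1e-5):
    # Same update direction as minimizing -log_alpha * (target_entropy - h).
    grad = np.mean(entropy_batch) - target_entropy
    return log_alpha - lr * grad


# e.g. entropy above the 0.35 target pushes alpha down:
# alpha_gradient_step_sketch(np.log(0.2), np.array([0.6, 0.5]), 0.35) < np.log(0.2)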
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 maxRev = float("-inf") #negative infinity in the beginning #maxRevActionSeq=[] maxRevTSTT = 0 maxRevRevenue = 0 maxRevThroughput = 0 maxRevJAH = 0 maxRevRemVeh = 0 maxRevJAH2 = 0 maxRevRMSE_MLvio = 0 maxRevPerTimeVio = 0 maxRevHOTDensity = pd.DataFrame() maxRevGPDensity = pd.DataFrame() maxtdJAHMax = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu numpyFromA = np.array(a[0]) numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin a[0] = np.ndarray.tolist(numpyFromA) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) #get other stats and store them too otherStats = env.getAllOtherStats() if np.any(np.isnan(np.array(otherStats))): sys.exit("Nan found in statistics! Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2=otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret > maxRev: maxRev = ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT = otherStats[0] maxRevRevenue = otherStats[1] maxRevThroughput = otherStats[2] maxRevJAH = otherStats[3] maxRevRemVeh = otherStats[4] maxRevJAH2 = otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio = otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpTSTT', average_only=True)
        logger.log_tabular('EpRevenue', average_only=True)
        logger.log_tabular('EpThroughput', average_only=True)
        logger.log_tabular('EpJAH', average_only=True)
        logger.log_tabular('EpRemVeh', average_only=True)
        logger.log_tabular('EpJAH2', average_only=True)
        logger.log_tabular('EpMLViolRMSE', average_only=True)
        logger.log_tabular('EpPerTimeVio', average_only=True)
        logger.log_tabular('EptdJAHMax', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

    print("Max cumulative reward obtained= %f " % maxRev)
    print(
        "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f"
        % (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh,
           maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax))
    outputVector = [
        maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH,
        maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax
    ]
    #print("\n===Max rev action sequence is\n",maxRevActionSeq)
    exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector)
    exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
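# The PPOBuffer / VPGBuffer classes used by the on-policy implementations above are
# not shown in this file. Their key step is GAE-lambda advantage estimation in
# finish_path(last_val): deltas are one-step TD residuals, the advantage is their
# discounted (gamma * lam) sum, and the return target is the discounted reward sum
# bootstrapped with last_val. This sketch mirrors the standard Spinning Up
# formulation on NumPy arrays; the actual buffer code may differ in details.
def gae_lambda_sketch(rews, vals, last_val, gamma=0.99, lam=0.97):
    """rews, vals: 1-D arrays for one trajectory; returns (advantages, returns)."""
    rews = np.append(rews, last_val)
    vals = np.append(vals, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    ret = np.zeros(len(deltas))
    running_adv, running_ret = 0.0, last_val
    for t in reversed(range(len(deltas))):
        running_adv = deltas[t] + gamma * lam * running_adv
        running_ret = rews[t] + gamma * running_ret
        adv[t] = running_adv
        ret[t] = running_ret
    return adv, ret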
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape # obs_dim = env.observation_space.n act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) bayes_kl_loss = 0. if isinstance(ac.v, BayesMLPCritic): bayes_kl_loss = ac.v.compute_kl() total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0] total_loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old), BayesKL=bayes_kl_loss) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 epoch_reward = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) 
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    epoch_reward.append(ep_ret)
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        if epoch % 10 == 0:
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('BayesKL', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    return epoch_reward
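# Why loss_pi = -(logp * adv).mean() in compute_loss_pi above is the vanilla policy
# gradient: advantages are treated as constants, so the gradient of this loss with
# respect to the policy parameters is -E[adv * grad(log pi(a|s))], i.e. the negated
# score-function estimator. A self-contained check on a toy categorical policy
# (names below are illustrative and not part of the original code):
def _score_function_identity_check():
    logits = torch.zeros(3, requires_grad=True)
    dist = torch.distributions.Categorical(logits=logits)
    acts = torch.tensor([0, 2, 1])
    adv = torch.tensor([1.0, -0.5, 2.0])
    loss = -(dist.log_prob(acts) * adv).mean()
    loss.backward()
    return logits.grad  # equals -mean(adv * grad log pi(a|s)) over this batch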
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-2, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! 
        update()

        # Log info about epoch
        #logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', average_only=True)
        #logger.log_tabular('EpLen', average_only=True)
        #logger.log_tabular('VVals', with_min_and_max=True)
        #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        #logger.log_tabular('LossPi', average_only=True)
        #logger.log_tabular('LossV', average_only=True)
        #logger.log_tabular('DeltaLossPi', average_only=True)
        #logger.log_tabular('DeltaLossV', average_only=True)
        #logger.log_tabular('Entropy', average_only=True)
        #logger.log_tabular('KL', average_only=True)
        #logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
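# The TF1 implementations above (vpg/ppo/sppo) lean on core.placeholders and
# core.placeholders_from_spaces. A sketch of what such helpers typically look like
# in Spinning Up-style code; the repo's own core module is authoritative and these
# names are assumptions for illustration (assumes tf, np, and gym are imported):
def _placeholder_sketch(dim=None):
    shape = (None,) if dim is None else (None, dim) if np.isscalar(dim) else (None, *dim)
    return tf.placeholder(dtype=tf.float32, shape=shape)


def _placeholders_sketch(*dims):
    return [_placeholder_sketch(dim) for dim in dims]


def _placeholder_from_space_sketch(space):
    if isinstance(space, gym.spaces.Box):
        return _placeholder_sketch(space.shape)
    if isinstance(space, gym.spaces.Discrete):
        return tf.placeholder(dtype=tf.int32, shape=(None,))
    raise NotImplementedError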
def vpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A reference to ActorCritic class which after instantiation takes an input ``x``, and action, ``a``, and returns: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # https://pytorch.org/docs/master/notes/randomness.html#cudnn torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Actor Critic model instance actor_critic = actor_critic(obs_dim, **ac_kwargs) actor_critic.to(device) # load to cpu/gpu # Count variables var_counts = tuple(core.count_vars(model) for model in [actor_critic.policy, actor_critic.value]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Optimizers train_pi = optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) train_v = optim.Adam(actor_critic.value.parameters(), lr=vf_lr) # Sync params across processes # sync_all_params() # TODO figure out the way to do use MPI for pytorch def update(): actor_critic.train() obs, act, adv, ret, logp_old = map(lambda x: Tensor(x).to(device), buf.get()) _ , logp, _, val = actor_critic(obs, act) ent = (-logp).mean() # VPG objectives pi_loss = -(logp * adv).mean() v_l_old = ((ret - val)**2).mean() # Policy gradient step train_pi.zero_grad() pi_loss.backward() train_pi.step() # Value function learning for _ in range(train_v_iters): val = actor_critic.value(obs) v_loss = (ret - val).pow(2).mean() train_v.zero_grad() v_loss.backward() train_v.step() actor_critic.eval() # Log changes from update _, logp, _, val = actor_critic(obs, act) pi_l_new = -(logp * adv).mean() v_l_new = ((ret - val)**2).mean() kl = (logp_old - logp).mean() logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_loss), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, logp_t, logp_pi_t, v_t = actor_critic(Tensor(o.reshape(1,-1)).to(device)) # save and log buf.store(o, a.cpu().numpy(), r, v_t.item(), logp_pi_t.cpu().detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.cpu().numpy()) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else actor_critic(Tensor(o.reshape(1,-1)).to(device))[-1].item() buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, actor_critic, None) # Perform VPG update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
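# ---------------------------------------------------------------------------
# Illustration: the commented-out sync_all_params() above carries a TODO about
# using MPI with PyTorch. One possible sketch using mpi4py (an assumption, not
# an existing helper in this project): broadcast parameters once at startup
# and average gradients across ranks between backward() and optimizer.step(),
# mirroring what MpiAdamOptimizer does in the TensorFlow versions.
# ---------------------------------------------------------------------------
import numpy as np
import torch
from mpi4py import MPI

def sync_params(module, root=0):
    # Broadcast parameters from `root` so every process starts identical.
    comm = MPI.COMM_WORLD
    if comm.Get_size() == 1:
        return
    for p in module.parameters():
        data = p.data.cpu().numpy()
        comm.Bcast(data, root=root)
        p.data.copy_(torch.as_tensor(data, dtype=p.data.dtype, device=p.data.device))

def mpi_avg_grads(module):
    # In-place average of each parameter's gradient over all MPI processes.
    comm = MPI.COMM_WORLD
    num_procs = comm.Get_size()
    if num_procs == 1:
        return
    for p in module.parameters():
        if p.grad is None:
            continue
        g = p.grad.cpu().numpy()
        buf = np.zeros_like(g)
        comm.Allreduce(g, buf, op=MPI.SUM)
        p.grad.copy_(torch.as_tensor(buf / num_procs, dtype=p.grad.dtype, device=p.grad.device))

# Usage sketch: pi_loss.backward(); mpi_avg_grads(actor_critic.policy); train_pi.step()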
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=1, steps_per_epoch=2000, epochs=10000, replay_size=int(1e5), gamma=0.99, polyak=0.995, pi_lr=1e-4, q_lr=1e-4, batch_size=128, start_steps=2000, update_after=1000, update_every=1000, act_noise=0.05, num_test_episodes=1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) rospy.init_node('DDPG_Train') env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] print(f"[DDPG] obs dim: {obs_dim} action dim: {act_dim}") # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # ac.apply(init_weights) ac_targ = deepcopy(ac) ac.eval() # in-active training BN print(f"[MODEL] Actor_Critic: {ac}") # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # import ipdb # ipdb.set_trace() q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.cpu().detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) def soft_target_update(): # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): o = torch.as_tensor(o, dtype=torch.float32) if o.dim() == 1: o = o.unsqueeze(0) a = ac.act(o)[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, env.act_limit_min, env.act_limit_max) def test_agent(): print("[DDPG] eval......") for j in range(num_test_episodes): o, d, ep_ret, ep_len = env.reset(), False, 0, 0 # while not(d or (ep_len == max_ep_len)): while not(d or (ep_len == 100)): # Take deterministic actions at test time (noise_scale=0) a = get_action(o, 0) print(f"[Eval] a: {a}") o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). print(f"O {o[-4]:.3f} {o[-3]:.3f} {o[-2]:.3f} {o[-1]:.3f} ") if t > start_steps: # if np.random.rand() > 0.3: a = get_action(o, act_noise) # else: # a = env.action_space.sample() else: a = env.action_space.sample() print(f't {t:7.0f} | a [{a[0]:.3f},{a[1]:.3f}]') # Step the env o2, r, d, info = env.step(a) # print(f"O {o[-4:]} |A {a} |O2 {o2[-4:]} |R {r} |D {d} |Info {info}") print(f" ------------------> R: {r:.3f}") ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): env.pause_pedsim() logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 env.unpause_pedsim() # Update handling if t >= update_after and t % update_every == 0: env.pause_pedsim() ac.train() # active training BN ac_targ.train() if torch.cuda.is_available(): ac.cuda() ac_targ.cuda() for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) if torch.cuda.is_available(): for key, value in batch.items(): batch[key] = value.cuda() update(data=batch) soft_target_update() ac.eval() ac_targ.eval() if torch.cuda.is_available(): ac.cpu() ac_targ.cpu() env.unpause_pedsim() # End of epoch handling if (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() o, d, ep_ret, ep_len = env.reset(), False, 0, 0 sec = time.time() - start_time elapsed_time = str(datetime.timedelta(seconds=sec)).split('.')[0] # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) # logger.log_tabular('Time', time.time()-start_time) logger.log_tabular('Time', elapsed_time) logger.dump_tabular()
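# ---------------------------------------------------------------------------
# Illustration: the DDPG loop above stores transitions in a ReplayBuffer whose
# definition is not shown here. A minimal sketch of the usual fixed-size ring
# buffer with uniform sampling; the flat obs_dim, field names, and the tensor
# conversion are assumptions and may differ from the project's real buffer.
# ---------------------------------------------------------------------------
import numpy as np
import torch

class SimpleReplayBuffer:
    def __init__(self, obs_dim, act_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2 = np.zeros((size, obs_dim), dtype=np.float32)
        self.act = np.zeros((size, act_dim), dtype=np.float32)
        self.rew = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, o, a, r, o2, d):
        # Overwrite the oldest transition once the buffer is full.
        self.obs[self.ptr], self.act[self.ptr] = o, a
        self.rew[self.ptr], self.obs2[self.ptr], self.done[self.ptr] = r, o2, d
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=128):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs[idxs], obs2=self.obs2[idxs], act=self.act[idxs],
                     rew=self.rew[idxs], done=self.done[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}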
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Trust Region Policy Optimization (with support for Natural Policy Gradient) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO losses ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
p = r.copy() r_dot_old = np.dot(r, r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval inputs = {k: v for k, v in zip(all_phs, buf.get())} Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
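# ---------------------------------------------------------------------------
# Illustration: the cg() solver above consumes the damped Hessian-vector
# product (alpha*I + H)v of the mean KL, provided in the TF graph by
# core.hessian_vector_product. A sketch of the same quantity via PyTorch
# double backprop, checked against a toy quadratic; this shows the idea only,
# not the project's actual implementation.
# ---------------------------------------------------------------------------
import torch

def hvp(f, params, v, damping=0.1):
    # Return (damping*I + H) v, where H is the Hessian of the scalar f w.r.t. params.
    grads = torch.autograd.grad(f, params, create_graph=True)
    flat_grad = torch.cat([g.reshape(-1) for g in grads])
    gv = (flat_grad * v).sum()
    hv = torch.autograd.grad(gv, params)
    flat_hv = torch.cat([h.reshape(-1) for h in hv])
    return flat_hv + damping * v

# Toy check on f(x) = 0.5 * x^T A x, whose Hessian is exactly A:
A = torch.diag(torch.tensor([2.0, 3.0, 4.0]))
x = torch.randn(3, requires_grad=True)
v = torch.randn(3)
f = 0.5 * x @ A @ x
print(hvp(f, [x], v, damping=0.0))   # matches A @ v
print(A @ v)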
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. 
logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] # Compute target actions a_next = ac_targ.pi(torch.as_tensor(o2, dtype=torch.float32)) a_next += torch.clamp(target_noise * torch.randn(act_dim), -noise_clip, noise_clip) a_next = torch.clamp(a_next, -act_limit, act_limit) # Compute targets q1 = ac_targ.q1(o2, a_next) q2 = ac_targ.q2(o2, a_next) y = r + gamma * (1 - d) * torch.min(q1, q2) # Loss function loss_q1 = ((ac.q1(o, a) - y) ** 2).mean() loss_q2 = ((ac.q2(o, a) - y) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = torch.as_tensor(data['obs'], dtype=torch.float32) loss_pi = -ac.q1(o, ac.pi(o)).mean() # Gradient ascent return loss_pi #=========================================================================# # # # All of your code goes in the space above. # # # #=========================================================================# # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(q_params, lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. 
for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t+1) % steps_per_epoch == 0: epoch = (t+1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
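# ---------------------------------------------------------------------------
# Illustration: compute_loss_q above builds the TD3 target without an explicit
# torch.no_grad() block (it relies on the target network's requires_grad=False
# flags) and draws one smoothing-noise vector shared across the batch. A
# sketch of the same clipped double-Q target written the more conventional
# way, with a no_grad guard and per-sample noise via randn_like; a sketch
# only, not a drop-in replacement for the code above.
# ---------------------------------------------------------------------------
import torch

def td3_target(ac_targ, o2, r, d, act_limit,
               gamma=0.99, target_noise=0.2, noise_clip=0.5):
    with torch.no_grad():
        a2 = ac_targ.pi(o2)
        eps = torch.clamp(target_noise * torch.randn_like(a2),
                          -noise_clip, noise_clip)                  # target policy smoothing
        a2 = torch.clamp(a2 + eps, -act_limit, act_limit)
        q_targ = torch.min(ac_targ.q1(o2, a2), ac_targ.q2(o2, a2))  # clipped double-Q
        return r + gamma * (1 - d) * q_targ

# Usage sketch: y = td3_target(ac_targ, o2, r, d, act_limit); loss_q1 = ((ac.q1(o, a) - y)**2).mean()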
def eglu(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.2, n_explore=32, device='cuda'): device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing EGL mean-gradient-losses def compute_loss_g(data): o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] a2 = ball_explore(a1, n_explore, eps) a2 = a2.view(n_explore * len(r), act_dim) o_expand = repeat_and_reshape(o, n_explore) # Bellman backup for Q functions with torch.no_grad(): q1 = ac.q1(o_expand, a2) q2 = ac.q2(o_expand, a2) q_dither = torch.min(q1, q2) # Target actions come from *current* policy a_tag, logp_a_tag = ac.pi(o_tag) # Target Q-values q1_pi_targ = ac_targ.q1(o_tag, a_tag) q2_pi_targ = ac_targ.q2(o_tag, a_tag) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag) q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1) a1_in = autograd.Variable(a1.data, requires_grad=True) q1 = ac.q1(o, a1_in) q2 = ac.q2(o, a1_in) qa = torch.min(q1, q2).unsqueeze(-1) geps = autograd.grad(outputs=qa, inputs=a1_in, grad_outputs=torch.cuda.FloatTensor( qa.size()).fill_(1.), create_graph=False, retain_graph=True, only_inputs=True)[0] geps = repeat_and_reshape(geps, n_explore) a1 = repeat_and_reshape(a1, n_explore) geps = (geps * (a2 - a1)).sum(-1) # l1 loss against Bellman backup loss_g = 
F.smooth_l1_loss(geps, q_dither - q_anchor) # Useful info for logging g_info = dict(GVals=geps.flatten().detach().cpu().numpy()) return loss_g, g_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() # Next run one gradient descent step for the mean-gradient loss_g, g_info = compute_loss_g(data) # Record things logger.store(LossG=loss_g.item(), **g_info) q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) # Record things logger.store(LossQ=loss_q.item(), **q_info) loss_q = loss_q + loss_g loss_q.backward() q_optimizer.step() # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in ac.geps.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in ac.geps.parameters(): p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
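# ---------------------------------------------------------------------------
# Illustration: compute_loss_g above fits the analytic action-gradient geps so
# that the directional term (geps * (a2 - a1)).sum(-1) tracks the gap between
# the dithered Q-values and the Bellman anchor. A toy check of the first-order
# identity grad_a Q(a) . (a2 - a1) ~= Q(a2) - Q(a1) that this loss leans on,
# using a made-up quadratic critic:
# ---------------------------------------------------------------------------
import torch

def toy_q(a):
    return -(a ** 2).sum()                    # concave stand-in for a critic

a1 = torch.tensor([0.3, -0.2], requires_grad=True)
q_a1 = toy_q(a1)
(grad_a1,) = torch.autograd.grad(q_a1, a1)

a2 = a1.detach() + 0.01 * torch.randn(2)      # nearby "explored" action
lhs = (grad_a1 * (a2 - a1.detach())).sum()    # directional-derivative term
rhs = toy_q(a2) - q_a1.detach()               # actual Q gap
print(float(lhs), float(rhs))                 # nearly equal for small perturbations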
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, explorer=None, eps=.03, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_epochs = epochs + pretrain_epochs # Main loop: collect experience in env and update/log each epoch for epoch in range(total_epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # explore if you are in a pretrain epoch or if eps-greedy pre = epoch < pretrain_epochs during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
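# ---------------------------------------------------------------------------
# Illustration: the graph above expresses the PPO-clip surrogate through the
# min_adv / tf.where trick. The same objective in its more common clamp form,
# as a small self-contained PyTorch sketch with made-up numbers:
# ---------------------------------------------------------------------------
import torch

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    # L = -E[ min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) ]
    ratio = torch.exp(logp - logp_old)
    clipped = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    return -torch.min(ratio * adv, clipped).mean()

logp_old = torch.tensor([-1.0, -0.5])
logp_new = torch.tensor([-0.8, -1.2])
adv = torch.tensor([1.0, -2.0])
print(ppo_clip_loss(logp_new, logp_old, adv))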
def asac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001, delta=0.02, sample_step=2000): alpha = Alpha(alpha_start=alpha_start, delta=delta) alpha_t = alpha() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None) x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) alpha_ph = core.scale_holder() # Main outputs from computation graph #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi) Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ) R_backup = tf.stop_gradient(Q_pi) adv = Q_pi - R pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2) R_loss = 0.5*tf.reduce_mean((R_backup - R)**2) value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) """ R_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R')) """ # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), 
get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update, R_loss, Q_loss] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5) config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] total_steps = steps_per_epoch * epochs counter = 0 ret_epi = [] obs_epi = [] loss_old = 10000 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7], LossR=outs[11]) counter += 1 logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] logger.store(RetEst=ret_est) if counter >= 1000: loss_new, _ = logger.get_stats('LossPi') counter = 0 if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps: rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32) rho_ptr = 0 for sample_t in range(sample_step): a = get_action(o) o2, r, d, _ = env.step(a) ep_len += 1 d = False if ep_len == max_ep_len else d rho_s[rho_ptr] = o o = o2 if d or (ep_len == max_ep_len): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 advantages = sess.run(adv, feed_dict={x_ph: rho_s}) alpha.update_alpha(advantages) #alpha.update_alpha(rho_q-rho_v) alpha_t = alpha() print(alpha_t) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 loss_old = 10000 else: loss_old = loss_new # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EntCoeff', alpha_t) logger.log_tabular('RetEst', average_only=True) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossR', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def sac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # env params thresh = rl_params['thresh'] # control params seed = rl_params['seed'] epochs = rl_params['epochs'] steps_per_epoch = rl_params['steps_per_epoch'] replay_size = rl_params['replay_size'] batch_size = rl_params['batch_size'] start_steps = rl_params['start_steps'] max_ep_len = rl_params['max_ep_len'] max_noop = rl_params['max_noop'] save_freq = rl_params['save_freq'] render = rl_params['render'] # rl params gamma = rl_params['gamma'] polyak = rl_params['polyak'] lr = rl_params['lr'] grad_clip_val = rl_params['grad_clip_val'] alpha = rl_params['alpha'] target_entropy_start = rl_params['target_entropy_start'] target_entropy_stop = rl_params['target_entropy_stop'] target_entropy_steps = rl_params['target_entropy_steps'] train_env, test_env = env_fn(), env_fn() obs_space = train_env.observation_space act_space = train_env.action_space tf.set_random_seed(seed) np.random.seed(seed) train_env.seed(seed) train_env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) # get the size after resize obs_dim = network_params['input_dims'] act_dim = act_space.n # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # init a state buffer for storing last m states train_state_buffer = StateBuffer(m=obs_dim[2]) test_state_buffer = StateBuffer(m=obs_dim[2]) # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None) # alpha and entropy setup max_target_entropy = tf.log(tf.cast(act_dim, tf.float32)) target_entropy_prop_ph = tf.placeholder(dtype=tf.float32, shape=()) target_entropy = max_target_entropy * target_entropy_prop_ph log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) if alpha == 'auto': # auto tune alpha alpha = tf.exp(log_alpha) else: # fixed alpha alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, pi_logits, q1_logits, q2_logits, q1_a, q2_a, q1_pi, q2_pi = build_models(x_ph, a_ph, act_dim, network_params) # Target value network with tf.variable_scope('target'): _, _, logp_pi_targ, _, _, _, _, _, q1_pi_targ, q2_pi_targ = build_models(x2_ph, a_ph, act_dim, network_params) # Count variables var_counts = tuple(count_vars(scope) for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t alpha: %d, \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) # Min Double-Q: (check the logp_pi bit) min_q_pi = tf.minimum(q1_pi, q2_pi) min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ) # Targets for Q regression q_backup = r_ph + gamma*(1-d_ph)*tf.stop_gradient(min_q_pi_targ - alpha * logp_pi_targ) # critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2) value_loss = q1_loss + q2_loss # actor loss pi_loss = tf.reduce_mean(alpha*logp_pi - min_q_pi) # alpha loss for temperature parameter alpha_backup = tf.stop_gradient(logp_pi + target_entropy) alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup) # Policy train op # (has to be separate from value train op, because q1_logits appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) if grad_clip_val is not None: gvs = pi_optimizer.compute_gradients(pi_loss, var_list=get_vars('main/pi')) capped_gvs = 
[(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_pi_op = pi_optimizer.apply_gradients(capped_gvs) else: train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_pi_op]): if grad_clip_val is not None: gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q')) capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs] train_value_op = value_optimizer.apply_gradients(capped_gvs) else: train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q')) # Alpha train op alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04) with tf.control_dependencies([train_value_op]): train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha')) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op, target_update] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1_a, 'q2': q2_a}) def get_action(state, deterministic=False): state = state.astype('float32') / 255. 
act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: [state]})[0] def reset(env, state_buffer): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # fire to start game and perform no-op for some frames to randomise start o, _, _, _ = env.step(1) # Fire action to start game for _ in range(np.random.randint(1, max_noop)): o, _, _, _ = env.step(0) # Action 'NOOP' o = process_image_observation(o, obs_dim, thresh) r = process_reward(r) old_lives = env.ale.lives() state = state_buffer.init_state(init_obs=o) return o, r, d, ep_ret, ep_len, old_lives, state def test_agent(n=10, render=True): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len, test_old_lives, test_state = reset(test_env, test_state_buffer) terminal_life_lost_test = False if render: test_env.render() while not(d or (ep_len == max_ep_len)): # start by firing if terminal_life_lost_test: a = 1 else: # Take lower variance actions at test(noise_scale=0.05) a = get_action(test_state, True) # Take deterministic actions at test time o, r, d, _ = test_env.step(a) o = process_image_observation(o, obs_dim, thresh) r = process_reward(r) test_state = test_state_buffer.append_state(o) ep_ret += r ep_len += 1 if test_env.ale.lives() < test_old_lives: test_old_lives = test_env.ale.lives() terminal_life_lost_test = True else: terminal_life_lost_test = False if render: test_env.render() if render: test_env.close() logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # ================== Main training Loop ================== start_time = time.time() o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer) total_steps = steps_per_epoch * epochs target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) save_iter = 0 terminal_life_lost = False # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # press fire to start if terminal_life_lost: a = 1 else: if t > start_steps: a = get_action(state) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) o2 = process_image_observation(o2, obs_dim, thresh) r = process_reward(r) one_hot_a = process_action(a, act_dim) next_state = train_state_buffer.append_state(o2) ep_ret += r ep_len += 1 if train_env.ale.lives() < old_lives: old_lives = train_env.ale.lives() terminal_life_lost = True else: terminal_life_lost = False # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(state, one_hot_a, r, next_state, terminal_life_lost) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 state = next_state if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], target_entropy_prop_ph: target_entropy_prop } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], TargEntropy=outs[6], LossAlpha=outs[7], Alpha=outs[8]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer) # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # update target entropy every epoch target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps) # Save model if save_freq is not None: if (epoch % save_freq == 0) or (epoch == epochs-1): print('Saving...') logger.save_state({'env': env}, itr=save_iter) save_iter+=1 # Test the performance of the deterministic version of the agent. test_agent(n=5, render=render) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', average_only=True) logger.log_tabular('TargEntropy', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
class SingleTaskDDPG(Approach): def __init__(self, action_space, observation_space, rng, eps=0.9, discount_factor=0.99, alpha=1e-3): self.rng = rng logger_kwargs = setup_logger_kwargs('SingleTaskDDPG', self.rng) self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.actor_critic = MLPActorCritic # ac_kwargs=dict() ****?????***** # seed=0 self.replay_size = int(1e6) self.polyak = 0.995 self.gamma = discount_factor self.pi_lr = alpha self.q_lr = alpha self.batch_size = 100 self.start_steps = 10000 self.update_after = 1000 self.update_every = 50 self.act_noise = 0.1 self.step_count = 0 self.action_space = action_space self.observation_space = observation_space # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32) #fix # torch.manual_seed(seed) # np.random.seed(seed) # self.obs_dim = self.observation_space.shape self.act_dim = self.action_space.shape[0] # act_dim = self.action_space.n # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.action_space.high[0] self.net = False def init_net(self, state): self.obs_dim = state.shape # Create actor-critic module and target networks self.ac = self.actor_critic(self.obs_dim[0], self.action_space) #took out ac_kwargs self.ac_targ = deepcopy(self.ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=self.replay_size) # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.pi_lr) self.q_optimizer = Adam(self.ac.q.parameters(), lr=self.q_lr) self.logger.setup_pytorch_saver(self.ac) self.net = True def observe(self, state, action, next_state, reward, done): state = self.process_state(state) next_state = self.process_state(next_state) self.replay_buffer.store(state, action, reward, next_state, done) if self.step_count >= self.update_after and self.step_count % self.update_every == 0: for _ in range(self.update_every): batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch) # Set up function for computing DDPG Q-loss def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q = self.ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = self.ac_targ.q(o2, self.ac_targ.pi(o2)) backup = r + self.gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(self, data): o = data['obs'] q_pi = self.ac.q(o, self.ac.pi(o)) return -q_pi.mean() def update(self, data): # First run one gradient descent step for Q. self.q_optimizer.zero_grad() loss_q, loss_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in self.ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. self.pi_optimizer.zero_grad() loss_pi = self.compute_loss_pi(data) loss_pi.backward() self.pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. 
for p in self.ac.q.parameters(): p.requires_grad = True self.logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, state, exploit=False): processed_state = self.process_state(state) if not self.net: self.init_net(processed_state) # state is actually observation self.step_count += 1 if self.step_count <= self.start_steps: return self.action_space.sample() a = self.ac.act(torch.as_tensor(processed_state, dtype=torch.float32)) if not exploit: a += self.act_noise * np.random.randn(self.act_dim) return np.clip(a, -self.act_limit, self.act_limit) def reset(self, reward_function): self.reward_function = reward_function self.net = False # self.step_count = 0 def process_state(self, state): return state def log(self, returns, task): self.logger.store(EpRet=sum(returns), EpLen=len(returns)) self.logger.save_state({'env': task}, None) self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TotalEnvInteracts', self.step_count) self.logger.log_tabular('QVals', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.dump_tabular() def load(self, file, task): # model = torch.load(file) # s = () # for param_tensor in model.state_dict(): # s+=(param_tensor, "\t", model.state_dict()[param_tensor].size()) # return s # model = self.actor_critic(17, self.action_space) # model.load_state_dict(torch.load(file)) self.ac = torch.load(file) self.ac.eval() self.net = True state = task.reset(self.rng) self.reward_function = task.reward_function images = [] for i in range(100): action = self.get_action(state, True) state, reward, done, _ = task.step(action) im = task.render(mode='rgb_array') images.append(im) if done: break imageio.mimsave('figures/DDPG/oracle.mp4', images)
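# --- Illustrative sketch (not part of the original code) ---------------------
# The in-place polyak averaging done at the end of update() can be factored
# into a small reusable helper. soft_update is a hypothetical name; it assumes
# the target and source modules expose parameters() in matching order:
import torch

def soft_update(target_net, source_net, polyak=0.995):
    """theta_targ <- polyak * theta_targ + (1 - polyak) * theta, in place."""
    with torch.no_grad():
        for p_targ, p in zip(target_net.parameters(), source_net.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

# Usage inside update() would then be: soft_update(self.ac_targ, self.ac, self.polyak)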
def ppo(BASE_DIR, expert_density, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), steps_per_epoch=1000, epochs=10, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=50, train_v_iters=50, lam=0.97, max_ep_len=1000, target_kl=0.01, data_n=10): data = {} # ALL THE DATA logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # update rule def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) policy_distr = Gaussian_Density() policy = lambda s: np.random.uniform( -2.0, 2.0, size=env.action_space.shape) # random policy policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length) density = policy_distr.density() data[0] = { 'pol_s': policy_distr.num_samples, 'pol_t': policy_distr.num_trajects } dist_rewards = [] # repeat REIL for given number of rounds for i in range(args.rounds): message = "\nRound {} out of {}\n".format(i + 1, args.rounds) reward = lambda s: expert_density(s) / (density(s) + args.eps) dist_rewards.append(reward) start_time = time.time() o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 r = reward(o) # custom reward # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, old_r, d, _ = env.step(a[0]) r = reward(o) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = old_r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) last_val = reward(o) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 r = reward(o) # store model! if (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print(message) policy = lambda state: sess.run( get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0] data[i] = { 'pol_s': policy_distr.num_samples, 'pol_t': policy_distr.num_trajects } data[i]['rewards'] = evaluate_reward(env, policy, data_n) if i != args.rounds - 1: policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length) density = policy_distr.density() return data, dist_rewards
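# --- Illustrative sketch (not part of the original code) ---------------------
# The tf.where construction above is a compact way of writing PPO's clipped
# surrogate objective: for positive advantages the ratio is effectively capped
# at 1 + clip_ratio, for negative ones at 1 - clip_ratio. A small NumPy check
# of that equivalence (all names here are illustrative only):
import numpy as np

def clipped_surrogate(ratio, adv, clip_ratio=0.2):
    """Textbook form: min(ratio * adv, clip(ratio, 1-eps, 1+eps) * adv)."""
    return np.minimum(ratio * adv,
                      np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)

def where_form(ratio, adv, clip_ratio=0.2):
    """The form used above: min(ratio * adv, min_adv) with a tf.where-style min_adv."""
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return np.minimum(ratio * adv, min_adv)

ratio = np.array([0.5, 0.9, 1.1, 1.5])
adv = np.array([1.0, -1.0, 1.0, -1.0])
assert np.allclose(clipped_surrogate(ratio, adv), where_form(ratio, adv))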
def sac_overwrite(env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, dont_save=True, logger_kwargs=dict(), update_multiplier=1, hidden_activation_setting='relu', use_linear_priority=False, update_order='old_first', eta_0=0.994, m=900, c_min=5000, eta_final=1.0, no_eta_anneal=False): ## TODO eta will anneal to eta_final ## TODO random order update ## update_order can be old_first, new_first, random ## old first will first use data from all of buffer and then sample from a shrinking range of recent data ## new first is update on recent data first then gradually become uniform sampling. """ Largely following OpenAI documentation But slightly different from tensorflow implementation Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. 
""" if no_eta_anneal: eta_final = eta_0 """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed) test_env.action_space.np_random.seed(seed) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! # we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer with weighted/priority sampling replay_buffer = StagePriorityReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) def test_agent(n=5): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs hidden_activation_dictionary = { 'relu': F.relu, 'leaky_relu': F.leaky_relu, 'selu': F.selu } hidden_activation = hidden_activation_dictionary[hidden_activation_setting] """init all networks""" # see line 1 policy_net = TanhGaussianPolicy(obs_dim, act_dim, hidden_sizes, action_limit=act_limit, hidden_activation=hidden_activation) value_net = Mlp(obs_dim, 1, hidden_sizes, hidden_activation=hidden_activation) target_value_net = Mlp(obs_dim, 1, hidden_sizes, hidden_activation=hidden_activation) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes, hidden_activation=hidden_activation) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes, hidden_activation=hidden_activation) # see line 2: copy parameters from value_net to target_value_net target_value_net.load_state_dict(value_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) value_optimizer = optim.Adam(value_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. 
""" if t > start_steps: a = policy_net.get_env_action(o, deterministic=False) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. Quoted from the original SAC paper: 'In practice, we take a single environment step followed by one or several gradient step' after a single environment step, the number of gradient steps is 1 for SAC. (see paper for reference) """ ## first compute the current eta, eta_current = compute_current_eta(eta_0, eta_final, t, total_steps) num_updates = ep_len ck_list = get_ck_list_exp(replay_size, num_updates, eta_current, update_order) for k in range(num_updates): """ first get priority count c_k, and we only uniformly sample from the most recent c_k data points in the replay buffer """ c_k = ck_list[k] if c_k < c_min: c_k = c_min # get data from replay buffer batch = replay_buffer.sample_priority_only_batch( c_k, batch_size) obs_tensor = Tensor(batch['obs1']) obs_next_tensor = Tensor(batch['obs2']) acts_tensor = Tensor(batch['acts']) # unsqueeze is to make sure rewards and done tensors are of the shape nx1, instead of n # to prevent problems later rews_tensor = Tensor(batch['rews']).unsqueeze(1) done_tensor = Tensor(batch['done']).unsqueeze(1) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document psudocode part for reference line nubmers indicate lines in psudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) """get q loss""" # see line 12: first equation v_from_target_v_net = target_value_net(obs_next_tensor) y_q = rews_tensor + gamma * (1 - done_tensor) * v_from_target_v_net # see line 13: compute loss for the 2 q networks, note that we want to detach the y_q value # since we only want to update q networks here, and don't want other gradients q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = mse_criterion(q1_prediction, y_q.detach()) q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = mse_criterion(q2_prediction, y_q.detach()) """get v loss""" # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) min_q1_q2_a_tilda = torch.min( torch.cat([q1_a_tilda, q2_a_tilda], 1), 1)[0].reshape(-1, 1) y_v = min_q1_q2_a_tilda - alpha * log_prob_a_tilda # see line 14: compute loss for value network v_prediction = value_net(obs_tensor) v_loss = mse_criterion(v_prediction, y_v.detach()) """policy loss""" # line 15: note that here we are doing gradient ascent, so we add a minus sign in the front policy_loss = -(q1_a_tilda - alpha * log_prob_a_tilda).mean() """ add policy regularization loss, this is not in 
openai's minimal version, but they are in the original sac code, see https://github.com/vitchyr/rlkit for reference this part is not necessary but might improve performance """ policy_mean_reg_weight = 1e-3 policy_std_reg_weight = 1e-3 mean_reg_loss = policy_mean_reg_weight * (mean_a_tilda** 2).mean() std_reg_loss = policy_std_reg_weight * (log_std_a_tilda** 2).mean() policy_loss = policy_loss + mean_reg_loss + std_reg_loss """update networks""" q1_optimizer.zero_grad() q1_loss.backward() q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() q2_optimizer.step() value_optimizer.zero_grad() v_loss.backward() value_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(target_value_net, value_net, polyak) # store diagnostic info to logger logger.store(LossPi=policy_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossV=v_loss.item(), Q1Vals=q1_prediction.detach().numpy(), Q2Vals=q2_prediction.detach().numpy(), VVals=v_prediction.detach().numpy(), LogPi=log_prob_a_tilda.detach().numpy()) ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if not dont_save: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'value_net': value_net.state_dict(), 'target_value_net': target_value_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'policy_opt': policy_optimizer, 'value_opt': value_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
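# --- Illustrative sketch (not part of the original code) ---------------------
# compute_current_eta and get_ck_list_exp are referenced above but defined
# elsewhere. The sketch below is only a rough guess at their behaviour,
# following the "emphasizing recent experience" idea of sampling from a
# geometrically shrinking window of recent data; the real helpers may use
# different formulas or schedules:
import numpy as np

def compute_current_eta(eta_0, eta_final, t, total_steps):
    """Assumed: linearly anneal eta from eta_0 towards eta_final over training."""
    frac = min(t / float(total_steps), 1.0)
    return eta_0 + frac * (eta_final - eta_0)

def get_ck_list_exp(replay_size, num_updates, eta, update_order):
    """Assumed: for the k-th of num_updates updates, restrict sampling to the
    c_k most recent transitions, with
        c_k = replay_size * eta ** (k * 1000 / num_updates).
    'old_first' walks from a large window down to a small one, 'new_first'
    reverses that order, and 'random' shuffles it."""
    ks = np.arange(1, num_updates + 1)
    ck_list = (replay_size * eta ** (ks * 1000.0 / num_updates)).astype(int)
    if update_order == 'new_first':
        ck_list = ck_list[::-1]
    elif update_order == 'random':
        np.random.shuffle(ck_list)
    return ck_list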
def sac_multistep( env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=1000, epochs=1000, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, save_model=False, auto_alpha=True, grad_clip=-1, logger_store_freq=100, multistep_k=1, debug=False, use_single_variant=False, logger_kwargs=dict(), ): """ Largely following OpenAI documentation, but a bit different Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. save_model (bool): set to True if want to save the trained agent auto_alpha: set to True to use the adaptive alpha scheme, target entropy will be set automatically grad_clip: whether to use gradient clipping. 
< 0 means no clipping logger_store_freq: how many steps to log debugging info, typically don't need to change """ if debug: hidden_sizes = [2, 2] batch_size = 2 start_steps = 1000 multistep_k = 5 use_single_variant = True print('[basic setups] multistep_k:', multistep_k, 'use_single_variant:', use_single_variant) """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed + 10000) test_env.action_space.np_random.seed(seed + 10000) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! # we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer replay_buffer = MultistepReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) """ Auto tuning alpha """ if auto_alpha: target_entropy = -np.prod(env.action_space.shape).item() # H log_alpha = torch.zeros(1, requires_grad=True) alpha_optim = optim.Adam([log_alpha], lr=lr) else: target_entropy, log_alpha, alpha_optim = None, None, None def test_agent(n=1): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs """init all networks""" # see line 1 policy_net = TanhGaussianPolicySACAdapt(obs_dim, act_dim, hidden_sizes, action_limit=act_limit) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q1_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) # see line 2: copy parameters from value_net to target_value_net q1_target_net.load_state_dict(q1_net.state_dict()) q2_target_net.load_state_dict(q2_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode current_update_index = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. 
""" if t > start_steps: a = policy_net.get_env_action(o, deterministic=False) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer # the multi-step buffer (given to you) will store the data in a fashion that # they can be easily used for multi-step update replay_buffer.store(o, a, r, o2, d, ep_len, max_ep_len, multistep_k, gamma) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 """perform update""" if replay_buffer.size >= batch_size: # get data from replay buffer batch = replay_buffer.sample_batch(batch_size) obs_tensor = Tensor(batch['obs1']) # NOTE: given the multi-step buffer, obs_next_tensor now contains the observation that are # k-step away from current observation obs_next_tensor = Tensor(batch['obs2']) acts_tensor = Tensor(batch['acts']) # NOTE: given the multi-step buffer, rewards tensor now contain the sum of discounted rewards in the next # k steps (or up until termination, if terminated in less than k steps) rews_tensor = Tensor(batch['rews']).unsqueeze(1) # NOTE: given the multi-step buffer, done_tensor now shows whether the data's episode terminated in less # than k steps or not done_tensor = Tensor(batch['done']).unsqueeze(1) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document psudocode part for reference line nubmers indicate lines in psudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) """get q loss""" with torch.no_grad(): a_tilda_next, _, _, log_prob_a_tilda_next, _, _ = policy_net.forward( obs_next_tensor) q1_next = q1_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) q2_next = q2_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) # TODO: compute the k-step Q estiamte (in the form of reward + next Q), don't worry about the entropy terms if use_single_variant: ### write code for computing the k-step estimate for the single Q estimate variant case # target y = (gamma**0 * r1 + ... + gamma**(k-1) * rk) + gamma ** k * (1-d) * Q (not considering the entropy term) # and rews_tensor already calculate the sum in the first pair of parenthesises y_q = rews_tensor + gamma**multistep_k * ( 1 - done_tensor) * q1_next else: ### write code for computing the k-step estimate while using double clipped Q # first get the compare Q1 and Q2 and get the min, then use that min to compute the target min_next_q = torch.min(q1_next, q2_next) y_q = rews_tensor + gamma**multistep_k * ( 1 - done_tensor) * min_next_q # add the entropy, with a simplied heuristic way # NOTE: you don't need to modify the following 3 lines. 
They deal with entropy terms powers = np.arange(1, multistep_k + 1) # k = 5 => posers = [1,2,3,4,5] entropy_discounted_sum = -sum(gamma**powers) * ( 1 - done_tensor) * alpha * log_prob_a_tilda_next y_q += entropy_discounted_sum # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = mse_criterion(q1_prediction, y_q) q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = mse_criterion(q2_prediction, y_q) """ get policy loss """ a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) # TODO write code here to compute policy loss correctly, for both variants. if use_single_variant: # still pick Q1 network as the single network q_policy_part = q1_a_tilda else: # compare Q1 and Q2 to get the min q_policy_part = torch.min(q1_a_tilda, q2_a_tilda) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] policy_loss = (alpha * log_prob_a_tilda - q_policy_part).mean() """ alpha loss, update alpha """ if auto_alpha: alpha_loss = -( log_alpha * (log_prob_a_tilda + target_entropy).detach()).mean() alpha_optim.zero_grad() alpha_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(log_alpha, grad_clip) alpha_optim.step() alpha = log_alpha.exp().item() else: alpha_loss = 0 """update networks""" q1_optimizer.zero_grad() q1_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip) q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip) q2_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip) policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(q1_target_net, q1_net, polyak) soft_update_model1_with_model2(q2_target_net, q2_net, polyak) current_update_index += 1 if current_update_index % logger_store_freq == 0: # store diagnostic info to logger logger.store(LossPi=policy_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossAlpha=alpha_loss.item(), Q1Vals=q1_prediction.detach().numpy(), Q2Vals=q2_prediction.detach().numpy(), Alpha=alpha, LogPi=log_prob_a_tilda.detach().numpy()) if d or (ep_len == max_ep_len): """when episode terminates, log info about this episode, then reset""" ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if save_model: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'q1_target_net': q1_target_net.state_dict(), 'q2_target_net': q2_target_net.state_dict(), 'policy_opt': policy_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer, 'log_alpha': log_alpha, 'alpha_opt': alpha_optim, 'target_entropy': target_entropy } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # use joblib.load(fname) to load # Test the performance of 
the deterministic version of the agent. test_agent() # TODO write code here to estimate the bias of the Q networks # recall that we can define the Q bias to be Q value - discounted MC return # initialize another environment that is only used for provide such a bias estimate # store that to logger # I eventually decided not to use buffer, because a buffer makes it a bit harder to discard the last 200 datapoints # but the discussion on buffer was enlightening Watcher! I learned a lot :) # this function is adapted from test_agent() def q_bias_analysis(N=1): # initialize another env # all b_XXX means bias b_env = env_fn() # loop for n episodes b_observations, b_actions, b_mc_returns = [], [], [] for j in range(N): b_o, b_r, b_d, b_ep_len = b_env.reset(), 0, False, 0 observations, actions, rewards, mc_returns = [], [], [], [ 0 ] # last return is 0 while not (b_d or (b_ep_len == max_ep_len)): # add noise to action selection, so no deterministic action b_a = policy_net.get_env_action(b_o, deterministic=False) observations.append(b_o) actions.append(b_a) # b_env.render() # render the environment # Step the env, get next observation, reward and done signal b_o1, b_r, b_d, _ = b_env.step(b_a) rewards.append(b_r) b_ep_len += 1 b_o = b_o1 # check that everything is paired up assert len(observations) == len(actions) == len(rewards) # decide the cutoff point if b_ep_len == 1000: # terminates because of reaching max limit, then discard the last 200 cut_idx = 800 else: # terminate before 1000 steps naturally then the MC return is accurate for later state-action pairs cut_idx = len(observations) b_observations += observations[:cut_idx] b_actions += actions[:cut_idx] G = 0 # loop the rewards list backward to calculate the returns for r in reversed(rewards): G = gamma * G + r mc_returns.append(G) mc_returns.reverse( ) # reverse [0, something, something, ...] into correct order b_mc_returns += mc_returns[:cut_idx] # after rendering close env # b_env.close() # use b_obs, b_acts to calculate Q estimate b_obs_tensor = Tensor(b_observations) b_acts_tensor = Tensor(b_actions) if use_single_variant: # as usual, choose q1 as single network b_q_estimate = q1_net( torch.cat([b_obs_tensor, b_acts_tensor], 1)) else: # as usual, take the min b_q1 = q1_net(torch.cat([b_obs_tensor, b_acts_tensor], 1)) b_q2 = q2_net(torch.cat([b_obs_tensor, b_acts_tensor], 1)) b_q_estimate = torch.min(b_q1, b_q2) # mc returns have been calculated but is still a list, so convert it into a tensor # need to unsqueeze it!!! need to unsqueeze it!!! need to unsqueeze it!!! # or a [1000, 1] tensor - a [1000] tensor would be a [1000, 1000] tensor # and you will get a 1e6 bias like I did ............ 
b_mc_returns_tensor = Tensor(b_mc_returns).unsqueeze(1) # check all pairs are matched assert len(b_q_estimate) == len(b_mc_returns_tensor) # check how many steps each episode makes just for curiosity print('######### number of datapoints:', len(b_q_estimate)) logger.log_tabular('NumDatapoints', len(b_q_estimate)) # Q-bias = Q value - discounted MC return q_bias = b_q_estimate - b_mc_returns_tensor # put the result in buffer store (avg is taken care of, no need to calculate by hand) logger.store(QBias=q_bias.detach().numpy()) # call q_bias_analysis function (running N episodes) q_bias_analysis(5) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('Alpha', with_min_and_max=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # TODO after you store bias info to logger, you should also write code here to log them # so that you can later plot them logger.log_tabular('QBias', with_min_and_max=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() sys.stdout.flush()
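# --- Illustrative sketch (not part of the original code) ---------------------
# The multi-step replay buffer used above is described as storing, for each
# transition, the discounted sum of the next k rewards together with the
# observation k steps ahead. A minimal version of that reward accumulation,
# independent of the buffer class (the function name is illustrative):
def k_step_reward_sum(rewards, gamma):
    """r_0 + gamma * r_1 + ... + gamma^(k-1) * r_{k-1} for up to k rewards."""
    total, discount = 0.0, 1.0
    for r in rewards:
        total += discount * r
        discount *= gamma
    return total

# Example with k = 3 and gamma = 0.99: 1 + 0.99*2 + 0.99**2 * 3 = 5.9203
print(k_step_reward_sum([1.0, 2.0, 3.0], 0.99))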
def vpg(env_fn, actor_critic=tabular_actor_critic.TabularVPGActorCritic, n_episodes=100, env_kwargs={}, logger_kwargs={}, ac_kwargs={}, n_test_episodes=100, gamma=0.99, lam=0.95, bootstrap_n=3): """ Environment has discrete observation and action spaces, both low dimensional so policy and value functions can be stored in table. Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic : The constructor method for an actor critic class with an ``act`` method, and attributes ``pi`` and ``v``. n_episodes (int): Number of episodes/rollouts of interaction (equivalent to number of policy updates) to perform. bootstrap_n (int) : (optional) Number of reward steps to use with a bootstrapped approximate Value function. If None, use GAE-lambda advantage estimation. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) log_wandb = logger_kwargs.get('output_dir').startswith('wandb') env = env_fn(**env_kwargs) test_env = env_fn(**env_kwargs) obs_dim = env.observation_space.n act_dim = env.action_space.n ac = actor_critic(obs_dim, act_dim, **ac_kwargs) def test_agent(): o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0 episode = 0 while episode < n_test_episodes: a, _ = ac.step(o) o2, r, d, _ = test_env.step(a) test_ep_ret += r test_ep_len += 1 o = o2 if d is True: logger.store(TestEpRet=test_ep_ret) logger.store(TestEpLen=test_ep_len) episode += 1 o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0 traj = Trajectory(gamma, lam, bootstrap_n) # Run test agent before any training happens episode = 0 test_agent() print('Mean test returns from random agent:', np.mean(logger.epoch_dict['TestEpRet']), flush=True) logger.log_tabular('Epoch', episode) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('TestEpLen', with_min_and_max=True) # Hack logger values for compatibility with main logging header keys logger.log_tabular('EpRet', 0) logger.log_tabular('EpLen', 0) logger.log_tabular('AverageVVals', 0) logger.log_tabular('MaxVVals', 0) logger.log_tabular('MinVVals', 0) logger.log_tabular('StdVVals', 0) logger.log_tabular('TotalEnvInteracts', 0) if log_wandb: wandb.log(logger.log_current_row, step=episode) logger.dump_tabular() episode += 1 o, ep_ret, ep_len = env.reset(), 0, 0 total_env_interacts = 0 while episode < n_episodes: a, v = ac.step(o) o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 total_env_interacts += 1 traj.store(o, a, r, v) logger.store(VVals=v) o = o2 if d is True: traj.finish_path(last_obs=o, last_val=0) ac.update(traj) test_agent() logger.log_tabular('Epoch', episode) logger.log_tabular('EpRet', ep_ret) logger.log_tabular('EpLen', ep_len) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('TestEpLen', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', total_env_interacts) if log_wandb: wandb.log(logger.log_current_row, step=episode) logger.dump_tabular() traj.reset() episode += 1 o, ep_ret, ep_len = env.reset(), 0, 0 print('pi', ac.pi, flush=True) print('logits_pi', ac.logits_pi, flush=True) print('value', ac.V, flush=True) if isinstance(ac, tabular_actor_critic.TabularReturnHCA) or isinstance(ac, tabular_actor_critic.TabularStateHCA): print('h', ac.h, flush=True)
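# --- Illustrative sketch (not part of the original code) ---------------------
# The Trajectory class above is expected to turn stored rewards and values into
# advantages, either with GAE-lambda or, when bootstrap_n is set, with an
# n-step bootstrapped target. A small sketch of that n-step target for a single
# time step, under the assumption that one value estimate is stored per step
# (names and signature are illustrative, not the actual Trajectory API):
def n_step_return(rewards, values, t, n, gamma, last_val=0.0):
    """G_t = r_t + gamma*r_{t+1} + ... + gamma^(n-1)*r_{t+n-1} + gamma^n * V(s_{t+n}),
    falling back to bootstrapping with last_val when fewer than n steps remain."""
    T = len(rewards)
    horizon = min(n, T - t)
    g = sum((gamma ** i) * rewards[t + i] for i in range(horizon))
    if t + n < T:
        g += (gamma ** n) * values[t + n]
    else:
        g += (gamma ** horizon) * last_val
    return g

# The advantage at step t would then be n_step_return(...) - values[t].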
def sac1_carla(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(3e5), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_space = env.observation_space.spaces[0] act_space = env.action_space obs_dim = obs_space.shape act_dim = act_space.shape # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=list(obs_dim), act_dim=list(act_dim), size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/cnn_layer', 'main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t cnn_layer: %d, \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### if alpha == 'auto': target_entropy = (-np.prod(env.action_space.shape)) log_alpha = tf.get_variable( 'log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # Targets for Q and V regression v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) q_backup = r_ph + gamma*(1-d_ph)*v_backup # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - tf.stop_gradient(q1_pi)) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) value_loss = q1_loss + q2_loss cnn_params = get_vars('main/cnn_layer') # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) pi_params = get_vars('main/pi') train_pi_op = pi_optimizer.minimize(pi_loss, var_list = cnn_params + pi_params) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list = cnn_params + value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), train_pi_op, train_value_op, target_update] else: step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update, train_alpha_op] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: 
o[np.newaxis,...]})[0] def test_agent(n=1): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == 200)): # max_ep_len # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = [0.35, 0] # a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): print('EpRet: ',ep_ret, 'ep_len: ', ep_len) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], Alpha=outs[6]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha',average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
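# ---------------------------------------------------------------------------
# Illustrative sketch (not part of sac1_carla above, which builds its graph in
# TensorFlow 1.x): the alpha == 'auto' branch learns the entropy temperature by
# minimizing -log_alpha * stop_gradient(logp_pi + target_entropy). The
# standalone PyTorch snippet below shows the same update rule on synthetic
# log-probabilities; every number here is made up for the example.
import torch

target_entropy = -2.0                      # e.g. -act_dim for a 2-D action space
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optimizer = torch.optim.Adam([log_alpha], lr=1e-3)

logp_pi = torch.randn(128) - 1.0           # stand-in for the sampled-action log-probs

# Detaching logp_pi mirrors tf.stop_gradient in the graph version above.
alpha_loss = -(log_alpha * (logp_pi + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()

alpha = log_alpha.exp().item()             # temperature that feeds the SAC losses
# ---------------------------------------------------------------------------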
def td3(env_fn: Callable, actor_critic: torch.nn.Module = core.MLPActorCritic, ac_kwargs: Dict = None, seed: int = 0, steps_per_epoch: int = 4000, epochs: int = 2000, replay_size: int = int(1e6), gamma: float = 0.99, polyak: float = 0.995, pi_lr: Union[Callable, float] = 1e-3, q_lr: Union[Callable, float] = 1e-3, batch_size: int = 100, start_steps: int = 10000, update_after: int = 1000, update_every: int = 100, act_noise: Union[Callable, float] = 0.1, target_noise: float = 0.2, noise_clip: float = 0.5, policy_delay: int = 2, num_test_episodes: int = 3, max_ep_len: int = 1000, logger_kwargs: Dict = None, save_freq: int = 1, random_exploration: Union[Callable, float] = 0.0, save_checkpoint_path: str = None, load_checkpoint_path: str = None, load_model_file: str = None): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float or callable): Learning rate for policy. q_lr (float or callable): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float or callable): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) 
target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. random_exploration (float or callable): Probability to randomly select an action instead of selecting from policy. save_checkpoint_path (str): Path to save the model. If not set, no model will be saved load_checkpoint_path (str): Path to load the model. Cannot be set if save_model_path is set. """ if logger_kwargs is None: logger_kwargs = dict() if ac_kwargs is None: ac_kwargs = dict() if save_checkpoint_path is not None: assert load_checkpoint_path is None, "load_model_path cannot be set when save_model_path is already set" if not os.path.exists(save_checkpoint_path): print(f"Folder {save_checkpoint_path} does not exist, creating...") os.makedirs(save_checkpoint_path) if load_checkpoint_path is not None: assert load_model_file is None, "load_checkpoint_path cannot be set when load_model_file is already set" # ------------ Initialisation begin ------------ loaded_state_dict = None if load_checkpoint_path is not None: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) loaded_state_dict = load_latest_state_dict(load_checkpoint_path) logger.epoch_dict = loaded_state_dict['logger_epoch_dict'] q_learning_rate_fn = loaded_state_dict['q_learning_rate_fn'] pi_learning_rate_fn = loaded_state_dict['pi_learning_rate_fn'] epsilon_fn = loaded_state_dict['epsilon_fn'] act_noise_fn = loaded_state_dict['act_noise_fn'] replay_buffer = loaded_state_dict['replay_buffer'] env, test_env = loaded_state_dict['env'], loaded_state_dict['test_env'] ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) ac.load_state_dict(loaded_state_dict['ac']) ac_targ.load_state_dict(loaded_state_dict['ac_targ']) obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] env.action_space.np_random.set_state( loaded_state_dict['action_space_state']) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) t_ori = loaded_state_dict['t'] pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(t_ori)) pi_optimizer.load_state_dict(loaded_state_dict['pi_optimizer']) q_optimizer = Adam(q_params, lr=q_learning_rate_fn(t_ori)) q_optimizer.load_state_dict(loaded_state_dict['q_optimizer']) np.random.set_state(loaded_state_dict['np_rng_state']) torch.set_rng_state(loaded_state_dict['torch_rng_state']) else: logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) q_learning_rate_fn = get_schedule_fn(q_lr) pi_learning_rate_fn = get_schedule_fn(pi_lr) act_noise_fn = get_schedule_fn(act_noise) epsilon_fn = get_schedule_fn(random_exploration) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] env.action_space.seed(seed) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Create 
actor-critic module and target networks if load_model_file is not None: assert os.path.exists( load_model_file ), f"Model file path does not exist: {load_model_file}" ac = torch.load(load_model_file) else: ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(0)) q_optimizer = Adam(q_params, lr=q_learning_rate_fn(0)) t_ori = 0 act_limit = 1.0 # ------------ Initialisation end ------------ # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) torch.set_printoptions(profile="default") # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): pi_targ = ac_targ.pi(o2) # Target policy smoothing epsilon = torch.randn_like(pi_targ) * target_noise epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = torch.clamp(a2, -act_limit, act_limit) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = data['obs'] q1_pi = ac.q1(o, ac.pi(o)) return -q1_pi.mean() # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item()) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for _ in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) scaled_action = get_action(o, 0) o, r, d, _ = test_env.step( unscale_action(env.action_space, scaled_action)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() if loaded_state_dict is not None: o = loaded_state_dict['o'] ep_ret = loaded_state_dict['ep_ret'] ep_len = loaded_state_dict['ep_len'] else: o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): t += t_ori # printMemUsage(f"start of step {t}") # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps and np.random.rand() > epsilon_fn(t): a = get_action(o, act_noise_fn(t)) unscaled_action = unscale_action(env.action_space, a) else: unscaled_action = env.action_space.sample() a = scale_action(env.action_space, unscaled_action) # Step the env o2, r, d, _ = env.step(unscaled_action) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: # Perform LR decay update_learning_rate(q_optimizer, q_learning_rate_fn(t)) update_learning_rate(pi_optimizer, pi_learning_rate_fn(t)) epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Save model and checkpoint save_checkpoint = False checkpoint_path = "" if save_checkpoint_path is not None: save_checkpoint = True checkpoint_path = save_checkpoint_path if load_checkpoint_path is not None: save_checkpoint = True checkpoint_path = load_checkpoint_path if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({}, None) if save_checkpoint: checkpoint_file = os.path.join(checkpoint_path, f'save_{epoch}.pt') torch.save( { 'ac': ac.state_dict(), 'ac_targ': ac_targ.state_dict(), 'replay_buffer': replay_buffer, 'pi_optimizer': pi_optimizer.state_dict(), 'q_optimizer': q_optimizer.state_dict(), 'logger_epoch_dict': logger.epoch_dict, 'q_learning_rate_fn': q_learning_rate_fn, 'pi_learning_rate_fn': pi_learning_rate_fn, 'epsilon_fn': epsilon_fn, 'act_noise_fn': act_noise_fn, 'torch_rng_state': torch.get_rng_state(), 'np_rng_state': np.random.get_state(), 'action_space_state': env.action_space.np_random.get_state(), 'env': env, 'test_env': test_env, 'ep_ret': ep_ret, 'ep_len': ep_len, 'o': o, 't': t + 1 }, checkpoint_file) delete_old_files(checkpoint_path, 10)
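# ---------------------------------------------------------------------------
# Hedged sketch (an assumption, not the repo's actual definitions): td3() above
# clamps policy outputs to act_limit = 1.0 and converts with scale_action /
# unscale_action, which suggests the policy acts in a normalized [-1, 1] box
# while the environment keeps its native bounds. Minimal NumPy versions of such
# helpers could look like this; the real implementations may differ.
import numpy as np
import gym

def scale_action(action_space: gym.spaces.Box, action: np.ndarray) -> np.ndarray:
    """Map an env-space action into the normalized [-1, 1] box."""
    low, high = action_space.low, action_space.high
    return 2.0 * (action - low) / (high - low) - 1.0

def unscale_action(action_space: gym.spaces.Box, scaled_action: np.ndarray) -> np.ndarray:
    """Map a [-1, 1] policy action back into the env's native bounds."""
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)

# Round-trip check on a toy Box space:
space = gym.spaces.Box(low=np.array([-2.0, 0.0], dtype=np.float32),
                       high=np.array([2.0, 1.0], dtype=np.float32),
                       dtype=np.float32)
a = space.sample()
assert np.allclose(unscale_action(space, scale_action(space, a)), a, atol=1e-5)
# ---------------------------------------------------------------------------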
class sac_discrete_class: def __init__(self, env_fn, Actor=core.DiscreteMLPActor, Critic=core.DiscreteMLPQFunction, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(5e5), gamma=0.99, polyak=0.995, lr=1e-5, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_times_every_step=50, num_test_episodes=10, max_ep_len=2000, logger_kwargs=dict(), save_freq=1, automatic_entropy_tuning=True, use_gpu=False, gpu_parallel=False, show_test_render=False, last_save_path=None, state_of_art_model=False, **kwargs): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_times_every_step (int): Number of env interactions that should elapse between gradient descent updates. 
Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.ac_kwargs = ac_kwargs self.seed = seed self.steps_per_epoch = steps_per_epoch self.epochs = epochs self.replay_size = replay_size self.gamma = gamma self.polyak = polyak self.lr = lr self.alpha = alpha self.batch_size = batch_size self.start_steps = start_steps self.update_after = update_after self.update_times_every_step = update_times_every_step self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.logger_kwargs = logger_kwargs self.save_freq = save_freq self.automatic_entropy_tuning = automatic_entropy_tuning self.use_gpu = use_gpu self.gpu_parallel = gpu_parallel self.show_test_render = show_test_render self.last_save_path = last_save_path self.kwargs = kwargs self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env = env_fn() self.test_env = env_fn() self.env.seed(seed) # env.seed(seed) # test_env.seed(seed) self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.n # Create actor-critic module and target networks self.state_of_art_model = state_of_art_model if self.state_of_art_model: self.actor = Actor(**ac_kwargs) self.critic1 = Critic(**ac_kwargs) self.critic2 = Critic(**ac_kwargs) self.critic1_targ = deepcopy(self.critic1) self.critic2_targ = deepcopy(self.critic2) else: self.actor = Actor(self.obs_dim, self.act_dim, **ac_kwargs) self.critic1 = Critic(self.obs_dim, self.act_dim, **ac_kwargs) self.critic2 = Critic(self.obs_dim, self.act_dim, **ac_kwargs) self.critic1_targ = deepcopy(self.critic1) self.critic2_targ = deepcopy(self.critic2) # gpu是否使用 if torch.cuda.is_available(): self.device = torch.device("cuda" if self.use_gpu else "cpu") if gpu_parallel: self.actor = torch.nn.DataParallel(self.actor) self.critic1 = torch.nn.DataParallel(self.critic1) self.critic2 = torch.nn.DataParallel(self.critic2) self.critic1_targ = torch.nn.DataParallel(self.critic1_targ) self.critic2_targ = torch.nn.DataParallel(self.critic2_targ) else: self.use_gpu = False self.gpu_parallel = False self.device = torch.device("cpu") # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.critic1_targ.parameters(): p.requires_grad = False for p in self.critic2_targ.parameters(): p.requires_grad = False self.actor.to(self.device) self.critic1.to(self.device) self.critic2.to(self.device) self.critic1_targ.to(self.device) self.critic2_targ.to(self.device) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=1, size=replay_size, device=self.device) # # List of parameters for both Q-networks (save this for convenience) # q_params = itertools.chain(critic1.parameters(), critic2.parameters()) if self.automatic_entropy_tuning: # we set the max possible entropy as the target entropy self.target_entropy = -np.log((1.0 / self.act_dim)) * 0.98 self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp() self.alpha_optim = Adam([self.log_alpha], lr=lr, eps=1e-4) # Count variables (protip: try to get a feel for how different size networks 
behave!) var_counts = tuple( core.count_vars(module) for module in [self.actor, self.critic1, self.critic2]) self.logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.actor.parameters(), lr=lr) self.q1_optimizer = Adam(self.critic1.parameters(), lr=lr) self.q2_optimizer = Adam(self.critic2.parameters(), lr=lr) if last_save_path is not None: checkpoints = torch.load(last_save_path) self.epoch = checkpoints['epoch'] self.actor.load_state_dict(checkpoints['actor']) self.critic1.load_state_dict(checkpoints['critic1']) self.critic2.load_state_dict(checkpoints['critic2']) self.pi_optimizer.load_state_dict(checkpoints['pi_optimizer']) self.q1_optimizer.load_state_dict(checkpoints['q1_optimizer']) self.q2_optimizer.load_state_dict(checkpoints['q2_optimizer']) self.critic1_targ.load_state_dict(checkpoints['critic1_targ']) self.critic2_targ.load_state_dict(checkpoints['critic2_targ']) # last_best_Return_per_local = checkpoints['last_best_Return_per_local'] print("succesfully load last prameters") else: self.epoch = 0 print("Dont load last prameters.") # Set up function for computing SAC Q-losses def compute_loss_q(self, data): # Bellman backup for Q functions with torch.no_grad(): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] r = r.unsqueeze(-1) if r.ndim == 1 else r d = d.unsqueeze(-1) if d.ndim == 1 else d if self.state_of_art_model and o.ndim != 4: o = o.unsqueeze(dim=1) o2 = o2.unsqueeze(dim=1) # Target actions come from *current* policy a2, (a2_p, logp_a2), _ = self.get_action(o2) # Target Q-values q1_pi_targ = self.critic1_targ(o2) q2_pi_targ = self.critic2_targ(o2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) min_qf_next_target = a2_p * (q_pi_targ - self.alpha * logp_a2) min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1) backup = r + self.gamma * (1 - d) * min_qf_next_target q1 = self.critic1(o).gather(1, a.long()) q2 = self.critic2(o).gather(1, a.long()) # MSE loss against Bellman backup loss_q1 = F.mse_loss(q1, backup) loss_q2 = F.mse_loss(q2, backup) # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q1, loss_q2, q_info # Set up function for computing SAC pi loss def compute_loss_pi(self, data): state_batch = data['obs'] if self.state_of_art_model and state_batch.ndim != 4: state_batch = state_batch.unsqueeze(dim=1) action, (action_probabilities, log_action_probabilities), _ = self.get_action(state_batch) qf1_pi = self.critic1(state_batch) qf2_pi = self.critic2(state_batch) min_qf_pi = torch.min(qf1_pi, qf2_pi) inside_term = self.alpha * log_action_probabilities - min_qf_pi policy_loss = action_probabilities * inside_term policy_loss = policy_loss.mean() log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1) # Useful info for logging pi_info = dict(LogPi=log_action_probabilities.detach().cpu().numpy()) return policy_loss, log_action_probabilities, pi_info def take_optimisation_step(self, optimizer, network, loss, clipping_norm=None, retain_graph=False): if not isinstance(network, list): network = [network] optimizer.zero_grad() # reset gradients to 0 loss.backward( retain_graph=retain_graph) # this calculates the gradients if clipping_norm is not None: for net in network: torch.nn.utils.clip_grad_norm_( net.parameters(), clipping_norm) # clip gradients to help stabilise training optimizer.step() # this applies the gradients 
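# ---------------------------------------------------------------------------
# Standalone illustration (not a method of sac_discrete_class): the
# take_optimisation_step() helper above wraps the usual zero_grad / backward /
# clip_grad_norm_ / step sequence with an optional clipping norm. The toy
# example below runs that sequence on a throwaway network so the call order
# and the effect of torch.nn.utils.clip_grad_norm_ are easy to see.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

x = torch.randn(8, 4)
loss = net(x).pow(2).mean()

optimizer.zero_grad()                                    # reset gradients to 0
loss.backward()                                          # compute gradients
total_norm = nn.utils.clip_grad_norm_(net.parameters(), max_norm=5.0)
optimizer.step()                                         # apply (possibly clipped) gradients
print("gradient norm before clipping:", float(total_norm))
# ---------------------------------------------------------------------------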
def soft_update_of_target_network(self, local_model, target_model, tau): """Updates the target network in the direction of the local network but by taking a step size less than one so the target network's parameter values trail the local networks. This helps stabilise training""" for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def update(self, data): # First run one gradient descent step for Q1 and Q2 loss_q1, loss_q2, q_info = self.compute_loss_q(data) self.take_optimisation_step( self.q1_optimizer, self.critic1, loss_q1, 5, ) self.take_optimisation_step( self.q2_optimizer, self.critic2, loss_q2, 5, ) # Record things self.logger.store(LossQ=(loss_q1.item() + loss_q2.item()) / 2., **q_info) # Freeze Q-networks so you don't waste computational effort # # computing gradients for them during the policy learning step. # for p in q_params: # p.requires_grad = False # Next run one gradient descent step for pi. loss_pi, log_pi, pi_info = self.compute_loss_pi(data) # Record things self.logger.store(LossPi=loss_pi.item(), **pi_info) # # Unfreeze Q-networks so you can optimize it at next DDPG step. # for p in q_params: # p.requires_grad = True if self.automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() # logger.store(alpha_loss=alpha_loss.item()) self.take_optimisation_step( self.pi_optimizer, self.actor, loss_pi, 5, ) with torch.no_grad(): for p, p_targ in zip(self.critic1.parameters(), self.critic1_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) for p, p_targ in zip(self.critic2.parameters(), self.critic2_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) if self.automatic_entropy_tuning: self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None) self.alpha = self.log_alpha.exp() def get_action(self, state): """Given the state, produces an action, the probability of the action, the log probability of the action, and the argmax action""" action_probabilities = self.actor(state) max_probability_action = torch.argmax(action_probabilities).unsqueeze( 0) action_distribution = Categorical(action_probabilities) action = action_distribution.sample().cpu() # Have to deal with situation of 0.0 probabilities because we can't do log 0 z = action_probabilities == 0.0 z = z.float() * 1e-8 log_action_probabilities = torch.log(action_probabilities + z) return action, (action_probabilities, log_action_probabilities), max_probability_action def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset( isRandomStart=True), False, 0, 0 while not (ep_len == self.max_ep_len): if self.show_test_render: self.test_env.render() # Take deterministic actions at test time with torch.no_grad(): if self.state_of_art_model and o.ndim == 2: obs = torch.FloatTensor(o).view([1, 1, *self.obs_dim ]).to(self.device) else: obs = torch.FloatTensor(o).view([1, *self.obs_dim ]).to(self.device) _, (_, _), a = self.get_action(obs) o, r, d, _ = self.test_env.step(a.cpu().item()) ep_ret += r ep_len += 1 text = "Test: Code: %s, Epoch: %s, TestEp_ret: %s, Testep_len: %s." % \ (self.test_env.current_env.code, self.epoch, ep_ret, ep_len) self.logger.log_stdout(text) if d == 1: # insufficient funds print('test: insufficient funds') break elif d == 2: # reached the end of the data index print('test: reached the end of the contract, starting over') self.test_env.reset(isRandomStart=True, total=self.test_env.current_env.total) elif d == 3: # reached the drawdown limit print('test: reached the drawdown limit') break self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def run(self): # Prepare for interaction with environment total_steps = self.steps_per_epoch * self.epochs start_time = time.time() o, ep_ret, ep_len = self.env.reset(), 0, 0 eps = 1 t = self.epoch * self.steps_per_epoch if self.last_save_path is not None else 0 # Main loop: collect experience in env and update/log each epoch self.actor.eval() while t < total_steps: text = "Code: %s, Epoch: %s, Episode: %s, Ep_ret: %s, ep_len: %s. [%s/%s]" % \ (self.env.current_env.code, self.epoch, eps, ep_ret, ep_len, t + 1, total_steps) self.logger.log_stdout(text) # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t >= self.start_steps: with torch.no_grad(): if self.state_of_art_model and o.ndim == 2: obs = torch.FloatTensor(o).view([1, 1, *self.obs_dim ]).to(self.device) else: obs = torch.FloatTensor(o).view([1, *self.obs_dim ]).to(self.device) a, _, _ = self.get_action(obs) a = a.cpu().item() else: a = np.random.randint(0, self.act_dim) # Step the env o2, r, d, _ = self.env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == self.max_ep_len else d # False when ep_len == max_ep_len, otherwise keep d # Store experience to replay buffer if d == 2 or d == 1: # these done codes force a reset done = 1 else: done = 0 self.replay_buffer.store(o, a, r, o2, done) # Super critical, easy to overlook step: make sure to update # most recent observation!
o = o2 # End of trajectory handling if d == 1 or (ep_len == self.max_ep_len ): # ep_len == max_ep_len is the minimum episode length for a successful run o, ep_ret, ep_len = self.env.reset(isRandomStart=False), 0, 0 eps += 1 elif d == 2: # reached the end of the data index self.env.reset( isRandomStart=False, total=self.env.current_env.total) # continue with the next contract, but carry over the previous total assets elif d == 3: # reached the drawdown limit (ignored for now) pass # Update handling if self.replay_buffer.size > self.update_after and t % self.update_times_every_step == 0: self.actor.train() for j in range(self.update_times_every_step): batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch) self.actor.eval() # logger.save_epoch_Ret_optimizer_model(save_dict) # last_best_Return_per_local = Return_per_local # End of epoch handling if ( t + 1 ) % self.steps_per_epoch == 0 and self.replay_buffer.size > self.update_after: if ( t + 1 ) % self.update_times_every_step == 0: # only once update_times_every_step has been reached self.epoch = (t + 1) // self.steps_per_epoch # Save model if proc_id() == 0 and (self.epoch) % self.save_freq == 0: save_dict = { 'epoch': self.epoch, 'actor': self.actor.state_dict(), 'critic1': self.critic1.state_dict(), 'critic2': self.critic2.state_dict(), 'pi_optimizer': self.pi_optimizer.state_dict(), 'q1_optimizer': self.q1_optimizer.state_dict(), 'q2_optimizer': self.q2_optimizer.state_dict(), 'critic1_targ': self.critic1_targ.state_dict(), 'critic2_targ': self.critic2_targ.state_dict(), } self.logger.save_epoch_Ret_optimizer_model(save_dict) self.actor.eval() # Test the performance of the deterministic version of the agent. self.test_agent() # Log info about epoch self.logger.log_tabular('Epoch', self.epoch) # self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('TestEpRet', with_min_and_max=False) # self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalEnvInteracts', t) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('Time', time.time() - start_time) # if epoch > 1: # (time.time() - start_time)/epo self.logger.dump_tabular() t += 1
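# ---------------------------------------------------------------------------
# Hedged sketch on synthetic tensors (not the class above): compute_loss_q() in
# sac_discrete_class builds its critic target from the full action distribution
# at the next state rather than from a sampled action. The textbook
# discrete-SAC form of that target, which the method above approximates, is
#   V(s') = sum_a pi(a|s') * ( min(Q1, Q2)(s', a) - alpha * log pi(a|s') )
#   y     = r + gamma * (1 - d) * V(s')
# and is reproduced below on random data just to show the shapes and arithmetic.
import torch

batch, n_actions = 5, 3
gamma, alpha = 0.99, 0.2

probs = torch.softmax(torch.randn(batch, n_actions), dim=1)   # pi(.|s')
logp = torch.log(probs)
q1 = torch.randn(batch, n_actions)                            # Q1(s', .)
q2 = torch.randn(batch, n_actions)                            # Q2(s', .)
r = torch.randn(batch, 1)
d = torch.zeros(batch, 1)

q_min = torch.min(q1, q2)
v_next = (probs * (q_min - alpha * logp)).sum(dim=1, keepdim=True)
backup = r + gamma * (1 - d) * v_next                         # regression target for both critics
print(backup.shape)                                           # torch.Size([5, 1])
# ---------------------------------------------------------------------------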
def cvi_ad(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, alp = 0.8, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, decay = None, squash = False): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) adv_ph = tf.placeholder(dtype = tf.float32, shape = (None,)) alp_ph = tf.placeholder(dtype = tf.float32) t_step = tf.placeholder(dtype = tf.float32) #adv_ph1 = tf.placeholder(dtype = tf.float32, shape = (None,)) #adv_ph2 = tf.placeholder(dtype = tf.float32, shape = (None,)) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, ad1, ad2, ad1_pi, ad2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, ad1_targ, ad2_targ, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) squash_eps = 1e-2 if squash: print("Squashed") squash_func = lambda x: tf.sign(x) * (tf.sqrt(tf.abs(x) + 1) - 1) + x * squash_eps squash_ifunc = lambda x: tf.sign(x) * ((tf.sqrt(1 + 4 * squash_eps * (tf.abs(x) + 1 + squash_eps)) - 1)** 2 * (1 / (2 * squash_eps))** 2 - 1) else: print ("Not Squashed") squash_func = lambda x: x squash_ifunc = lambda x: x # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts) q1 = v + ad1 q2 = v + ad2 q1_pi = v + ad1_pi q2_pi = v + ad2_pi # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(squash_func(r_ph + gamma*(1-d_ph)*squash_ifunc(v_targ) + alp_ph * adv_ph)) #q_backup1 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph1) #q_backup2 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph2) v_backup = tf.stop_gradient(squash_func(squash_ifunc(min_q_pi) - alpha * logp_pi)) # Soft actor-critic losses #alp = tf.Variable(0.2,dtype=tf.float32) #q_min = tf.minimum(q1,q2) pi_loss = tf.reduce_mean(alpha * logp_pi - squash_ifunc(min_q_pi)) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main) for v_main, v_targ in zip(get_vars('main') , get_vars('target'))]) # target_update = tf.group([tf.assign(v_targ, tf.cond(tf.not_equal(t_step%1000,0), lambda: v_targ, lambda: v_main)) # for v_main, v_targ in zip(get_vars('main') , get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
train_pi_op, train_value_op, target_update] # adv_op = squash_ifunc(tf.minimum(q1_targ, q2_targ))-squash_ifunc(v_targ) adv_op = squash_ifunc(tf.minimum(ad1_targ, ad2_targ)) #adv_op1 = q1_targ-v_targ #adv_op2 = q2_targ-v_targ # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0] def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not(d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs if decay: alp_val = 0.2 else: alp_val = alp update_step = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len==max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(ep_len): update_step+=1 batch = replay_buffer.sample_batch(batch_size) feed_dict = {x2_ph: batch['obs1'], a_ph: batch['acts'] } advantage = sess.run(adv_op , feed_dict) #advantage = sess.run([adv_op1, adv_op2] , feed_dict) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], t_step: update_step, adv_ph : advantage, alp_ph : alp_val #adv_ph1 : advantage[0], #adv_ph2 : advantage[1] } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if decay: alp_val = eval(decay)(t//steps_per_epoch) # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
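# ---------------------------------------------------------------------------
# Standalone check (not part of cvi_ad above): when squash=True, cvi_ad squashes
# its Bellman targets with h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x and
# un-squashes values with squash_ifunc. The NumPy snippet below, using the same
# eps = 1e-2 as the code, verifies numerically that the two functions are
# inverses of each other.
import numpy as np

eps = 1e-2

def squash(x):
    return np.sign(x) * (np.sqrt(np.abs(x) + 1.0) - 1.0) + eps * x

def unsquash(y):
    return np.sign(y) * (((np.sqrt(1.0 + 4.0 * eps * (np.abs(y) + 1.0 + eps)) - 1.0)
                          / (2.0 * eps)) ** 2 - 1.0)

x = np.linspace(-500.0, 500.0, 11)
assert np.allclose(unsquash(squash(x)), x, rtol=1e-6, atol=1e-5)
# ---------------------------------------------------------------------------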
def sac1(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=3000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-4, alpha=0.2, batch_size=150, start_steps=9000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information with policy architecture ac_kwargs['action_space'] = env.action_space ac_kwargs['obs_dim'] = obs_dim h_size = ac_kwargs["h_size"] # hidden size of rnn seq_length = ac_kwargs["seq"] # seq length of rnn # Inputs to computation graph seq = None # training and testing doesn't has to have the same seq length x_ph, a_ph, r_ph, d_ph = core.placeholders([seq, obs_dim], [seq, act_dim], [seq, 1], [seq, 1]) s_t_0 = tf.placeholder(shape=[None, h_size], name="pre_state", dtype="float32") # zero state # Main outputs from computation graph outputs, states = cudnn_rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"]) # outputs, _ = rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"]) # outputs = mlp(outputs, [ac_kwargs["h_size"], ac_kwargs["h_size"]], activation=tf.nn.elu) # states = outputs[:, -1, :] # if use model predict next state (obs) with tf.variable_scope("model"): """hidden size for mlp h_size for RNN """ s_predict = mlp(tf.concat([outputs, a_ph], axis=-1), list(ac_kwargs["hidden_sizes"]) + [ac_kwargs["h_size"]], activation=tf.nn.relu) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic( x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, q1_pi_, q2_pi_ = actor_critic(x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, h_size=h_size, seq_length=seq_length, flag="seq", normalize=ac_kwargs["norm"]) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', "model"]) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t model: %d \n' % var_counts) if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.shape)) target_entropy = -np.prod(env.action_space.shape) log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=ac_kwargs["h0"]) alpha = tf.exp(log_alpha) alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(logp_pi[:, :-1, :] + target_entropy)) # Use smaller learning rate to make alpha decay slower alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) # model train op # we can't use s_T to predict s_T+1 delta_x = tf.stop_gradient( outputs[:, 1:, :] - outputs[:, :-1, :]) # predict delta obs instead of obs # model_loss = tf.abs((1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x[:, :, :obs_dim-act_dim])) model_loss = tf.abs( (1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x)) # how about "done" state model_optimizer = tf.train.AdamOptimizer(learning_rate=lr) if "m" in ac_kwargs["opt"]: value_params_1 = get_vars('model') + get_vars('rnn') else: value_params_1 = get_vars('model') # opt for optimize model train_model_op = model_optimizer.minimize(tf.reduce_mean(model_loss), var_list=value_params_1) # Targets for Q and V regression v_backup = tf.stop_gradient(tf.minimum(q1_pi_, q2_pi_) - alpha * logp_pi) # clip curiosity in_r = tf.stop_gradient( tf.reduce_mean(tf.clip_by_value(model_loss, 0, 64), axis=-1, keepdims=True)) beta = tf.placeholder(dtype=tf.float32, shape=(), name="beta") # beta = ac_kwargs["beta"] # adjust internal reward # can we prove the optimal value of beta # I think beta should decrease with training going on # beta = alpha # adjust internal reward q_backup = r_ph[:, :-1, :] + beta * in_r + gamma * ( 1 - d_ph[:, :-1, :]) * 
v_backup[:, 1:, :] # Soft actor-critic losses # pi_loss = tf.reduce_mean(alpha * logp_pi[:, :-1, :] - q1_pi[:, :-1, :]) pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) # in some case, the last timestep Q function is super important so maybe we can use weight sum of loss # calculate last timestep separately for convince # q1_loss = 0.5 * tf.reduce_mean((q1[:, :-1, :] - q_backup) ** 2) q1_loss = tf.reduce_mean((q1[:, :-1, :] - q_backup)**2) q2_loss = tf.reduce_mean((q2[:, :-1, :] - q_backup)**2) # q2_loss = 0.5 * tf.reduce_mean((q2[:, :-1, :] - q_backup) ** 2) Q_loss = q1[:, :-1, :] - q_backup P_loss = alpha * logp_pi - q1_pi value_loss = q1_loss + q2_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) # train model first pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) with tf.control_dependencies([train_model_op]): train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) if "q" in ac_kwargs["opt"]: value_params = get_vars('main/q') + get_vars('rnn') else: value_params = get_vars('main/q') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in non_deterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), model_loss, train_model_op, train_pi_op, train_value_op, target_update ] else: step_ops = [ pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, model_loss, train_model_op, train_pi_op, train_value_op, target_update, train_alpha_op, Q_loss, P_loss ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, s_t_0_, mu, pi, states, deterministic=False): """s_t_0_ starting step for testing 1 H""" act_op = mu if deterministic else pi action, s_t_1_ = sess.run( [act_op, states], feed_dict={ x_ph: o.reshape(1, 1, obs_dim), a_ph: np.zeros([1, 1, act_dim]), s_t_0: s_t_0_ }) return action.reshape(act_dim), s_t_1_ def test_agent(mu, pi, states, n=10): # global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 s_0 = np.zeros([1, h_size]) while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a, s_1 = get_action(o, s_0, mu, pi, states, deterministic=True) s_0 = s_1 o, r, d, _ = test_env.step(a) # test_env.render() ep_ret += r ep_len += 1 # replay_buffer.store(o.reshape([1, obs_dim]), a.reshape([1, act_dim]), r, d) logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # start = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch 
s_t_0_ = np.zeros([1, h_size]) episode = 0 for t in range(total_steps + 1): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t == 0: start = time.time() if t > start_steps: # s_t_0_store = s_t_0_ # hidden state stored in buffer a, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False) s_t_0_ = s_t_1_ else: # s_t_0_store = s_t_0_ # print(s_t_0_.shape) _, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False) s_t_0_ = s_t_1_ a = env.action_space.sample() # Step the env o2, r, d, _ = env.step( a ) # give back o_t_1 we need store o_t_0 because that is what cause a_t_0 # print(r) # env.render() ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o.reshape([1, obs_dim]), s_t_0_.reshape([1, h_size]), a.reshape([1, act_dim]), r, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ # fps = (time.time() - start)/200 # print("{} fps".format(200 / (time.time() - start))) print(ep_len) episode += 1 start = time.time() beta_ = ac_kwargs["beta"] * (1 - t / total_steps) # beta_ = ac_kwargs["beta"] * (1 / t ** 0.5) for j in range(int(ep_len)): batch = replay_buffer.sample_batch(batch_size) # maybe we can store starting state feed_dict = { x_ph: batch['obs1'], s_t_0: batch[ 's_t_0'], # all zero matrix for zero state in training a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], beta: beta_, } for _ in range(ac_kwargs["tm"] - 1): batch = replay_buffer.sample_batch(batch_size) # maybe we can store starting state feed_dict = { x_ph: batch['obs1'], s_t_0: batch['s_t_0'], # stored zero state for training a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], beta: beta_, } _ = sess.run(train_model_op, feed_dict) outs = sess.run(step_ops, feed_dict) # print(outs) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], Q1Vals=outs[3].flatten(), Q2Vals=outs[4].flatten(), LogPi=outs[5].flatten(), Alpha=outs[6], beta=beta_, model_loss=outs[7].flatten()) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 s_t_0_ = np.zeros([1, h_size ]) # reset s_t_0_ when one episode is finished print("one episode duration:", time.time() - start) start = time.time() # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if epoch % 50 == 0: np.save("model__mq_loss_{}".format(epoch), outs[7]) np.save("Q_mq_loss_{}".format(epoch), outs[-2]) np.save("P_mq_loss_{}".format(epoch), outs[-1]) # Save model # if (epoch % save_freq == 0) or (epoch == epochs - 1): # logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent(mu, pi, states) # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('Episode', episode) logger.log_tabular('name', name) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('beta', average_only=True) logger.log_tabular('model_loss', with_min_and_max=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
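# --- Illustrative sketch (not part of the implementation above) ---
# The update above augments the Bellman backup with a curiosity bonus: the intrinsic
# reward is the clipped prediction error of the learned transition model, scaled by a
# coefficient beta that the caller anneals towards zero over training. A minimal NumPy
# version of that target, with hypothetical argument names:
import numpy as np

def curiosity_q_backup(r, model_error, v_next, done, gamma=0.99, beta=0.1, clip=64.0):
    """All arguments are arrays of the same shape; beta is assumed to be annealed by
    the caller, e.g. beta = beta0 * (1 - t / total_steps) as in the loop above."""
    in_r = np.clip(model_error, 0.0, clip)              # clipped curiosity bonus
    return r + beta * in_r + gamma * (1.0 - done) * v_next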
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) #=========================================================================# # # # All of your code goes in the space below. # # # #=========================================================================# # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions pi_noise_targ = get_pi_noise_clipped(pi, noise_scale=target_noise, noise_clip=noise_clip, act_limit=act_limit) # Target Q-values, using action from smoothed target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, pi_noise_targ, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets q_targ = get_q_target(q1_targ, q2_targ, r_ph, d=d_ph, gamma=0.99) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.losses.mean_squared_error(q_targ, q1) q2_loss = tf.losses.mean_squared_error(q_targ, q2) q_loss = q1_loss + q2_loss #=========================================================================# # # # All of your code goes in the space above. 
# # # #=========================================================================# # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
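# --- Illustrative sketch (not part of the implementation above) ---
# The td3() graph above relies on two target-side tricks: target policy smoothing
# (clipped Gaussian noise added to the target action) and the clipped double-Q backup
# (minimum of the two target critics). A minimal PyTorch version, where pi_targ_fn and
# q1_targ_fn / q2_targ_fn are hypothetical callables standing in for the target networks:
import torch

def td3_backup(pi_targ_fn, q1_targ_fn, q2_targ_fn, o2, r, d, gamma=0.99,
               target_noise=0.2, noise_clip=0.5, act_limit=1.0):
    with torch.no_grad():
        a2 = pi_targ_fn(o2)
        eps = torch.clamp(torch.randn_like(a2) * target_noise, -noise_clip, noise_clip)
        a2 = torch.clamp(a2 + eps, -act_limit, act_limit)          # target policy smoothing
        q_min = torch.min(q1_targ_fn(o2, a2), q2_targ_fn(o2, a2))  # clipped double-Q
        return r + gamma * (1 - d) * q_min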
def sac_adapt_fast(env_fn, hidden_sizes=[256, 256], seed=0, steps_per_epoch=1000, epochs=1000, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4, alpha=0.2, batch_size=256, start_steps=10000, max_ep_len=1000, save_freq=1, save_model=False, auto_alpha=True, grad_clip=-1, logger_store_freq=100, logger_kwargs=dict(), use_deterministic_action=False): """ Largely following OpenAI documentation, but a bit different Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. hidden_sizes: number of entries is number of hidden layers each entry in this list indicate the size of that hidden layer. applies to all networks seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. Note the epoch here is just logging epoch so every this many steps a logging to stdouot and also output file will happen note: not to be confused with training epoch which is a term used often in literature for all kinds of different things epochs (int): Number of epochs to run and train agent. Usage of this term can be different in different algorithms, use caution. Here every epoch you get new logs replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. However during testing the action always come from policy max_ep_len (int): Maximum length of trajectory / episode / rollout. Environment will get reseted if timestep in an episode excedding this number save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. logger_kwargs (dict): Keyword args for EpochLogger. save_model (bool): set to True if want to save the trained agent auto_alpha: set to True to use the adaptive alpha scheme, target entropy will be set automatically grad_clip: whether to use gradient clipping. < 0 means no clipping logger_store_freq: how many steps to log debugging info, typically don't need to change """ """set up logger""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env, test_env = env_fn(), env_fn() ## seed torch and numpy torch.manual_seed(seed) np.random.seed(seed) ## seed environment along with env action space so that everything about env is seeded env.seed(seed) env.action_space.np_random.seed(seed) test_env.seed(seed + 10000) test_env.action_space.np_random.seed(seed + 10000) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # if environment has a smaller max episode length, then use the environment's max episode length max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len # Action limit for clamping: critically, assumes all dimensions share the same bound! 
# we need .item() to convert it from numpy float to python float act_limit = env.action_space.high[0].item() # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) """ Auto tuning alpha """ if auto_alpha: target_entropy = -np.prod(env.action_space.shape).item() # H log_alpha = torch.zeros(1, requires_grad=True) alpha_optim = optim.Adam([log_alpha], lr=lr) else: target_entropy, log_alpha, alpha_optim = None, None, None def test_agent(n=1): """ This will test the agent's performance by running n episodes During the runs, the agent only take deterministic action, so the actions are not drawn from a distribution, but just use the mean :param n: number of episodes to run the agent """ ep_return_list = np.zeros(n) for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time a = policy_net.get_env_action(o, deterministic=True) o, r, d, _ = test_env.step(a) ep_ret += r ep_len += 1 ep_return_list[j] = ep_ret logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs """init all networks""" # see line 1 policy_net = TanhGaussianPolicySACAdapt(obs_dim, act_dim, hidden_sizes, action_limit=act_limit) q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q1_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) q2_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes) # see line 2: copy parameters from value_net to target_value_net q1_target_net.load_state_dict(q1_net.state_dict()) q2_target_net.load_state_dict(q2_net.state_dict()) # set up optimizers policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr) q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr) q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr) # mean squared error loss for v and q networks mse_criterion = nn.MSELoss() # Main loop: collect experience in env and update/log each epoch # NOTE: t here is the current number of total timesteps used # it is not the number of timesteps passed in the current episode current_update_index = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = policy_net.get_env_action( o, deterministic=use_deterministic_action) else: a = env.action_space.sample() # Step the env, get next observation, reward and done signal o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience (observation, action, reward, next observation, done) to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! 
o = o2 """perform update""" if replay_buffer.size >= batch_size: # get data from replay buffer batch = replay_buffer.sample_batch(batch_size) obs_tensor = Tensor(batch['obs1']) obs_next_tensor = Tensor(batch['obs2']) acts_tensor = Tensor(batch['acts']) # unsqueeze is to make sure rewards and done tensors are of the shape nx1, instead of n # to prevent problems later rews_tensor = Tensor(batch['rews']).unsqueeze(1) done_tensor = Tensor(batch['done']).unsqueeze(1) """ now we do a SAC update, following the OpenAI spinup doc check the openai sac document psudocode part for reference line nubmers indicate lines in psudocode part we will first compute each of the losses and then update all the networks in the end """ # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer) """get q loss""" with torch.no_grad(): a_tilda_next, _, _, log_prob_a_tilda_next, _, _ = policy_net.forward( obs_next_tensor) q1_next = q1_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) q2_next = q2_target_net( torch.cat([obs_next_tensor, a_tilda_next], 1)) min_next_q = torch.min(q1_next, q2_next) - alpha * log_prob_a_tilda_next y_q = rews_tensor + gamma * (1 - done_tensor) * min_next_q # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1)) q1_loss = mse_criterion(q1_prediction, y_q) q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1)) q2_loss = mse_criterion(q2_prediction, y_q) """ get policy loss """ a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward( obs_tensor) # see line 12: second equation q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1)) q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1)) min_q1_q2_a_tilda = torch.min(q1_a_tilda, q2_a_tilda) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] policy_loss = (alpha * log_prob_a_tilda - min_q1_q2_a_tilda).mean() """ alpha loss, update alpha """ if auto_alpha: alpha_loss = -( log_alpha * (log_prob_a_tilda + target_entropy).detach()).mean() alpha_optim.zero_grad() alpha_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(log_alpha, grad_clip) alpha_optim.step() alpha = log_alpha.exp().item() else: alpha_loss = 0 """update networks""" q1_optimizer.zero_grad() q1_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip) q1_optimizer.step() q2_optimizer.zero_grad() q2_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip) q2_optimizer.step() policy_optimizer.zero_grad() policy_loss.backward() if grad_clip > 0: nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip) policy_optimizer.step() # see line 16: update target value network with value network soft_update_model1_with_model2(q1_target_net, q1_net, polyak) soft_update_model1_with_model2(q2_target_net, q2_net, polyak) current_update_index += 1 if current_update_index % logger_store_freq == 0: # store diagnostic info to logger logger.store(LossPi=policy_loss.item(), LossQ1=q1_loss.item(), LossQ2=q2_loss.item(), LossAlpha=alpha_loss.item(), Q1Vals=q1_prediction.detach().numpy(), Q2Vals=q2_prediction.detach().numpy(), Alpha=alpha, LogPi=log_prob_a_tilda.detach().numpy()) if d or (ep_len == max_ep_len): """when episode terminates, log info about this episode, then reset""" ## store episode return and length to logger logger.store(EpRet=ep_ret, EpLen=ep_len) ## reset environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % 
steps_per_epoch == 0: epoch = t // steps_per_epoch """ Save pytorch model, very different from tensorflow version We need to save the environment, the state_dict of each network and also the state_dict of each optimizer """ if save_model: sac_state_dict = { 'env': env, 'policy_net': policy_net.state_dict(), 'q1_net': q1_net.state_dict(), 'q2_net': q2_net.state_dict(), 'q1_target_net': q1_target_net.state_dict(), 'q2_target_net': q2_target_net.state_dict(), 'policy_opt': policy_optimizer, 'q1_opt': q1_optimizer, 'q2_opt': q2_optimizer, 'log_alpha': log_alpha, 'alpha_opt': alpha_optim, 'target_entropy': target_entropy } if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(sac_state_dict, None) # use joblib.load(fname) to load # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('Alpha', with_min_and_max=True) logger.log_tabular('LossAlpha', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() sys.stdout.flush()
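# --- Illustrative sketch (not part of the implementation above) ---
# sac_adapt_fast() tunes the entropy temperature automatically: alpha is pushed up when
# the policy's entropy falls below the target entropy and pushed down otherwise. A minimal
# stand-alone version of that step, with hypothetical argument names:
import torch

def update_alpha(log_alpha, alpha_optim, log_prob_a, target_entropy):
    """log_alpha: scalar tensor with requires_grad=True; log_prob_a: log-probs of freshly
    sampled actions; target_entropy: typically -act_dim."""
    alpha_loss = -(log_alpha * (log_prob_a + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()   # current alpha used in the Q and policy losses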
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=100000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, **kwargs): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() # test_env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs.reshape(local_steps_per_epoch, 960), act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs.reshape(local_steps_per_epoch, 960)) - ret) ** 2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # def test_agent(): # for j in range(10): # o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 # while not d: # # if show_test_render: # # test_env.render() # # Take deterministic actions at test time # with torch.no_grad(): # _, (_, _), a = ac.pi._distribution(torch.FloatTensor([o])) # o, r, d, _ = test_env.step(a.cpu().item()) # ep_ret += r # ep_len += 1 # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32).view(-1)) torch.FloatTensor() next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32).view(-1)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) logger.store(TotalRet=env.current_env.total) o, ep_ret, ep_len = env.reset(isRandomStart=True), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # ac.eval() # test_agent() # ac.train() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('TotalRet', with_min_and_max=True) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
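# --- Illustrative sketch (not part of the implementation above) ---
# The heart of compute_loss_pi() above is the PPO clipped surrogate objective: the
# probability ratio is clipped so a single update cannot move the policy too far from
# the one that collected the data. Minimal stand-alone version:
import torch

def ppo_clip_loss(logp_new, logp_old, adv, clip_ratio=0.2):
    ratio = torch.exp(logp_new - logp_old)                         # pi_new(a|s) / pi_old(a|s)
    clipped = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    return -torch.min(ratio * adv, clipped).mean()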
class MultiTaskDDPGAutoQuery(MultiTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAutoQuery')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())
        self.init_query = False
        self.init_reward = False
        self.query_reward = 0

    def reset(self, reward_function):
        self.reward_function = reward_function
        self.init_reward = False
        self.query_reward = 0

    def observe(self, state, action, next_state, reward, done):
        # Use this transition to augment the state - only the first time
        if not self.init_query:
            self.query_state = state
            self.query_action = action
            self.query_next_state = next_state
            self.init_query = True
        if not self.init_reward:
            self.query_reward = self.reward_function(self.query_state,
                                                     self.query_action,
                                                     self.query_next_state)
            self.init_reward = True
        state = self.process_state(state)
        next_state = self.process_state(next_state)
        self.replay_buffer.store(state, action, reward, next_state, done)
        if self.step_count >= self.update_after and self.step_count % self.update_every == 0:
            for _ in range(self.update_every):
                batch = self.replay_buffer.sample_batch(self.batch_size)
                self.update(data=batch)

    def get_action(self, state, exploit=False):
        # On the first call this adds placeholder junk to the processed state,
        # because no transition has been observed yet
        if self.init_query and not self.init_reward:
            self.query_reward = self.reward_function(self.query_state,
                                                     self.query_action,
                                                     self.query_next_state)
            self.init_reward = True
        processed_state = self.process_state(state)
        if not self.net:
            self.init_net(processed_state)  # state is actually an observation
        self.step_count += 1
        if self.step_count <= self.start_steps:
            return self.action_space.sample()
        a = self.ac.act(torch.as_tensor(processed_state, dtype=torch.float32))
        if not exploit:
            a += self.act_noise * np.random.randn(self.act_dim)
        return np.clip(a, -self.act_limit, self.act_limit)

    def process_state(self, state):
        return np.append(state, self.query_reward)
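# --- Illustrative sketch (not part of the implementation above) ---
# MultiTaskDDPGAutoQuery conditions the policy on the task by appending the reward that
# the current task assigns to one fixed "query" transition to every observation. A minimal
# stand-alone version of that augmentation, with hypothetical names:
import numpy as np

def augment_obs_with_query(obs, reward_function, query_state, query_action, query_next_state):
    query_reward = reward_function(query_state, query_action, query_next_state)
    return np.append(obs, query_reward)   # policy input = [observation, task fingerprint]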
def bc_ue_ptb_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5", buffer_seed=0, buffer_size='1000K', cut_buffer_size='1000K', mcue_seed=1, qloss_k=10000, qgt_seed=0, qlearn_type='learn_all_data', border=0.75, clip=0.85, update_type='e', eval_freq=float(1e3), max_timesteps=float(1e6), lr=1e-3, lag_lr=1e-3, search_lr=3e-2, wd=0, epsilon_base=1, logger_kwargs=dict()): """parameters |max_timesteps|, |eval_freq|: for BC_ue_border_perturb_c, Totalsteps means the number of minibatch updates (default batch size=100) for BC_ue_border_perturb_5, for BC_ue_border_perturb_e, Totalsteps means the number of updates on each datapoint, i.e., a step is an iteration of one optimization step on each data in the buffer""" device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("running on device:", device) """set up logger""" global logger logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) file_name = "BCue_per_e_%s_%s" % (env_set, seed) buffer_name = "%s_%s_%s_%s" % (buffer_type, env_set, buffer_seed, buffer_size) setting_name = "%s_r%s_g%s" % (buffer_name, 1000, 0.99) print ("---------------------------------------") print ("Settings: " + setting_name) print ("---------------------------------------") if not os.path.exists("./results"): os.makedirs("./results") env = gym.make(env_set) test_env = gym.make(env_set) # Set seeds env.seed(seed) test_env.seed(seed) env.action_space.np_random.seed(seed) test_env.action_space.np_random.seed(seed) torch.manual_seed(seed) np.random.seed(seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) action_range = float(env.action_space.high[0]) - float( env.action_space.low[0]) print('env', env_set, 'action range:', action_range) # Print out config used in MC Upper Envelope training rollout_list = [None, 1000, 200, 100, 10] k_list = [10000, 1000, 100] print('testing MClength:', rollout_list[mcue_seed % 10]) print('Training loss ratio k:', k_list[mcue_seed // 10]) selection_info = 'ue_border%s' % border selection_info += '_clip%s' % clip if clip is not None else '' print('selection_info:', selection_info) # Load the ue border selected buffer selected_buffer = utils.SARSAReplayBuffer() if buffer_size != cut_buffer_size: buffer_name = buffer_name + '_cutfinal' + cut_buffer_size selected_buffer.load(selection_info + '_' + buffer_name) buffer_length = selected_buffer.get_length() print(buffer_length) print('buffer setting:', selection_info + '_' + buffer_name) # Load the Q net trained with regression on Gts # And Load the corresponding Gts to the selected buffer selected_gts = np.load('./results/sele%s_ueMC_%s_Gt.npy' % (selection_info, setting_name), allow_pickle=True) if qlearn_type == 'learn_all_data': verbose_qnet = 'alldata_qgts%s' % qgt_seed + 'lok=%s' % qloss_k elif qlearn_type == 'learn_border_data': verbose_qnet = 'uebor%s_qgts%s' % (border, qgt_seed) if clip is None \ else 'uebor%s_clip%s_qgts%s' % (border, clip, qgt_seed) verbose_qnet += 'lok=%s' % qloss_k else: raise ValueError print('verbose_qnet:', verbose_qnet) Q_from_gt = QNet(state_dim, action_dim, activation='relu') Q_from_gt.load_state_dict( torch.load('%s/%s_Qgt.pth' % ("./pytorch_models", setting_name + '_' + verbose_qnet))) print('load Qnet from', '%s/%s_UE.pth' % ("./pytorch_models", setting_name)) # choose the epsilon plan for the constraints if update_type == 'c': epsilon = epsilon_plan(epsilon_base, action_range, selected_buffer, selected_gts, Q_from_gt, device,\ plan='common') 
else: epsilon = torch.FloatTensor([epsilon_base]) print('one epsilon:', epsilon) print('policy train starts --') '''Initialize policy of the update type''' print("Updating approach: BC_ue_border_perturb_%s" % update_type) if update_type == "c": policy = BC_ue_border_perturb_c.BC_ue_perturb(state_dim, action_dim, max_action,\ lr=lr, lag_lr=lag_lr, wd=wd, num_lambda=buffer_length, Q_from_gt=Q_from_gt ) elif update_type == "5": policy = BC_ue_border_perturb_5.BC_ue_perturb(state_dim, action_dim, max_action, \ lr=lr, lag_lr=lag_lr, wd=wd, Q_from_gt=Q_from_gt) elif update_type == "e": policy = BC_ue_border_perturb_e.BC_ue_perturb(state_dim, action_dim, max_action, \ lr=lr, wd=wd, Q_from_gt=Q_from_gt) policy.train_a_tilda(selected_buffer, max_updates=50, search_lr=search_lr, epsilon=epsilon) episode_num = 0 done = True training_iters, epoch = 0, 0 while training_iters < max_timesteps: epoch += 1 if update_type == 'e': pol_vals = policy.behavioral_cloning(iterations=int(eval_freq), logger=logger) else: # "5" and "c" pol_vals = policy.train(selected_buffer, iterations=int(eval_freq), epsilon=epsilon, logger=logger) avgtest_reward = evaluate_policy(policy, test_env) training_iters += eval_freq logger.log_tabular('Epoch', epoch) logger.log_tabular('AverageTestEpRet', avgtest_reward) logger.log_tabular('TotalSteps', training_iters) if update_type == 'c': logger.log_tabular('BCLoss', average_only=True) logger.log_tabular('ActorLoss', average_only=True) logger.log_tabular('LambdaMax', average_only=True) logger.log_tabular('LambdaMin', average_only=True) logger.log_tabular('ConstraintViolated', with_min_and_max=True) elif update_type == '5': logger.log_tabular('BCLoss', average_only=True) logger.log_tabular('ActorLoss', average_only=True) logger.log_tabular('Lambda', average_only=True) logger.log_tabular('ConstraintViolatedValue', average_only=True) elif update_type == 'e': logger.log_tabular('BCLoss', average_only=True) logger.dump_tabular()
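# --- Illustrative sketch (not part of the implementation above) ---
# The BC_ue_border_perturb_* policies above start from behavioral cloning on the
# upper-envelope-selected buffer and add a Q-value / perturbation constraint on top.
# The underlying cloning objective is a regression of the policy's action onto the
# logged action; a minimal PyTorch version, with hypothetical names:
import torch

def behavioral_cloning_loss(actor, states, actions):
    """actor: a torch.nn.Module mapping states to actions; states, actions: tensors
    sampled from the (selected) replay buffer."""
    return ((actor(states) - actions) ** 2).mean()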
def vpg(env, actor_critic=MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE 0 for advantage estimation) Args: env : An environment that satisfies the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) obs_dim = env.observation_space.shape act_dim = env.action_space.n # assumes Discrete space ac = actor_critic(env.observation_space, env.action_space) ac.to(device) # buffer size equals number of steps in an epoch buff = VPGBuffer(steps_per_epoch, gamma, obs_dim, act_dim) def compute_loss_pi(data): obs = torch.as_tensor(data.obs_buf, dtype=torch.float32, device=device) act = torch.as_tensor(data.act_buf, dtype=torch.int32, device=device) adv = torch.as_tensor(data.advantage_buf, dtype=torch.float32, device=device) logpa = ac.pi(obs, act) return -1 * (logpa * adv).mean() def compute_loss_v(data): obs = torch.as_tensor(data.obs_buf, dtype=torch.float32, device=device) rew2go = torch.as_tensor(data.rew2go_buf, dtype=torch.float32, device=device) values = ac.v(obs) return F.mse_loss(values, rew2go) pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr) v_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update_pi(data): pi_optimizer.zero_grad() pi_loss = compute_loss_pi(data) pi_loss.backward() pi_optimizer.step() logger.store(LossPi=pi_loss.item()) #TODO: log policy entropy def update_v(data): for s in range(train_v_iters): v_optimizer.zero_grad() v_loss = compute_loss_v(data) v_loss.backward() v_optimizer.step() logger.store(LossV=v_loss.item()) total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 t = 0 # total environment interactions # Update policy once per epoch for epoch in range(epochs): for t_epoch in range(steps_per_epoch): t += 1 a, v, logpa = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) o2, r, d, info = env.step(a.cpu().numpy()) buff.store(o, a, v, r, logpa) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d o = o2 # If trajectory is finished, calculate rewards to go, # then calculate the Advantage. if d is True or (ep_len == max_ep_len) or (t_epoch + 1 == steps_per_epoch): buff.finish_trajectory() logger.store( EpRet=ep_ret, EpLen=ep_len, ) o, ep_ret, ep_len = env.reset(), 0, 0 # Calculate policy gradient when we've collected t_epoch time steps. if t_epoch + 1 == steps_per_epoch: pylogger.debug('*** epoch ***', epoch) pylogger.debug('*** t_epoch ***', t_epoch) pylogger.debug('values', buff.val_buf) pylogger.debug('rewards', buff.rew_buf) pylogger.debug('rew2go', buff.rew2go_buf) pylogger.debug('advantage', buff.advantage_buf) # Update the policy using policy gradient update_pi(buff) # Re-fit the value function on the MSE. Note, this is # gradient descent starting from the previous parameters. update_v(buff) # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # note, this includes full model pickle # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Time', time.time() - start_time) if hasattr(env, 'episode_id'): logger.log_tabular('EpisodeId', env.episode_id) # If a quantity has not been calculated/stored yet, do not log it. This can # happen, e.g. if NN update length or episode length exceeds num steps in epoch. 
to_log = [{ 'key': 'LossV', 'average_only': True }, { 'key': 'LossPi', 'average_only': True }, { 'key': 'EpRet', 'with_min_and_max': True }, { 'key': 'EpLen', 'average_only': True }, { 'key': 'RawRet', 'with_min_and_max': True }, { 'key': 'RawLen', 'average_only': True }] for log_tabular_kwargs in to_log: key = log_tabular_kwargs['key'] if key in logger.epoch_dict and len(logger.epoch_dict[key]) > 0: logger.log_tabular(**log_tabular_kwargs) wandb.log(logger.log_current_row, step=epoch) logger.dump_tabular() # reset buffer buff = VPGBuffer(steps_per_epoch, gamma, obs_dim, act_dim) # Save final model as a state dict state = { 'epoch': epoch, 'pi_state_dict': ac.pi.state_dict(), 'v_state_dict': ac.v.state_dict(), 'pi_optimizer': pi_optimizer.state_dict(), 'v_optimizer': v_optimizer.state_dict(), } # hack for wandb: should output the model in the wandb.run.dir to avoid # problems syncing the model in the cloud with wandb's files state_fname = os.path.join(logger_kwargs['output_dir'], f"state_dict.pt") torch.save(state, state_fname) wandb.save(state_fname) pylogger.info(f"Saved state dict to {state_fname}") env.close()
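# --- Illustrative sketch (not part of the implementation above) ---
# vpg() fits the value function by regressing on the reward-to-go targets stored in
# VPGBuffer (rew2go_buf). A minimal stand-alone computation of discounted rewards-to-go,
# for reference:
import numpy as np

def rewards_to_go(rews, gamma=0.99):
    """rtg[t] = sum over k >= t of gamma**(k - t) * rews[k]."""
    rtg = np.zeros(len(rews))
    running = 0.0
    for t in reversed(range(len(rews))):
        running = rews[t] + gamma * running
        rtg[t] = running
    return rtg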