def test_vtrace_from_logits(self, batch_size=2):
    """Tests V-trace calculated from logits."""
    seq_len = 5
    num_actions = 3
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    values = {
        "behavior_policy_logits": _shaped_arange(seq_len, batch_size, num_actions),
        "target_policy_logits": _shaped_arange(seq_len, batch_size, num_actions),
        "actions": np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
        "discounts": np.array(
            # T, B where B_i: [0.9 / (i+1)] * T
            [[0.9 / (b + 1) for b in range(batch_size)] for _ in range(seq_len)],
            dtype=np.float32,
        ),
        "rewards": _shaped_arange(seq_len, batch_size),
        "values": _shaped_arange(seq_len, batch_size) / batch_size,
        "bootstrap_value": _shaped_arange(batch_size) + 1.0,  # B
    }
    values = {k: torch.from_numpy(v) for k, v in values.items()}

    from_logits_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **values,
    )

    target_log_probs = vtrace.action_log_probs(
        values["target_policy_logits"], values["actions"])
    behavior_log_probs = vtrace.action_log_probs(
        values["behavior_policy_logits"], values["actions"])
    log_rhos = target_log_probs - behavior_log_probs

    # Calculate V-trace using the ground truth logits.
    from_iw = vtrace.from_importance_weights(
        log_rhos=log_rhos,
        discounts=values["discounts"],
        rewards=values["rewards"],
        values=values["values"],
        bootstrap_value=values["bootstrap_value"],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
    )

    assert_allclose(from_iw.vs, from_logits_output.vs)
    assert_allclose(from_iw.pg_advantages, from_logits_output.pg_advantages)
    assert_allclose(behavior_log_probs, from_logits_output.behavior_action_log_probs)
    assert_allclose(target_log_probs, from_logits_output.target_action_log_probs)
    assert_allclose(log_rhos, from_logits_output.log_rhos)
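
# The test above relies on two helpers that are not shown here: _shaped_arange and
# assert_allclose. A minimal sketch of plausible definitions, assuming numpy-backed
# test utilities; these are assumptions, not necessarily the repository's exact code.
import numpy as np
from numpy.testing import assert_allclose  # assumption: numpy's allclose-based assert


def _shaped_arange(*shape):
    """Returns np.arange(prod(shape)) as float32, reshaped to `shape`."""
    return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)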
def learn(
    flags,
    learner_queue,
    model,
    actor_model,
    optimizer,
    scheduler,
    stats,
    plogger,
    lock=threading.Lock(),
):
    for tensors in learner_queue:
        tensors = nest.map(lambda t: t.to(flags.learner_device), tensors)

        batch, initial_agent_state = tensors
        env_outputs, actor_outputs = batch
        frame, reward, done, *_ = env_outputs

        lock.acquire()  # Only one thread learning at a time.
        learner_outputs, unused_state = model(
            dict(frame=frame, reward=reward, done=done), initial_agent_state
        )

        # Take final value function slice for bootstrapping.
        learner_outputs = AgentOutput._make(learner_outputs)
        bootstrap_value = learner_outputs.baseline[-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = nest.map(lambda t: t[1:], batch)
        learner_outputs = nest.map(lambda t: t[:-1], learner_outputs)

        # Turn into namedtuples again.
        env_outputs, actor_outputs = batch
        env_outputs = EnvOutput._make(env_outputs)
        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs = AgentOutput._make(learner_outputs)

        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(env_outputs.rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = env_outputs.rewards

        discounts = (1 - env_outputs.done).float() * flags.discounting

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=actor_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=actor_outputs.action,
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value,
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs.policy_logits,
            actor_outputs.action,
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs.baseline
        )
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs.policy_logits
        )
        total_loss = pg_loss + baseline_loss + entropy_loss

        scheduler.step()
        optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()

        actor_model.load_state_dict(model.state_dict())

        episode_returns = env_outputs.episode_return[env_outputs.done]
        stats["step"] = stats.get("step", 0) + flags.unroll_length * flags.batch_size
        stats["episode_returns"] = tuple(episode_returns.cpu().numpy())
        stats["mean_episode_return"] = torch.mean(episode_returns).item()
        stats["mean_episode_step"] = torch.mean(env_outputs.episode_step.float()).item()
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["learner_queue_size"] = learner_queue.size()

        plogger.log(stats)

        if not len(episode_returns):
            # Hide the mean-of-empty-tuple NaN as it scares people.
            stats["mean_episode_return"] = None

        lock.release()
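
# The loss helpers called above (compute_policy_gradient_loss, compute_baseline_loss,
# compute_entropy_loss) are defined elsewhere in the codebase. The following is a
# minimal sketch consistent with how they are called here; it is an assumption, not
# necessarily the repository's exact definitions.
import torch
import torch.nn.functional as F


def compute_baseline_loss(advantages):
    # 1/2 * sum of squared differences between the V-trace targets and the baseline.
    return 0.5 * torch.sum(advantages ** 2)


def compute_entropy_loss(logits):
    # Negative entropy, so that adding it to the loss maximizes policy entropy.
    policy = F.softmax(logits, dim=-1)
    log_policy = F.log_softmax(logits, dim=-1)
    return torch.sum(policy * log_policy)


def compute_policy_gradient_loss(logits, actions, advantages):
    # Cross-entropy of the taken actions, weighted by the (detached) V-trace advantages.
    cross_entropy = F.nll_loss(
        F.log_softmax(torch.flatten(logits, 0, 1), dim=-1),
        target=torch.flatten(actions, 0, 1).long(),
        reduction="none",
    )
    cross_entropy = cross_entropy.view_as(advantages)
    return torch.sum(cross_entropy * advantages.detach())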
def learn(
    flags,
    actor_model,  # actor-side copy of the model, refreshed with the learner weights below
    model,
    batch,
    optimizer,
    scheduler,
    lock=threading.Lock(),  # noqa: B008
):
    """Performs a learning (optimization) step."""
    with lock:
        learner_outputs = model.learner_step(batch)

        # Take final value function slice for bootstrapping.
        bootstrap_value = learner_outputs["baseline_trg"][-1]  # V_learner(s_T)
        entropy = learner_outputs["entropy"]

        # Keep [log_prob_0, ..., log_prob_{T-1}] and [V_0, ..., V_{T-1}];
        # the entropy entry is handled separately above.
        learner_outputs = {
            key: tensor[:-1]
            for key, tensor in learner_outputs.items()
            if key != "entropy"
        }

        rewards = batch["reward"][1:]
        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = rewards

        # not_done * gamma yields a per-step discount of 0 at terminal steps
        # and flags.discounting otherwise.
        vtrace_returns = vtrace.from_logits(
            behavior_action_log_probs=batch["log_prob"][:-1],  # from the actor (behavior policy)
            target_action_log_probs=learner_outputs["log_prob"],  # from the learner (target policy)
            not_done=(~batch["done"][1:]).float(),
            bootstrap=batch["bootstrap"][1:],
            gamma=flags.discounting,
            rewards=clipped_rewards,
            values=learner_outputs["baseline"],
            values_trg=learner_outputs["baseline_trg"],
            bootstrap_value=bootstrap_value,  # also from the learner
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs["log_prob"],
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs["baseline"])
        entropy_loss = flags.entropy_cost * entropy
        total_loss = pg_loss + baseline_loss + entropy_loss

        # The unroll length is usually shorter than the episode length, so
        # batch["done"] may contain no True entries and episode_returns can be empty.
        episode_returns = batch["episode_return"][batch["done"]]

        stats = {
            "episode_returns": tuple(episode_returns.cpu().numpy()),
            "mean_episode_return": torch.mean(episode_returns).item(),
            "total_loss": total_loss.item(),
            "pg_loss": pg_loss.item(),
            "baseline_loss": baseline_loss.item(),
            "entropy_loss": entropy_loss.item(),
        }

        optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        if flags.optim == "RMSprop":
            scheduler.step()
        actor_model.load_state_dict(model.state_dict())
        return stats
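
# This variant passes action log-probabilities directly instead of logits, so the
# importance ratios are simply rho_t = exp(target_log_prob_t - behavior_log_prob_t).
# Below is a compact, illustrative sketch of the core V-trace target recursion from
# the IMPALA paper written for this log-prob interface. It is an assumption for
# exposition only: the vtrace.from_logits used above has an extended signature
# (target-network values, bootstrap mask) that this sketch does not cover.
import torch


@torch.no_grad()
def vtrace_targets(behavior_log_probs, target_log_probs, discounts, rewards,
                   values, bootstrap_value, clip_rho=1.0, clip_c=1.0):
    """Returns V-trace targets vs for tensors of shape (T, B)."""
    # rho_t = pi(a_t | s_t) / mu(a_t | s_t), clipped as in the IMPALA paper.
    rhos = torch.exp(target_log_probs - behavior_log_probs)
    clipped_rhos = torch.clamp(rhos, max=clip_rho)
    cs = torch.clamp(rhos, max=clip_c)

    values_t_plus_1 = torch.cat([values[1:], bootstrap_value[None]], dim=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion: vs_t - V(s_t) = delta_t + gamma_t * c_t * (vs_{t+1} - V(s_{t+1})).
    acc = torch.zeros_like(bootstrap_value)
    vs_minus_v = []
    for t in reversed(range(rewards.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v.append(acc)
    vs_minus_v = torch.stack(list(reversed(vs_minus_v)), dim=0)
    return vs_minus_v + values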
def learn(flags,
          actor_model,
          model,
          batch,
          initial_agent_state,
          optimizer,
          scheduler,
          stats,
          lock=threading.Lock(),
          envs=None):
    """Performs a learning (optimization) step."""
    with lock:
        # Forward pass with gradients.
        learner_outputs, unused_state = model(batch, initial_agent_state)

        # Take final value function slice for bootstrapping.
        bootstrap_value = learner_outputs["baseline"][-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = {key: tensor[1:] for key, tensor in batch.items()}
        learner_outputs = {
            key: tensor[:-1] for key, tensor in learner_outputs.items()
        }

        # If specified, clip rewards to [-1, 1].
        rewards = batch["reward"]
        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = rewards

        # "~" is element-wise logical NOT, so the discount is 0 at terminal
        # steps and flags.discounting otherwise.
        discounts = (~batch["done"]).float() * flags.discounting

        # Prepare tensors for the multi-task loss.
        task = F.one_hot(batch["task"].long(), flags.num_tasks).float()
        clipped_rewards = clipped_rewards[:, :, None]
        discounts = discounts[:, :, None]

        # PopArt statistics, broadcast over the time and batch dimensions.
        mu = model.baseline.mu[None, None, :]
        sigma = model.baseline.sigma[None, None, :]

        # V-trace targets computed on the unnormalized values; the PopArt
        # normalization is applied to the resulting vs below.
        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=batch["policy_logits"],
            target_policy_logits=learner_outputs["policy_logits"],
            actions=batch["action"],
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs["baseline"],
            bootstrap_value=bootstrap_value,
            normalized_values=learner_outputs["normalized_baseline"],
            mu=mu,
            sigma=sigma)

        # PopArt normalization of the V-trace targets.
        with torch.no_grad():
            normalized_vs = (vtrace_returns.vs - mu) / sigma

        # Policy gradient loss, masked to the current task.
        pg_loss = compute_policy_gradient_loss(
            learner_outputs["policy_logits"],
            batch["action"],
            vtrace_returns.pg_advantages * task,
        )
        # Value function/baseline loss: 1/2 * squared difference between the
        # normalized V-trace targets and the normalized baseline.
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            (normalized_vs - learner_outputs["normalized_baseline"]) * task)
        # Entropy loss encouraging a diverse action distribution.
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs["policy_logits"])

        total_loss = pg_loss + baseline_loss + entropy_loss

        # Backward pass with gradient norm clipping, then optimizer and
        # scheduler steps.
        optimizer.zero_grad()
        total_loss.backward()
        # plot_grad_flow(model.named_parameters(), flags)
        gradient_tracker.process_backward_pass(model.named_parameters())
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        scheduler.step()

        # Update the PopArt parameters, which the optimizer does not take care of.
        if flags.use_popart:
            model.baseline.update_parameters(vtrace_returns.vs, task)

        # Update the actor model with the new parameters.
        actor_model.load_state_dict(model.state_dict())

        # Returns for episodes that finished inside this unroll.
        episode_returns = batch["episode_return"][batch["done"]]
        stats["step"] = stats.get("step", 0) + flags.unroll_length * flags.batch_size
        stats["episode_returns"] = tuple(episode_returns.cpu().numpy())
        stats["mean_episode_return"] = torch.mean(episode_returns).item()
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["mu"] = mu[0, 0, :]
        stats["sigma"] = sigma[0, 0, :]

        if "env_step" not in stats:
            stats["env_step"] = {}
        for task_idx in batch["task"][0].cpu().numpy():
            stats["env_step"][envs[task_idx]] = stats["env_step"].get(
                envs[task_idx], 0) + flags.unroll_length

        return stats
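
# model.baseline.update_parameters is not shown above. The following is a sketch of
# the standard PopArt update (van Hasselt et al., 2016) that such a method would
# implement, under the assumption that the baseline head is a per-task linear layer.
# Class and attribute names here are hypothetical, not the repository's exact code.
import torch
import torch.nn as nn


class PopArtLayer(nn.Module):
    """Per-task value head with PopArt output normalization (sketch)."""

    def __init__(self, input_dim, num_tasks, beta=3e-4):
        super().__init__()
        self.beta = beta
        self.linear = nn.Linear(input_dim, num_tasks)
        self.register_buffer("mu", torch.zeros(num_tasks))
        self.register_buffer("sigma", torch.ones(num_tasks))
        self.register_buffer("nu", torch.ones(num_tasks))  # second moment

    def forward(self, x):
        normalized = self.linear(x)
        # Returns (unnormalized baseline, normalized baseline).
        return normalized * self.sigma + self.mu, normalized

    @torch.no_grad()
    def update_parameters(self, vs, task):
        # vs: (T, B, num_tasks) V-trace targets; task: (T, B, num_tasks) one-hot mask.
        weights = task.sum(dim=(0, 1)).clamp(min=1.0)
        mean = (vs * task).sum(dim=(0, 1)) / weights
        mean_sq = ((vs ** 2) * task).sum(dim=(0, 1)) / weights

        old_mu, old_sigma = self.mu.clone(), self.sigma.clone()
        # "Pop": update the running first and second moments.
        self.mu += self.beta * (mean - self.mu)
        self.nu += self.beta * (mean_sq - self.nu)
        self.sigma.copy_((self.nu - self.mu ** 2).clamp(min=1e-4).sqrt())

        # "Art": rescale weights and bias so the unnormalized predictions are
        # unchanged after the statistics move.
        self.linear.weight.mul_((old_sigma / self.sigma)[:, None])
        self.linear.bias.copy_(
            (old_sigma * self.linear.bias + old_mu - self.mu) / self.sigma)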
def learn(
    flags,
    learner_queue,
    d_queue,
    model,
    actor_model,
    D,
    optimizer,
    scheduler,
    stats,
    plogger,
    lock=threading.Lock(),
):
    for tensors in learner_queue:
        tensors = nest.map(
            lambda t: t.to(flags.learner_device, non_blocking=True), tensors)

        batch, agent_state, image = tensors
        env_outputs, actor_outputs, noise = batch
        batch = (env_outputs, actor_outputs)
        frame, reward, done, *_ = env_outputs
        d_queue.put((frame, image.squeeze(0)))

        lock.acquire()  # Only one thread learning at a time.
        optimizer.zero_grad()
        actor_outputs = AgentOutput._make(actor_outputs)

        if flags.condition:
            condition = image
        else:
            condition = None

        model = model.train()
        learner_outputs, agent_state = model(
            dict(
                obs=frame,
                condition=condition,
                action=actor_outputs.action,
                noise=noise,
                done=done,
            ),
            agent_state,
        )

        if flags.use_tca:
            frame = torch.flatten(frame, 0, 1)
            if flags.condition:
                condition = torch.flatten(condition, 0, 1)
        else:
            frame = frame[-1]
            if flags.condition:
                condition = condition[-1]

        D = D.eval()
        with torch.no_grad():
            if flags.condition:
                p = D(frame, condition).view(-1, flags.batch_size)
            else:
                p = D(frame).view(-1, flags.batch_size)

        if flags.use_tca:
            d_reward = p[1:] - p[:-1]
            reward = reward[1:] + d_reward
        else:
            reward[-1] = reward[-1] + p
            reward = reward[1:]

        # Empty condition.
        condition = None

        # Take final value function slice for bootstrapping.
        learner_outputs = AgentOutput._make(learner_outputs)
        bootstrap_value = learner_outputs.baseline[-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = nest.map(lambda t: t[1:], batch)
        learner_outputs = nest.map(lambda t: t[:-1], learner_outputs)

        # Turn into namedtuples again.
        env_outputs, actor_outputs = batch
        env_outputs = EnvOutput._make(env_outputs)
        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs = AgentOutput._make(learner_outputs)

        discounts = (~env_outputs.done).float() * flags.discounting

        action = actor_outputs.action.unbind(dim=2)

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=actor_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=action,
            discounts=discounts,
            rewards=reward,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value,
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs.policy_logits,
            action,
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs.baseline)
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs.policy_logits)
        total_loss = pg_loss + baseline_loss + entropy_loss

        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        scheduler.step()

        actor_model.load_state_dict(model.state_dict())

        stats["step"] = stats.get("step", 0) + flags.unroll_length * flags.batch_size
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["final_reward"] = reward[-1].mean().item()
        stats["episode_reward"] = reward.mean(dim=1).sum().item()
        stats["learner_queue_size"] = learner_queue.size()

        if flags.condition:
            if flags.use_tca:
                _, C, H, W = frame.shape
                frame = frame.view(flags.unroll_length, flags.batch_size, C, H, W)
                frame = frame[-1]
            stats["l2_loss"] = F.mse_loss(frame, image.squeeze(0)).item()

        plogger.log(stats)

        lock.release()
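
# The two reward branches above differ in where the discriminator signal enters:
# with flags.use_tca each step receives the change in discriminator score,
# p[t+1] - p[t], whereas without it the whole score is added to the final reward
# only. A tiny self-contained illustration with hypothetical numbers (not data
# from this codebase):
import torch

T, B = 4, 1
env_reward = torch.zeros(T + 1, B)                       # environment rewards
p = torch.tensor([[0.1], [0.3], [0.2], [0.6], [0.9]])    # D score per frame, (T+1, B)

# TCA: per-step shaped reward is the increase in discriminator score.
tca_reward = env_reward[1:] + (p[1:] - p[:-1])           # shape (T, B)

# No TCA: only the last frame's score is added, then the first step is dropped.
final_only = env_reward.clone()
final_only[-1] = final_only[-1] + p[-1]
final_only = final_only[1:]                              # shape (T, B)

# The TCA rewards telescope: their sum is p[-1] - p[0], so the total shaped
# return matches the final score up to the initial one, but credit is spread
# over the unroll instead of arriving only at the last step.
print(tca_reward.squeeze(-1), final_only.squeeze(-1))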
def learn(
    flags,
    actor_model,
    model,
    batch,
    initial_agent_state,
    optimizer,
    scheduler,
    lock=threading.Lock(),  # noqa: B008
):
    """Performs a learning (optimization) step."""
    with lock:
        learner_outputs, unused_state = model(batch, initial_agent_state)

        # Take final value function slice for bootstrapping.
        bootstrap_value = learner_outputs["baseline"][-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = {key: tensor[1:] for key, tensor in batch.items()}
        learner_outputs = {key: tensor[:-1] for key, tensor in learner_outputs.items()}

        rewards = batch["reward"]
        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = rewards

        discounts = (~batch["done"]).float() * flags.discounting

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=batch["policy_logits"],
            target_policy_logits=learner_outputs["policy_logits"],
            actions=batch["action"],
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs["baseline"],
            bootstrap_value=bootstrap_value,
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs["policy_logits"],
            batch["action"],
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs["baseline"]
        )
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs["policy_logits"]
        )
        total_loss = pg_loss + baseline_loss + entropy_loss

        episode_returns = batch["episode_return"][batch["done"]]
        stats = {
            "episode_returns": tuple(episode_returns.cpu().numpy()),
            "mean_episode_return": torch.mean(episode_returns).item(),
            "total_loss": total_loss.item(),
            "pg_loss": pg_loss.item(),
            "baseline_loss": baseline_loss.item(),
            "entropy_loss": entropy_loss.item(),
        }

        optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        scheduler.step()

        actor_model.load_state_dict(model.state_dict())
        return stats
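
# For context, a learn() step of this shape is typically driven by a batching loop
# on the learner side. The sketch below shows such a driver under stated assumptions:
# get_batch, plogger, and max_steps are hypothetical names, not the repository's API.
import threading


def batch_and_learn(flags, model, actor_model, optimizer, scheduler,
                    get_batch, plogger, max_steps):
    """Repeatedly pulls a batch of unrolls and runs a learn() step (sketch)."""
    step = 0
    lock = threading.Lock()
    while step < max_steps:
        # Hypothetical: get_batch() returns (T+1, B)-shaped tensors plus agent state.
        batch, initial_agent_state = get_batch()
        stats = learn(flags, actor_model, model, batch, initial_agent_state,
                      optimizer, scheduler, lock=lock)
        step += flags.unroll_length * flags.batch_size
        plogger.log({"step": step, **stats})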
def learn(
    flags,
    learner_queue,
    model,
    actor_model,
    D,
    optimizer,
    scheduler,
    stats,
    plogger,
    lock=threading.Lock(),
):
    for tensors in learner_queue:
        new_obs = tensors[1]
        tensors = edit_tuple(tensors, 1, new_obs["canvas"])
        tensors = nest.map(
            lambda t: t.to(flags.learner_device, non_blocking=True), tensors)

        batch, new_frame, initial_agent_state = tensors
        env_outputs, actor_outputs = batch
        obs, reward, done, step, _ = env_outputs

        lock.acquire()  # Only one thread learning at a time.

        if flags.use_tca:
            discriminator_reward = tca_reward_function(flags, obs, new_frame, D)
            reward = env_outputs[1]
            env_outputs = edit_tuple(env_outputs, 1, reward + discriminator_reward)
            batch = edit_tuple(batch, 0, env_outputs)
        else:
            if done.any().item():
                discriminator_reward = reward_function(flags, done, new_frame, D)
                reward = env_outputs[1]
                env_outputs = edit_tuple(env_outputs, 1,
                                         reward + discriminator_reward)
                batch = edit_tuple(batch, 0, env_outputs)

        optimizer.zero_grad()

        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs, agent_state = model(obs, done, initial_agent_state)

        # Take final value function slice for bootstrapping.
        learner_outputs = AgentOutput._make(learner_outputs)
        bootstrap_value = learner_outputs.baseline[-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = nest.map(lambda t: t[1:], batch)
        learner_outputs = nest.map(lambda t: t[:-1], learner_outputs)

        # Turn into namedtuples again.
        env_outputs, actor_outputs = batch
        env_outputs = EnvOutput._make(env_outputs)
        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs = AgentOutput._make(learner_outputs)

        discounts = (~env_outputs.done).float() * flags.discounting

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=actor_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=actor_outputs.action,
            discounts=discounts,
            rewards=env_outputs.reward,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value,
        )
        vtrace_returns = vtrace.VTraceFromLogitsReturns._make(vtrace_returns)

        pg_loss = compute_policy_gradient_loss(
            learner_outputs.policy_logits,
            actor_outputs.action,
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs.baseline)
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs.policy_logits)
        total_loss = pg_loss + baseline_loss + entropy_loss

        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        scheduler.step()

        actor_model.load_state_dict(model.state_dict())

        episode_returns = env_outputs.episode_return[env_outputs.done]
        stats["step"] = stats.get("step", 0) + flags.unroll_length * flags.batch_size
        stats["episode_returns"] = tuple(episode_returns.cpu().numpy())
        stats["mean_environment_return"] = episode_returns.mean().item()
        stats["mean_discriminator_return"] = discriminator_reward.mean().item()
        stats["mean_episode_return"] = (stats["mean_environment_return"]
                                        + stats["mean_discriminator_return"])
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["learner_queue_size"] = learner_queue.size()

        if flags.condition and new_frame.size() != 0:
            stats["l2_loss"] = F.mse_loss(
                *new_frame.split(split_size=new_frame.shape[1] // 2, dim=1)).item()

        plogger.log(stats)

        lock.release()
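
# edit_tuple is a small helper used above to replace one field of a (named)tuple;
# it is not shown in this section. A plausible minimal definition, offered as an
# assumption rather than the repository's exact code:
def edit_tuple(tup, index, value):
    """Returns a copy of `tup` with the element at `index` replaced by `value`."""
    items = list(tup)
    items[index] = value
    # Preserve namedtuple types such as EnvOutput; fall back to a plain tuple.
    return type(tup)._make(items) if hasattr(tup, "_fields") else tuple(items)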