loss /= torch.tensor(num, device=device, dtype=torch.float32)
policy_candidate_optimizer.zero_grad()
loss.backward(retain_graph=True)
nn.utils.clip_grad_norm_(policy_candidate.parameters(), 1.)  # Clip gradients to a max norm of 1
policy_candidate_optimizer.step()

policy_net = copy.deepcopy(policy_candidate).to(device)

# Optimize value net for a given number of steps
# Set value nets to training mode
value_net_in.train()
value_net_ex.train()

# Use discounted reward-to-go targets to fit the value nets
ex_rtg = memory.extrinsic_discounted_rtg(batch_size)
in_rtg = memory.intrinsic_rtg(batch_size)
ex_val_est = []
in_val_est = []

print("\n\n\tUpdate Value Net for %d steps" % num_vn_iter)
for i in tqdm(range(num_vn_iter)):  # Use tqdm to show a progress bar
    for j in range(batch_size):
        # Append the trajectory index j as an extra input feature
        in_val_traj = value_net_in(
            torch.cat([
                states[j],
                torch.ones((states[j].shape[0], 1),
                           dtype=torch.float32,
                           device=device) * j
            ],
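# --- Sketch: value-net regression objective ------------------------------
# A minimal, self-contained illustration of the fit the loops above work
# toward: regressing V(s) onto the (discounted) reward-to-go with an MSE
# loss. The names value_net, value_optimizer, states_batch and rtg_targets
# are placeholders for this sketch only; the original code uses
# value_net_in / value_net_ex and per-trajectory batches instead.
import torch
import torch.nn.functional as F

def fit_value_step(value_net, value_optimizer, states_batch, rtg_targets):
    """One gradient step pulling value estimates toward reward-to-go targets."""
    value_est = value_net(states_batch).squeeze(-1)   # shape: (T,)
    value_loss = F.mse_loss(value_est, rtg_targets)   # squared-error fit
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()
    return value_loss.item()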
    finished_rendering_this_epoch = True
    break

###################################################################
# Optimize the model for a given number of steps
# Make a candidate to update parameters
actor_critic_candidate = copy.deepcopy(actor_critic).to(device)
actor_critic_candidate.train()

# Initialize the optimizer
candidate_optimizer = optim.Adam(actor_critic_candidate.parameters())

# Get batch data
ex_rtg = memory.extrinsic_discounted_rtg(batch_size)
ex_gae = memory.extrinsic_gae(batch_size)
old_act_log_prob = memory.act_log_prob(batch_size)
states = memory.states(batch_size)
actions = memory.actions(batch_size)

# Proximal Policy Optimization - calculate joint loss for both actor and critic network
loss = 0
critic_loss_total = 0
print("\n\n\tUpdate Actor Critic for %d steps:" % num_updates_per_epoch)
for i in tqdm(range(num_updates_per_epoch)):  # Use tqdm to show a progress bar
    num = 0
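# --- Sketch: PPO clipped surrogate loss ----------------------------------
# A minimal illustration of the clipped surrogate term the update loop above
# is expected to accumulate for each trajectory. new_log_prob, old_log_prob,
# advantage and clip_eps are illustrative names/values for this sketch, not
# taken from the original code.
import torch

def ppo_clip_loss(new_log_prob, old_log_prob, advantage, clip_eps=0.2):
    """Negative clipped surrogate objective, averaged over time steps."""
    ratio = torch.exp(new_log_prob - old_log_prob)    # pi_new / pi_old
    surr_unclipped = ratio * advantage
    surr_clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    return -torch.min(surr_unclipped, surr_clipped).mean()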