def train_trpo():
    R_avg = []
    R_max = []
    R_min = []
    for i_iter in range(args.max_iter_num):
        # Generate multiple trajectories that reach the minimum batch_size.
        batch, log = agent.collect_samples(args.min_batch_size)
        t0 = time.time()
        sampled_states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        sampled_actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        terminals = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(sampled_states)
        advantages, returns = estimate_advantages(rewards, terminals, values, args.gamma, args.tau, device)
        trpo_step(policy_net, value_net, sampled_states, sampled_actions, returns, advantages,
                  args.max_kl, args.damping, args.l2_reg)
        t1 = time.time()

        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

        if args.save_model_interval > 0 and (i_iter + 1) % args.save_model_interval == 0:
            to_device(torch.device('cpu'), policy_net, value_net)
            pickle.dump((policy_net, value_net, running_state),
                        open(os.path.join(assets_dir(),
                                          'learned_models/expert-policies/{}_trpo.p'.format(args.env_name)), 'wb'))
            to_device(device, policy_net, value_net)
        torch.cuda.empty_cache()
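# All of the update steps in this file call estimate_advantages from
# https://github.com/Khrylx/PyTorch-RL, which is not defined here. The sketch below is a minimal
# re-implementation of Generalised Advantage Estimation (GAE) with the same signature, included
# for reference only; it is an assumption about that helper, not a copy of it.
def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    rewards, masks, values = rewards.cpu(), masks.cpu(), values.cpu()
    advantages = torch.zeros(rewards.size(0), 1)
    returns = torch.zeros(rewards.size(0), 1)
    prev_value, prev_advantage, prev_return = 0, 0, 0
    for t in reversed(range(rewards.size(0))):
        # masks[t] == 0 marks the end of an episode, so bootstrapping is cut there.
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        returns[t] = rewards[t] + gamma * prev_return * masks[t]
        prev_value, prev_advantage, prev_return = values[t, 0], advantages[t, 0], returns[t, 0]
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)  # Normalise advantages
    return advantages.to(device), returns.to(device)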
def airl_step(batch, i_iter):
    to_device(device, policy_net, value_net, discriminator)
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
    X = torch.cat([states, actions], 1).to(dtype).to(device)  # Concatenate (s, a) pairs of the agent
    Y = torch.from_numpy(expert_traj).to(dtype).to(device)    # Expert (s, a) pairs

    rewards = []
    rs = discriminator(X).detach().clone()
    for r in rs:  # AIRL reward: log D - log(1 - D)
        rewards.append(math.log(r.item()) - math.log(1 - r.item()))
    rewards = torch.tensor(rewards)
    rewards = torch.clamp(rewards, max=10, min=-10)
    survival_bonus = 11  # Adding a survival bonus improved performance on environments where the agent needs to stay alive
    rewards = rewards + survival_bonus
    print(rewards.mean())
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

    sampled_episodes = []
    epis = []
    for pair in range(len(masks)):  # Split into episodes (for evaluation tracking only)
        epis.append(X[pair].cpu().numpy())
        if masks[pair] == 0:
            sampled_episodes.append(epis)
            epis = []

    batch_size = args.batch
    for ep in range(1):
        permutation = torch.randperm(X.size()[0])
        for i in range(0, X.size()[0], batch_size):
            indices = permutation[i:i + batch_size]
            batch_x = X[indices, ::]
            learner_samples_disc = discriminator(batch_x)
            expert_samples_disc = discriminator(Y)
            optimizer_discrim.zero_grad()
            discrim_loss = discrim_criterion(learner_samples_disc, zeros((batch_x.shape[0], 1), device=device)) + \
                discrim_criterion(expert_samples_disc, ones((expert_traj.shape[0], 1), device=device))
            discrim_loss.backward()
            optimizer_discrim.step()

    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)  # Update the policy
    return len(sampled_episodes)
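# airl_step and gail_step assume a discriminator over concatenated (state, action) vectors with a
# sigmoid output, a binary cross-entropy criterion (discrim_criterion) and an optimizer
# (optimizer_discrim), none of which are defined in this file. The snippet below is one plausible
# setup; the architecture, hidden_dim and learning rate are illustrative assumptions rather than
# the values used in the experiments.
import torch.nn as nn

class DiscriminatorSketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1), nn.Sigmoid(),  # Output is D(s, a) in (0, 1)
        )

    def forward(self, state_action):
        return self.net(state_action)

# Hypothetical wiring (state_dim and action_dim come from the environment):
# discriminator = DiscriminatorSketch(state_dim, action_dim).to(device)
# discrim_criterion = nn.BCELoss()
# optimizer_discrim = torch.optim.Adam(discriminator.parameters(), lr=3e-4)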
def gail_step(batch, i_iter):
    to_device(device, policy_net, value_net, discriminator)
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    with torch.no_grad():
        values = value_net(states)
    X = torch.cat([states, actions], 1).to(dtype).to(device)  # Concatenate (s, a) pairs of the agent
    Y = torch.from_numpy(expert_traj).to(dtype).to(device)    # Expert (s, a) pairs

    rewards = []
    rs = discriminator(X).detach().clone()
    for r in rs:  # GAIL reward: -log D
        rewards.append(-math.log(r.item()))
    rewards = torch.tensor(rewards)
    advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

    sampled_episodes = []
    epis = []
    for pair in range(len(masks)):  # Split into episodes (for evaluation only)
        epis.append(X[pair].cpu().numpy())
        if masks[pair] == 0:
            sampled_episodes.append(epis)
            epis = []

    batch_size = args.batch
    for ep in range(1):
        permutation = torch.randperm(X.size()[0])
        for i in range(0, X.size()[0], batch_size):
            indices = permutation[i:i + batch_size]
            batch_x = X[indices, ::]
            learner_samples_disc = discriminator(batch_x)
            expert_samples_disc = discriminator(Y)
            optimizer_discrim.zero_grad()
            discrim_loss = discrim_criterion(learner_samples_disc, ones((batch_x.shape[0], 1), device=device)) + \
                discrim_criterion(expert_samples_disc, zeros((expert_traj.shape[0], 1), device=device))
            discrim_loss.backward()
            optimizer_discrim.step()

    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)  # Update the policy (TRPO from: https://github.com/Khrylx/PyTorch-RL)
    return len(sampled_episodes)
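# The main loop that drives the imitation-learning steps is not shown in this section. Based on
# train_trpo above, it presumably collects at least args.min_batch_size transitions per iteration
# and hands the batch to one of the step functions. The loop below is an illustrative sketch of
# that pattern (train_imitation_sketch is a hypothetical name), not the exact script used in the
# experiments.
def train_imitation_sketch(step_fn):
    for i_iter in range(args.max_iter_num):
        batch, log = agent.collect_samples(args.min_batch_size)  # Roll out the current policy
        num_episodes = step_fn(batch, i_iter)                    # e.g. gail_step or airl_step
        if i_iter % args.log_interval == 0:
            print('{}\tepisodes {}\tR_avg {:.2f}'.format(i_iter, num_episodes, log['avg_reward']))
        torch.cuda.empty_cache()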
def sil_step(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    next_states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)  # Use for state/next-state distribution matching
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    to_device(device, policy_net, value_net, critic_net)
    X = torch.cat([states, actions], 1).to(dtype).to(device)  # Concatenate (s, a) pairs of the agent
    with torch.no_grad():
        values = value_net(states)

    for _ in range(1):
        sampled_episodes = []
        epis = []
        for pair in range(len(masks)):  # Split into episodes for random matching (corresponds to Step 3 of Algorithm 1)
            epis.append(X[pair].cpu().numpy())
            if masks[pair] == 0:
                sampled_episodes.append(epis)
                epis = []

        total_wasserstein = 0    # Accumulates the Wasserstein distances over the batch
        rewards = []             # Logs rewards for the TRPO update
        min_wasserstein = 10e10  # Used for logging at the command line
        max_wasserstein = 0      # Used for logging at the command line
        best_trajectory = None   # Used for logging at the command line
        worst_trajectory = None  # Used for logging at the command line
        index = 0                # Used for logging at the command line
        best_idx = 0             # Used for logging at the command line
        worst_idx = 0            # Used for logging at the command line
        per_trajectory_dis = []  # Used for logging at the command line
        cost_loss = []
        num_of_samples = len(sampled_episodes) - 1
        threshold = num_of_samples - 3
        episodic_eval_sinkhorn = []

        for trajectory in sampled_episodes:
            X = torch.tensor(trajectory).to(dtype).to(device)  # Convert trajectory to a tensor
            sample_traj_index = random.randint(0, (args.dataset_size - 1))
            Y = torch.from_numpy(expert_traj[sample_traj_index]).to(dtype).to(device)  # Randomly match to an expert trajectory (Step 3 of Algorithm 1)
            cost_matrix = cosine_critic(X, Y, critic_net)  # Cost matrix for the samples, computed with the critic network
            transport_plan = optimal_transport_plan(X, Y, cost_matrix, method='sinkhorn_gpu')  # Optimal coupling
            per_sample_costs = torch.diag(torch.mm(transport_plan, cost_matrix.T))  # Diagonal of M C^T, where M is the optimal transport plan and C the cost matrix
            distance = torch.sum(per_sample_costs)  # Wasserstein distance as the sum of the diagonal, i.e. W = Trace[M C^T]
            wasserstein_distance = -(distance)  # Negate the distance so that gradient descent maximises it when training the adversary
            per_trajectory_dis.append(distance.detach().cpu().numpy())  # Keep track of all Wasserstein distances in one sample

            # ========= FOR EVALUATION ONLY ========= #
            if args.log_actual_sinkhorn:
                evaluation_cost_matrix = cosine_distance(X, Y)
                evaluation_transport_plan = optimal_transport_plan(X, Y, evaluation_cost_matrix, method='sinkhorn_gpu')
                eval_wasserstein_distance = torch.sum(
                    torch.diag(torch.mm(evaluation_transport_plan, evaluation_cost_matrix.T)))
                episodic_eval_sinkhorn.append(eval_wasserstein_distance.item())
            # ======================================= #

            if distance < min_wasserstein and index != (len(sampled_episodes)):  # Track the best trajectory by Wasserstein distance
                min_wasserstein = distance
                best_trajectory = X
                best_idx = index
            if distance > max_wasserstein and index != (len(sampled_episodes)):  # Track the worst trajectory by Wasserstein distance
                max_wasserstein = distance
                worst_trajectory = X
                worst_idx = index
            index += 1

            counter = 0
            survival_bonus = 4 / X.shape[0]
            for per_sample_cost in per_sample_costs:
                with torch.no_grad():
                    temp_r = -2 * per_sample_cost + survival_bonus
                    temp_r.unsqueeze_(0)
                    temp_r = running_reward(temp_r.cpu())
                    rewards.append(temp_r)
                counter += 1
            total_wasserstein += distance
            torch.cuda.empty_cache()

        total_wasserstein = -total_wasserstein / num_of_samples
        optimizer_ot.zero_grad()
        total_wasserstein.backward()  # Only backpropagates through the critic network
        optimizer_ot.step()
        # args.critic_lr *= 0.992  # Decaying the learning rate may help stabilise training
        # for param_group in optimizer_ot.param_groups:
        #     param_group['lr'] = args.critic_lr

    with torch.no_grad():
        rewards = torch.tensor(rewards)
        advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)  # Advantages for TRPO
    torch.cuda.empty_cache()
    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)  # Update the policy (TRPO from: https://github.com/Khrylx/PyTorch-RL)
    return ((total_wasserstein**2)**(1 / 2), episodic_eval_sinkhorn, len(sampled_episodes),
            min_wasserstein, best_trajectory, best_idx,
            max_wasserstein, worst_trajectory, worst_idx, per_trajectory_dis)
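# optimal_transport_plan is also external to this file. The sketch below shows one standard way to
# compute an entropy-regularised coupling between two empirical measures via Sinkhorn iterations;
# the regularisation strength and iteration count are illustrative assumptions, and the original
# 'sinkhorn_gpu' method may differ in its exact implementation.
def optimal_transport_plan_sketch(X, Y, cost_matrix, reg=0.01, num_iters=100):
    n, m = cost_matrix.shape
    a = torch.full((n,), 1.0 / n, dtype=cost_matrix.dtype, device=cost_matrix.device)  # Uniform weights on agent samples
    b = torch.full((m,), 1.0 / m, dtype=cost_matrix.dtype, device=cost_matrix.device)  # Uniform weights on expert samples
    K = torch.exp(-cost_matrix / reg)  # Gibbs kernel
    u = torch.ones_like(a)
    for _ in range(num_iters):         # Alternating scaling updates
        v = b / (K.t() @ u + 1e-8)
        u = a / (K @ v + 1e-8)
    return u.unsqueeze(1) * K * v.unsqueeze(0)  # Transport plan diag(u) K diag(v)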
def sil_step(batch, i_iter):
    # Variant of sil_step that uses a fixed cosine transport cost instead of the learned critic network.
    # Get the (s, a, r) samples of the agent's interaction with the environment; this is what
    # agent.collect_samples returns in the main method.
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    next_states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
    to_device(device, policy_net, value_net, critic_net)
    X = torch.cat([states, actions], 1).to(dtype).to(device)  # Concatenate (s, a) pairs of the agent
    with torch.no_grad():
        values = value_net(states)

    for _ in range(1):
        sampled_episodes = []
        epis = []
        for pair in range(len(masks)):  # Split into episodes
            epis.append(X[pair].cpu().numpy())
            if masks[pair] == 0:
                sampled_episodes.append(epis)
                epis = []

        total_wasserstein = 0    # Accumulates the Wasserstein distances over the batch
        rewards = []             # Logs rewards for the TRPO update
        min_wasserstein = 10e10  # Used for logging at the command line
        max_wasserstein = 0      # Used for logging at the command line
        best_trajectory = None   # Used for logging at the command line
        worst_trajectory = None  # Used for logging at the command line
        index = 0                # Used for logging at the command line
        best_idx = 0             # Used for logging at the command line
        worst_idx = 0            # Used for logging at the command line
        per_trajectory_dis = []  # Used for logging at the command line
        cost_loss = []
        num_of_samples = len(sampled_episodes) - 1
        threshold = num_of_samples - 3
        episodic_eval_sinkhorn = []

        for trajectory in sampled_episodes:
            X = torch.tensor(trajectory).to(dtype).to(device)  # Convert trajectory to a tensor
            sample_traj_index = random.randint(0, (args.dataset_size - 1))
            Y = torch.from_numpy(expert_traj[sample_traj_index]).to(dtype).to(device)  # Comment this out if you do not want to use expert trajectories; use the line for direct expert feedback instead
            cost_matrix = cosine_distance(X, Y)  # Cost matrix for the samples, using a fixed cosine transport cost
            transport_plan = optimal_transport_plan(X, Y, cost_matrix, method='sinkhorn_gpu')  # Optimal coupling
            per_sample_costs = torch.diag(torch.mm(transport_plan, cost_matrix.T))  # Diagonal of M C^T, where M is the optimal transport plan and C the cost matrix
            distance = torch.sum(per_sample_costs)  # Wasserstein distance as the sum of the diagonal, i.e. W = Trace[M C^T]
            wasserstein_distance = -(distance)  # Negate the distance so that gradient descent maximises it if an adversary is trained
            per_trajectory_dis.append(distance.detach().cpu().numpy())  # Keep track of all Wasserstein distances in one sample
            episodic_eval_sinkhorn.append(distance.item())

            if distance < min_wasserstein and index != (len(sampled_episodes)):  # Track the best trajectory by Wasserstein distance
                min_wasserstein = distance
                best_trajectory = X
                best_idx = index
            if distance > max_wasserstein and index != (len(sampled_episodes)):  # Track the worst trajectory by Wasserstein distance
                max_wasserstein = distance
                worst_trajectory = X
                worst_idx = index
            index += 1

            counter = 0
            survival_bonus = 4 / X.shape[0]
            for per_sample_cost in per_sample_costs:  # Assign rewards
                with torch.no_grad():
                    temp_r = -2 * per_sample_cost + survival_bonus
                    temp_r.unsqueeze_(0)
                    temp_r = running_reward(temp_r.cpu())
                    rewards.append(temp_r)
                counter += 1
            total_wasserstein += distance
            torch.cuda.empty_cache()

    with torch.no_grad():
        rewards = torch.tensor(rewards)
        advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)  # Advantages for TRPO
    torch.cuda.empty_cache()
    trpo_step(policy_net, value_net, states, actions, returns, advantages,
              args.max_kl, args.damping, args.l2_reg)  # Update the policy
    return ((total_wasserstein**2)**(1 / 2), episodic_eval_sinkhorn, len(sampled_episodes),
            min_wasserstein, best_trajectory, best_idx,
            max_wasserstein, worst_trajectory, worst_idx, per_trajectory_dis)
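# cosine_distance and cosine_critic (the fixed and adversarially learned transport costs) are
# defined elsewhere. The sketches below are one plausible reading of those names: a pairwise
# cosine cost between raw (s, a) vectors, and the same cost computed between the critic network's
# embeddings of the samples. Treat both as assumptions, not the exact costs used in the experiments.
def cosine_distance_sketch(X, Y):
    # C[i, j] = 1 - cosine similarity between X[i] and Y[j]
    X_norm = X / (X.norm(dim=1, keepdim=True) + 1e-8)
    Y_norm = Y / (Y.norm(dim=1, keepdim=True) + 1e-8)
    return 1.0 - X_norm @ Y_norm.t()

def cosine_critic_sketch(X, Y, critic_net):
    # Same pairwise cosine cost, but measured in the critic's feature space,
    # so that optimizer_ot can shape the transport cost adversarially.
    return cosine_distance_sketch(critic_net(X), critic_net(Y))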