if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env_factory, policy_net, device, running_state=running_state,
              render=args.render, num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)


"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20, 20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std,
                            hidden_size=(3, 3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 64

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state,
              render=args.render, num_threads=args.num_threads)


def update_params(batch, i_iter):
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
if args.model_path is None:
    policy_mgr = DiscretePolicy(state_dim, 4)
    policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

# optimization epoch number and batch size for PPO
optim_epochs = 10
optim_batch_size = 50

"""create agent"""
agent = Agent(env, policy_mgr, policy_wrk, device, running_state=running_state,
              render=args.render, num_threads=args.num_threads)
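# The update_params routines in the files below repeatedly call
# estimate_advantages(rewards, masks, values, gamma, tau, device). That helper is not shown
# in these excerpts; the sketch below is a generic GAE(lambda) computation written against
# the same call signature. It is an assumption about the actual implementation, included
# only to make the rewards/masks/values data flow concrete (the function name
# estimate_advantages_sketch is hypothetical).

import torch


def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    """GAE with lambda=tau over a flat rollout; masks are 0 where an episode ends."""
    rewards, masks, values = rewards.view(-1), masks.view(-1), values.view(-1)
    advantages = torch.zeros_like(rewards)
    prev_value = rewards.new_zeros(())
    prev_advantage = rewards.new_zeros(())
    for t in reversed(range(rewards.size(0))):
        # one-step TD error, cut at episode boundaries by the mask
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value, prev_advantage = values[t], advantages[t]
    returns = advantages + values
    # normalizing advantages is a common choice; the repo's helper may or may not do this
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages.to(device), returns.to(device)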
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                    hidden_size=(64, 32), activation='relu')
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std,
                            hidden_size=(64, 32), activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')

    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16), activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ),
                                        encode_size=64, activation='relu', slope=0.1,
                                        dropout=False, dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        # discrim_net = KernelNet(state_dim + action_dim, state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ),
                                        encode_size=noise_dim, activation='relu', slope=0.1,
                                        dropout=False, dprob=0.2)
        kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim,
                              activation='relu', slope=0.1, dropout=False, dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones,
                                       gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16), activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
    scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_decay)

    if args.WGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                    # return -discrim_net(state_action).sum().item()

        learned_reward = ExpertReward()
    elif args.EBGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:
        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()

    """create agent"""
    agent = Agent(env, policy_net, device, custom_reward=learned_reward,
                  running_state=None, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)

        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            # dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(expert_traj[exp_idx, :]).to(dtype).to(device)
            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # tbd, no discriminator learning
                pass
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()

            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                # mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake, args.sigma_list)  # tbd
                # rewards = K[0] + K[1] - 2 * K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD
                # prep for generator
                advantages, returns = estimate_advantages(rewards, masks, values,
                                                          args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  # + args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD
                # prep for generator
                advantages, returns = estimate_advantages(rewards, masks, values,
                                                          args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                    discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            if not (args.EBGAN or args.GMMIL or args.GEOMGAN):
                # take a discriminator gradient step for the variants whose loss is computed
                # from discriminator outputs above
                discrim_loss.backward()
                optimizer_discrim.step()
            if args.GEOMGAN:
                optimizer_kernel.step()

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), \
                advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * args.ppo_batch_size,
                            min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                         states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b,
                         args.clip_epsilon, args.l2_reg)

        return rewards

    if args.GEOMGAN:
        return policy_net, value_net, discrim_net, kernel_net, \
            optimizer_policy, optimizer_value, optimizer_discrim, optimizer_kernel, \
            agent, update_params, \
            scheduler_policy, scheduler_value, scheduler_discrim, scheduler_kernel
    else:
        return policy_net, value_net, discrim_net, \
            optimizer_policy, optimizer_value, optimizer_discrim, agent, update_params, \
            scheduler_policy, scheduler_value, scheduler_discrim
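# The GMMIL/GEOMGAN branches above turn kernel statistics into per-sample rewards via
# rewards = K[1] - K[2], where mix_rbf_mmd2 is assumed to return the squared MMD together
# with kernel terms. That helper is not shown in this excerpt; the sketch below is a
# minimal, self-contained RBF-mixture MMD^2 estimator that illustrates the idea
# (rbf_mmd2_sketch is a hypothetical name; the repo's mix_rbf_mmd2 may differ in its
# exact return values and sign conventions).

import torch


def rbf_mmd2_sketch(expert_sa, gen_sa, sigma_list=(1.0, 2.0, 4.0)):
    """Biased MMD^2 between expert and generated state-action batches under a
    mixture of RBF kernels, plus a per-generated-sample reward signal."""
    def mix_kernel(a, b):
        d2 = torch.cdist(a, b, p=2) ** 2        # pairwise squared distances
        k = torch.zeros_like(d2)
        for sigma in sigma_list:                # sum the RBF kernels in the mixture
            k = k + torch.exp(-d2 / (2.0 * sigma ** 2))
        return k

    k_xx = mix_kernel(expert_sa, expert_sa)
    k_yy = mix_kernel(gen_sa, gen_sa)
    k_xy = mix_kernel(expert_sa, gen_sa)
    mmd2 = k_xx.mean() + k_yy.mean() - 2.0 * k_xy.mean()
    # per generated sample: mean similarity to the expert batch vs. to the generated batch
    kxy_per_sample = k_xy.mean(dim=0)           # shape [num_generated]
    kyy_per_sample = k_yy.mean(dim=0)           # shape [num_generated]
    reward = kxy_per_sample - kyy_per_sample    # larger when a sample looks more expert-like
    return mmd2, reward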
class BC(object):
    """
    A vanilla Behavior Cloning model.
    """

    def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
        self.device = args.device
        self.config = args
        self.is_dict_action = is_dict_action
        self.is_atari = is_atari
        self.state_dim = state_dim

        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.actor_loss = nn.CrossEntropyLoss() if self.is_dict_action else nn.MSELoss()

    def set_expert(self, expert_traj, num_trajs):
        """
        Set the expert trajectories.
        :param expert_traj: pool of expert trajectories
        :param num_trajs: number of trajectories to use
        :return:
        """
        # self.expert_traj_pool = expert_traj
        # self.expert_traj = np.vstack(expert_traj[:num_trajs])

    def set_expert2(self, expert_traj):
        """
        Set the expert trajectories.
        :param expert_traj: expert state-action data
        :return:
        """
        self.expert_traj = expert_traj

    def train(self):
        """
        Run one behavior-cloning update on the stored expert state-action array.
        :return: the scalar training loss
        """
        if self.expert_traj is not None:
            expert_traj = self.expert_traj
            expert_state_actions = torch.DoubleTensor(expert_traj)
            expert_states = expert_state_actions[:, :self.state_dim].to(self.device)
            expert_actions = expert_state_actions[:, self.state_dim:].to(self.device)

            predicted_actions = self.actor(expert_states)[0]
            self.actor_optimizer.zero_grad()
            loss = self.actor_loss(predicted_actions, expert_actions)
            loss.backward()
            self.actor_optimizer.step()
            return loss.to('cpu').detach().numpy()

    def train2(self):
        """
        Run one behavior-cloning update on a stored expert batch that exposes
        .state and .action fields.
        :return: the scalar training loss
        """
        if self.expert_traj is not None:
            expert_traj = self.expert_traj
            # expert_state_actions = torch.DoubleTensor(expert_traj)
            # expert_states = expert_state_actions[:, :self.state_dim].to(self.device)
            # expert_actions = expert_state_actions[:, self.state_dim:].to(self.device)
            expert_states = torch.DoubleTensor(self.expert_traj.state)
            expert_actions = torch.DoubleTensor(self.expert_traj.action)

            predicted_actions = self.actor(expert_states)[0]
            self.actor_optimizer.zero_grad()
            loss = self.actor_loss(predicted_actions, expert_actions)
            loss.backward()
            self.actor_optimizer.step()
            return loss.to('cpu').detach().numpy()
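# A minimal usage sketch for the BC class above. The args namespace, the dimensions, and
# the expert array are placeholders invented for illustration; the real training script
# builds these from its own CLI configuration and demonstration files. The double-precision
# default is an assumption implied by the DoubleTensor usage in train().

import argparse

import numpy as np
import torch

torch.set_default_dtype(torch.float64)   # train() feeds DoubleTensors to the actor

# hypothetical configuration values
args = argparse.Namespace(device='cpu', log_std=0.0, learning_rate=3e-4)
state_dim, action_dim = 11, 3

# placeholder demonstrations: rows of concatenated [state, action]
expert_data = np.random.randn(1000, state_dim + action_dim)

bc = BC(args, state_dim, action_dim, is_dict_action=False)
bc.set_expert2(expert_data)              # train() expects an [N, state_dim + action_dim] array
for epoch in range(100):
    loss = bc.train()                    # one full-batch behavior-cloning update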
                            hidden_size=(64, 64), log_std=args.log_std)
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
params = list(policy_net.parameters()) + list(value_net.parameters())
unique_optimizer = torch.optim.Adam(params, lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = args.optim_epochs
optim_batch_size = args.optim_batch_size

"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state,
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                    hidden_size=(64, 32), activation='relu')
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std,
                            hidden_size=(64, 32), activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')

    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim,
                                      hidden_size=(32, 16), activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ),
                                        encode_size=64, activation='leakyrelu', slope=0.1,
                                        dropout=True, dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        discrim_net = VAEDiscriminator(state_dim + action_dim, num_outputs=noise_dim,
                                       sigmoid_out=False, sn=True, test=False, w_init=False,
                                       hidden_size_enc=(), hidden_size_dec=(),
                                       encode_size=mid_dim, activation='relu', dropout=False)
        kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim,
                              activation='relu', dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones,
                                       gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim,
                                    hidden_size=(32, 16), activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
    scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones,
                                    gamma=args.lr_kernel_decay)

    if args.AL:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()

        learned_reward = ExpertReward()
    elif args.EBGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:
        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()

    """create agent"""
    agent = Agent(env, policy_net, device, custom_reward=learned_reward,
                  running_state=None, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values,
                                                  args.gamma, args.tau, device)

        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(expert_traj[exp_idx, :]).to(dtype).to(device)
            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.VAKLIL:
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake, mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real, mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()

            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD
                advantages, returns = estimate_advantages(rewards, masks, values,
                                                          args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num,
                                                              noise_dim, kernel_net, cuda,
                                                              args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                # 1e-8: small number for numerical stability
                i_c = 0.2
                bottleneck_loss = torch.mean(
                    (0.5 * torch.sum((torch.cat((e_mu, g_mu), dim=0) ** 2)
                                     + (torch.cat((e_sigma, g_sigma), dim=0) ** 2)
                                     - torch.log((torch.cat((e_sigma, g_sigma), dim=0) ** 2) + 1e-8)
                                     - 1, dim=1))) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (args.lambda_h * penalty)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                    discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()

        if args.VAKLIL:
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim,
                                                   kernel_net, cuda, args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards  # .detach()
                advantages, returns = estimate_advantages(rewards, masks, values,
                                                          args.gamma, args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), \
                advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * args.ppo_batch_size,
                            min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                         states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b,
                         args.clip_epsilon, args.l2_reg)

        return rewards

    if args.VAKLIL:
        return policy_net, value_net, discrim_net, kernel_net, \
            optimizer_policy, optimizer_value, optimizer_discrim, optimizer_kernel, \
            agent, update_params, \
            scheduler_policy, scheduler_value, scheduler_discrim, scheduler_kernel
    else:
        return policy_net, value_net, discrim_net, \
            optimizer_policy, optimizer_value, optimizer_discrim, agent, update_params, \
            scheduler_policy, scheduler_value, scheduler_discrim
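# The bottleneck_loss in the VAKLIL branch above is a variational-bottleneck constraint:
# the mean closed-form KL divergence KL(N(mu, sigma^2) || N(0, I)) over the concatenated
# expert and generated encodings, minus the information budget i_c. The sketch below
# restates that term as standalone functions (the names are hypothetical; the computation
# mirrors the expression used in update_params).

import torch


def gaussian_kl_to_standard_normal(mu, sigma, eps=1e-8):
    """Closed-form KL(N(mu, sigma^2) || N(0, I)) per sample, summed over latent dims."""
    return 0.5 * torch.sum(mu ** 2 + sigma ** 2 - torch.log(sigma ** 2 + eps) - 1, dim=1)


def bottleneck_constraint(e_mu, e_sigma, g_mu, g_sigma, i_c=0.2):
    """Mean KL over expert and generated encodings minus the information budget i_c."""
    mu = torch.cat((e_mu, g_mu), dim=0)
    sigma = torch.cat((e_sigma, g_sigma), dim=0)
    return gaussian_kl_to_standard_normal(mu, sigma).mean() - i_c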
class PPO(object):
    def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
        self.device = args.device
        self.config = args

        if is_atari:
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)

        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.learning_rate)

        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64

    def train(self, batch):
        """
        Train the policy using the given batch.
        :param batch: a batch of transitions with state, action, reward, and mask fields
        :return:
        """
        states = torch.DoubleTensor(np.stack(batch.state)).to(self.device)
        actions = torch.DoubleTensor(np.stack(batch.action)).to(self.device)
        rewards = torch.DoubleTensor(np.stack(batch.reward)).to(self.device)
        masks = torch.DoubleTensor(np.stack(batch.mask)).to(self.device)
        with torch.no_grad():
            values = self.critic(states)
            fixed_log_probs = self.actor.get_log_prob(states, actions)

        # get advantage estimation from the trajectories
        advantages, returns = estimate_advantages(rewards, masks, values, self.config.gamma,
                                                  self.config.tau, self.device)

        # compute minibatch size
        optim_iter_num = int(math.ceil(states.shape[0] / self.optim_batch_size))

        # PPO updates
        for _ in range(self.optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(self.device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), \
                advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * self.optim_batch_size,
                            min((i + 1) * self.optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                self.ppo_step(states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b)

    def ppo_step(self, states, actions, returns, advantages, fixed_log_probs):
        """
        A PPO policy gradient update step.
        :param states:
        :param actions:
        :param returns:
        :param advantages:
        :param fixed_log_probs: log-probabilities of the actions under the pre-update policy
        :return:
        """
        # update critic, for now assume one epoch
        values_pred = self.critic(states)
        value_loss = (values_pred - returns).pow(2).mean()
        # weight decay
        for param in self.critic.parameters():
            value_loss += param.pow(2).sum() * self.config.l2_reg
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # update actor with the clipped PPO surrogate objective
        log_probs = self.actor.get_log_prob(states, actions)
        ratio = torch.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.config.clip_epsilon,
                            1.0 + self.config.clip_epsilon) * advantages
        policy_surr = -torch.min(surr1, surr2).mean()
        self.actor.zero_grad()
        policy_surr.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
        self.actor_optimizer.step()
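# A minimal usage sketch for the PPO class above. The args namespace and the dimensions are
# placeholders invented for illustration, and `agent` stands in for whatever sampler the
# training script uses; collect_samples is an assumed interface that returns a batch with
# .state, .action, .reward, and .mask fields, matching what train() consumes.

import argparse

args = argparse.Namespace(device='cpu', log_std=0.0, learning_rate=3e-4,
                          gamma=0.99, tau=0.95, clip_epsilon=0.2, l2_reg=1e-3)

ppo = PPO(args, state_dim=11, action_dim=3, is_dict_action=False)
for i_iter in range(500):
    # hypothetical sampling call; replace with the repo's actual rollout collection
    batch, log = agent.collect_samples(2048)
    ppo.train(batch)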