import torch
import torch.nn as nn
from torch.optim import Adam

# Project-local dependencies assumed by both classes below:
# Actor, Critic, ReplayMemory, OUNoise, hard_update, soft_update, utils


class Actor_Critic(object):
    def __init__(self, state_dim, action_dim, gamma, tau, buffer_size, is_mem_cuda, out_act):
        self.actor = Actor(state_dim, action_dim, is_evo=False, out_act=out_act)
        self.actor_target = Actor(state_dim, action_dim, is_evo=False, out_act=out_act)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau
        self.loss = nn.MSELoss()
        self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda)
        self.exploration_noise = OUNoise(action_dim)

        hard_update(self.actor_target, self.actor)    # Start the targets with the same weights as the online nets
        hard_update(self.critic_target, self.critic)

    def act(self, state, is_noise):
        state = utils.to_tensor(state).unsqueeze(0)
        action = self.actor.forward(state)
        action = action.detach().numpy().flatten()
        if is_noise:
            action += self.exploration_noise.noise()
        return action

    def train_from_batch(self, batch):
        # Build UVFA inputs by concatenating the environment state with the goal
        env_state_batch = torch.cat(batch.state)
        goal_batch = torch.cat(batch.goal)
        uvfa_states = torch.cat((env_state_batch, goal_batch), dim=1).detach()
        next_env_state_batch = torch.cat(batch.next_state)
        next_uvfa_states = torch.cat((next_env_state_batch, goal_batch), dim=1).detach()
        action_batch = torch.cat(batch.action).detach()
        reward_batch = torch.cat(batch.reward).detach()
        done_batch = torch.cat(batch.done)

        # Load everything to GPU if one is available
        if torch.cuda.is_available():
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.critic.cuda()
            uvfa_states = uvfa_states.cuda()
            next_uvfa_states = next_uvfa_states.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            done_batch = done_batch.cuda()

        # Critic update: regress Q(s, a) onto the bootstrapped TD target
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_uvfa_states)
            next_q = self.critic_target.forward(next_uvfa_states, next_action_batch)
            next_q = next_q * (1 - done_batch.float())  # Done mask: no bootstrapping past terminal states
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(uvfa_states, action_batch)
        critic_loss = self.loss(current_q, target_q)
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        # Actor update: ascend the critic's estimate of Q(s, pi(s))
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(uvfa_states, self.actor.forward(uvfa_states))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)  # Clip the actor's own gradients, not the critic's
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        # Nets back to CPU so the replay memory can stay in CPU memory
        self.actor.cpu()
        self.actor_target.cpu()
        self.critic_target.cpu()
        self.critic.cpu()
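# ---------------------------------------------------------------------------
# A minimal sketch of the hard_update/soft_update helpers and the OUNoise
# exploration process assumed above. The signatures match the call sites in
# this file, but the project's own implementations may differ in detail.
# ---------------------------------------------------------------------------
import numpy as np


def hard_update(target, source):
    # Copy the source network's weights into the target verbatim
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


class OUNoise:
    # Ornstein-Uhlenbeck process, the exploration noise customary in DDPG
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # Mean-reverting random walk: temporally correlated, smoother than i.i.d. Gaussian noise
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state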
class DDPG(object):
    def __init__(self, args):
        self.args = args

        self.actor = Actor(args, init=True)
        self.actor_target = Actor(args, init=True)
        self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target, self.actor)    # Start the targets with the same weights as the online nets
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        if self.args.use_done_mask:
            done_batch = torch.cat(batch.done)

        # Load everything to GPU if not already
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.critic.cuda()
            state_batch = state_batch.cuda()
            next_state_batch = next_state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            if self.args.use_done_mask:
                done_batch = done_batch.cuda()

        # Critic update: regress Q(s, a) onto the bootstrapped TD target
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_state_batch)
            next_q = self.critic_target.forward(next_state_batch, next_action_batch)
            if self.args.use_done_mask:
                next_q = next_q * (1 - done_batch.float())  # Done mask: no bootstrapping past terminal states
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(state_batch, action_batch)
        critic_loss = self.loss(current_q, target_q)
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        # Actor update: ascend the critic's estimate of Q(s, pi(s))
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)  # Clip the actor's own gradients, not the critic's
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        # Nets back to CPU if using memory_cuda
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cpu()
            self.actor_target.cpu()
            self.critic_target.cpu()
            self.critic.cpu()
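# ---------------------------------------------------------------------------
# A minimal usage sketch of Actor_Critic. All sizes and the Transition record
# are hypothetical stand-ins: the project's Actor/Critic are assumed to accept
# these dimensions, and Transition mirrors the fields train_from_batch reads
# from whatever record type the project's ReplayMemory actually returns.
# ---------------------------------------------------------------------------
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'goal', 'action', 'next_state', 'reward', 'done'))

if __name__ == '__main__':
    env_dim, goal_dim, action_dim, batch_size = 8, 3, 2, 4   # hypothetical sizes
    agent = Actor_Critic(state_dim=env_dim + goal_dim, action_dim=action_dim,
                         gamma=0.99, tau=0.005, buffer_size=100000,
                         is_mem_cuda=False, out_act='tanh')

    # train_from_batch expects fields that torch.cat can stack along dim 0,
    # i.e. lists of row tensors of shape (1, dim)
    batch = Transition(
        state=[torch.randn(1, env_dim) for _ in range(batch_size)],
        goal=[torch.randn(1, goal_dim) for _ in range(batch_size)],
        action=[torch.randn(1, action_dim) for _ in range(batch_size)],
        next_state=[torch.randn(1, env_dim) for _ in range(batch_size)],
        reward=[torch.randn(1, 1) for _ in range(batch_size)],
        done=[torch.zeros(1, 1) for _ in range(batch_size)],
    )
    agent.train_from_batch(batch)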