Example 1
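A DDPG-style actor-critic agent whose networks take UVFA-style inputs (the environment state concatenated with a goal vector), with target networks, Ornstein-Uhlenbeck exploration noise, a replay buffer, and soft target updates. `Actor`, `Critic`, `ReplayMemory`, `OUNoise`, `hard_update`, `soft_update`, and `utils` come from the surrounding project and are not shown; the snippet additionally assumes the standard torch imports below.

import torch
import torch.nn as nn
from torch.optim import Adam
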
class Actor_Critic(object):
    def __init__(self, state_dim, action_dim, gamma, tau, buffer_size,
                 is_mem_cuda, out_act):

        self.actor = Actor(state_dim,
                           action_dim,
                           is_evo=False,
                           out_act=out_act)
        self.actor_target = Actor(state_dim,
                                  action_dim,
                                  is_evo=False,
                                  out_act=out_act)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = gamma
        self.tau = tau
        self.loss = nn.MSELoss()
        self.replay_buffer = ReplayMemory(buffer_size, is_mem_cuda)
        self.exploration_noise = OUNoise(action_dim)

        hard_update(self.actor_target,
                    self.actor)  # Make sure the targets start with the same weights
        hard_update(self.critic_target, self.critic)

    def act(self, state, is_noise):
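        # Select an action for a single state; the output is detached to
        # NumPy and optionally perturbed with OU exploration noise.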
        state = utils.to_tensor(state).unsqueeze(0)
        action = self.actor.forward(state)
        action = action.detach().numpy().flatten()
        if is_noise: action += self.exploration_noise.noise()
        return action

    def train_from_batch(self, batch):
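        # One DDPG update from a sampled (state, goal, ...) batch: regress the
        # critic toward the TD target built with the target networks, then take
        # a deterministic policy-gradient step on the actor, and finally
        # soft-update both target networks.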
        env_state_batch = torch.cat(batch.state)
        goal_batch = torch.cat(batch.goal)
        uvfa_states = torch.cat((env_state_batch, goal_batch), dim=1).detach()
        next_env_state_batch = torch.cat(batch.next_state)
        next_uvfa_states = torch.cat((next_env_state_batch, goal_batch),
                                     dim=1).detach()
        action_batch = torch.cat(batch.action).detach()
        reward_batch = torch.cat(batch.reward).detach()

        #if self.args.use_done_mask: done_batch = torch.cat(batch.done)

        # Move the networks and batch tensors to the GPU for this update
        # if self.args.is_memory_cuda and not self.args.is_cuda:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()
        uvfa_states = uvfa_states.cuda()
        next_uvfa_states = next_uvfa_states.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()
        #     if self.args.use_done_mask: done_batch = done_batch.cuda()

        #Critic Update
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_uvfa_states)
            next_q = self.critic_target.forward(next_uvfa_states,
                                                next_action_batch)
            #if self.args.use_done_mask: next_q = next_q * ( 1 - done_batch.float()) #Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(uvfa_states, action_batch)
        critic_loss = self.loss(current_q, target_q)
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        #Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(uvfa_states,
                                           self.actor.forward(uvfa_states))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        # Move the networks back to the CPU after the update
        self.actor.cpu()
        self.actor_target.cpu()
        self.critic_target.cpu()
        self.critic.cpu()
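
Both examples call `hard_update`, `soft_update`, and (in Example 1) `OUNoise` without showing them. The sketches below are consistent with how they are invoked above; the signatures are inferred from the call sites, and the project's own implementations may differ in details such as the noise hyperparameters.

import numpy as np

def hard_update(target, source):
    # Copy the source network's parameters into the target network verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)

class OUNoise:
    # Ornstein-Uhlenbeck process, the usual temporally correlated exploration
    # noise for DDPG (the mu/theta/sigma defaults here are illustrative).
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state
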
Example 2
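A plain DDPG agent configured from an `args` object (discount factor, tau, done-mask and CUDA flags). It mirrors Example 1 but without the goal concatenation: the GPU transfer is gated on `args.is_memory_cuda and not args.is_cuda`, and the done mask zeroes the bootstrap term for terminal transitions. The same torch imports as in Example 1 are assumed.
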
class DDPG(object):
    def __init__(self, args):

        self.args = args

        self.actor = Actor(args, init=True)
        self.actor_target = Actor(args, init=True)
        self.actor_optim = Adam(self.actor.parameters(), lr=0.5e-4)

        self.critic = Critic(args)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=0.5e-3)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        hard_update(self.actor_target,
                    self.actor)  # Make sure the targets start with the same weights
        hard_update(self.critic_target, self.critic)

    def update_parameters(self, batch):
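        # One DDPG update from a sampled batch, mirroring Example 1 but with
        # the optional done mask and the conditional CPU<->GPU transfers.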
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        if self.args.use_done_mask: done_batch = torch.cat(batch.done)

        #Load everything to GPU if not already
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()
            self.critic.cuda()
            state_batch = state_batch.cuda()
            next_state_batch = next_state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            if self.args.use_done_mask: done_batch = done_batch.cuda()

        #Critic Update
        with torch.no_grad():
            next_action_batch = self.actor_target.forward(next_state_batch)
            next_q = self.critic_target.forward(next_state_batch,
                                                next_action_batch)
            if self.args.use_done_mask:
                next_q = next_q * (1 - done_batch.float())  #Done mask
            target_q = reward_batch + (self.gamma * next_q)

        self.critic_optim.zero_grad()
        current_q = self.critic.forward(state_batch, action_batch)
        critic_loss = self.loss(current_q, target_q)
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), 10)
        self.critic_optim.step()

        #Actor Update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic.forward(state_batch,
                                           self.actor.forward(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), 10)
        self.actor_optim.step()

        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        #Nets back to CPU if using memory_cuda
        if self.args.is_memory_cuda and not self.args.is_cuda:
            self.actor.cpu()
            self.actor_target.cpu()
            self.critic_target.cpu()
            self.critic.cpu()
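
The `torch.cat(batch.<field>)` calls in both classes imply that `batch` is a record whose fields are sequences of row tensors, one per sampled transition. Below is a minimal usage sketch for `DDPG.update_parameters` under that assumption; the `Transition` namedtuple, the dimensions, and the commented-out `agent`/`args` objects are illustrative, not part of the original code.

import torch
from collections import namedtuple

# Field names follow the attribute accesses inside update_parameters.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

state_dim, action_dim, batch_size = 8, 2, 32  # illustrative sizes

# One (1, dim) row tensor per transition for every field.
transitions = [
    Transition(state=torch.randn(1, state_dim),
               action=torch.randn(1, action_dim),
               next_state=torch.randn(1, state_dim),
               reward=torch.randn(1, 1),
               done=torch.zeros(1, 1))
    for _ in range(batch_size)
]

# Regroup into one Transition whose fields are tuples of tensors,
# which is what torch.cat(batch.state) and friends expect.
batch = Transition(*zip(*transitions))

# agent = DDPG(args)              # args must provide gamma, tau, use_done_mask,
# agent.update_parameters(batch)  # is_memory_cuda and is_cuda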