Example #1
class DQNAgent():
    def __init__(self, state_size, action_size, double=False, duel=False):

        self.state_size = state_size
        self.action_size = action_size
        self.discounted_factor = 0.99
        self.learning_rate = 0.001

        self.double = double

        # Define Model
        if duel:
            self.local_model = Duel_Qnetwork(state_size,
                                             action_size).to(device)
            self.target_model = Duel_Qnetwork(state_size,
                                              action_size).to(device)
        else:
            self.local_model = Qnetwork(state_size, action_size).to(device)
            self.target_model = Qnetwork(state_size, action_size).to(device)

        # Define optimizer
        self.optimizer = optim.Adam(self.local_model.parameters(),
                                    lr=self.learning_rate)

        # Define Buffer
        self.buffer = Replay_buffer(action_size,
                                    buffer_size=BUFFER_SIZE,
                                    batch_size=BATCH_SIZE)

        # time_step, local_model update, target_model update
        self.t_step = 0
        self.target_update_t = 0

    def get_action(self, state, eps=0.0):
        """state (numpy.ndarray)"""
        state = torch.from_numpy(state.reshape(
            1, self.state_size)).float().to(device)

        self.local_model.eval()
        with torch.no_grad():
            action_values = self.local_model(state)  # .detach().cpu()
        self.local_model.train()

        # epsilon greedy policy
        if random.random() < eps:
            # explore: uniform random action over the full action space
            return np.random.randint(self.action_size)
        else:
            # exploit: greedy action under the local network
            return int(np.argmax(action_values.cpu().data.numpy()))

    def append_sample(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)

        self.t_step += 1
        if self.t_step % LOCAL_UPDATE == 0:
            """If there are enough experiences"""
            if self.buffer.__len__() > BATCH_SIZE:
                experiences = self.buffer.sample()
                self.learn(experiences)

                # self.target_update_t += 1
                # if self.target_update_t % TARGET_UPDATE == 0:
                self.soft_target_model_update(TAU)

    def learn(self, experiences):
        """experiences ;tensor  """
        states, actions, rewards, next_states, dones = experiences

        pred_q = self.local_model(states).gather(1, actions)

        if self.double:
            _, argmax_actions = torch.max(
                self.local_model.forward(next_states).detach(),
                1,
                keepdim=True)
            pred_next_q = self.target_model.forward(
                next_states).detach().gather(1, argmax_actions)
        else:
            pred_next_q, _ = torch.max(
                self.target_model.forward(next_states).detach(),
                1,
                keepdim=True)

        target_q = rewards + (
            (1 - dones) * self.discounted_factor * pred_next_q)
        loss = F.mse_loss(pred_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_target_model_update(self, tau):
        for target_param, local_param in zip(self.target_model.parameters(),
                                             self.local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
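
The snippet relies on Qnetwork, Duel_Qnetwork, Replay_buffer and module-level constants (device, BUFFER_SIZE, BATCH_SIZE, LOCAL_UPDATE, TAU) defined elsewhere in its source file. As a rough sketch of how the agent could be driven, assuming the classic Gym reset/step API and using LunarLander-v2 purely as an illustrative environment (8-dimensional state, 4 actions), a loop like the following feeds transitions to append_sample, which in turn triggers learning every LOCAL_UPDATE steps:

import gym

# Illustrative driver loop: the environment, episode count, and epsilon schedule
# below are assumptions, not part of the original snippet.
env = gym.make("LunarLander-v2")
agent = DQNAgent(state_size=8, action_size=4, double=True, duel=False)

eps = 1.0
for episode in range(500):
    state = env.reset()  # classic Gym API: reset() returns only the observation
    done = False
    while not done:
        action = agent.get_action(state, eps)
        next_state, reward, done, _ = env.step(action)
        # storing the transition also triggers a learning step every LOCAL_UPDATE calls
        agent.append_sample(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)  # simple multiplicative epsilon decay

Because append_sample handles both storage and the periodic learn/soft-update calls, the driver loop itself stays free of training logic.
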
Example #2
class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 gamma=0.99,
                 lr=5e-4,
                 buffer_size=int(1e5),
                 batch_size=64,
                 tau=1e-3):
        # defining local and target networks
        self.qnet_local = Qnetwork(state_size, action_size).to(device)
        self.qnet_target = Qnetwork(state_size, action_size).to(device)

        # set local and target parameters equal to each other
        self.soft_update(tau=1.0)

        # experience replay buffer
        self.memory = ReplayBuffer(buffer_size, batch_size)

        # defining variables
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.tau = tau

        self.t_step = 0

        # optimizer
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

    def step(self, state, action, reward, next_state, done):
        """ saves the step info in the memory buffer and perform a learning iteration
        Input : 
            state,action,reward,state,done : non-batched numpy arrays
        
        Output : 
            none
        """
        # add sample to the memory buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        # use replay buffer to learn if it has enough samples
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """ perform a learning iteration by using sampled experience batch
        Input : 
            experience : tuple from the memory buffer
            states, actions, rewards, next_states, dones = experiences
            eg : states.shape = [N,state_size]
        Output : 
            none
        """
        states, actions, rewards, next_states, dones, wj, choose = experiences
        #states, actions, rewards, next_states, dones = experiences

        # set optimizer gradient to zero
        self.optimizer.zero_grad()

        # predicted action value
        q_pred = self.qnet_local.forward(states).gather(1, actions)

        # target action value
        ## use double DQNs, refer https://arxiv.org/abs/1509.06461
        next_action_local = self.qnet_local.forward(next_states).max(1)[1]
        q_target = rewards + self.gamma * (
            1 - dones) * self.qnet_target.forward(next_states).detach()[
                range(self.batch_size), next_action_local].unsqueeze(1)

        # compute td error
        td_error = q_target - q_pred
        # update TD errors (new priorities) in the replay buffer
        self.memory.update_td_error(choose,
                                    td_error.detach().cpu().numpy().squeeze())

        # defining loss
        loss = ((wj * td_error)**2).mean()

        # running backprop and optimizer step
        loss.backward()
        self.optimizer.step()

        # run soft update
        self.soft_update(self.tau)

    def act(self, state, eps=0.):
        """ return the local model's predicted action for the given state
        Input : 
            state : [state_size]
        
        Output : 
            action : scalar, since the action space is discrete and one-dimensional
        """
        state = torch.from_numpy(state).float().unsqueeze(dim=0).to(
            device)  # converts numpy array to torch tensor

        self.qnet_local.eval()  # put net in test mode
        with torch.no_grad():
            max_action = np.argmax(
                self.qnet_local(state)[0].cpu().data.numpy())
        self.qnet_local.train()  # put net back in train mode

        rand_num = np.random.rand()  # sample a random number uniformly in [0, 1)

        # implementing epsilon greedy policy
        if rand_num < eps:
            return np.random.randint(self.action_size)
        else:
            return max_action

    def soft_update(self, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        """
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
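
The ReplayBuffer referenced above is not shown. Judging only from the calls the agent makes (add, len, a sample() that also returns importance-sampling weights wj and sampled indices choose, and update_td_error), it behaves like a prioritized replay buffer. The sketch below is one plausible proportional-priority implementation of that interface; the alpha and beta values, the 1e-5 priority floor, and the storage layout are assumptions rather than the original code.

from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4):
        self.memory = deque(maxlen=buffer_size)      # stored transitions
        self.priorities = deque(maxlen=buffer_size)  # one priority per transition
        self.batch_size = batch_size
        self.alpha = alpha  # how strongly priorities skew the sampling distribution
        self.beta = beta    # strength of the importance-sampling correction

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))
        # new transitions get the current maximum priority so they are replayed at least once
        self.priorities.append(max(self.priorities, default=1.0))

    def __len__(self):
        return len(self.memory)

    def sample(self):
        # sampling probabilities proportional to priority**alpha
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        choose = np.random.choice(len(self.memory), self.batch_size, p=probs)

        # importance-sampling weights, normalized by their maximum
        weights = (len(self.memory) * probs[choose]) ** (-self.beta)
        weights /= weights.max()

        batch = [self.memory[i] for i in choose]
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        wj = torch.from_numpy(weights.reshape(-1, 1)).float().to(device)

        return states, actions, rewards, next_states, dones, wj, choose

    def update_td_error(self, choose, td_errors):
        # priorities proportional to the absolute TD error, with a small floor
        for idx, err in zip(choose, td_errors):
            self.priorities[idx] = abs(float(err)) + 1e-5

New transitions enter with the current maximum priority so every sample is replayed at least once, and update_td_error lets the agent's learn() step refresh priorities right after it computes the TD errors.
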