Example #1
class ddqn_Agent():
    """Interacts with and learns from the environment."""

    
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # wQ records which network was chosen for the last action (1 -> Qa, 0 -> Qb)
        # and therefore which one learn() updates; wQ1/wQ2 are counters reported by print_result()
        self.wQ = 0
        self.wQ1 = 0
        self.wQ2 = 0
        # Q-Network
        self.qnetwork_Qa  = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_Qb  = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer_Qa = optim.Adam(self.qnetwork_Qa.parameters(), lr=LR)
        self.optimizer_Qb = optim.Adam(self.qnetwork_Qb.parameters(), lr=LR)

        # Replay memory
        self.memory = ddqn_ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
    def print_result(self):
        somme = self.wQ1 + self.wQ2 + 1e-7  # avoid division by zero
        print("qQ1=", self.wQ1 / somme, " qQ2=", self.wQ2 / somme)
        
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

                
    def act_a(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    
        self.qnetwork_Qa.eval()
        with torch.no_grad():
            action_values = self.qnetwork_Qa(state)
        self.qnetwork_Qa.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
 

    def act_b(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    
        self.qnetwork_Qb.eval()
        with torch.no_grad():
            action_values = self.qnetwork_Qb(state)
        self.qnetwork_Qb.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        

    def act(self, state, eps=0.):
        self.wQ = np.random.choice([0, 1])
        if self.wQ:
            return self.act_a(state, eps)
        else:
            return self.act_b(state, eps)


    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        #
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        if self.wQ:
            # wQ is 0 or 1, chosen uniformly at random in act(); here we update Qa, using Qb for the target
            yj = self.qnetwork_Qb.forward(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * yj * (1.0 - dones)
 
            # Get expected Q values from local model
            Q_expected = self.qnetwork_Qa.forward(states).gather(1, actions)
            # Compute loss: Mean Square Error by element
            loss = F.mse_loss(Q_expected, Q_targets)
  
            # Minimize the loss
            self.optimizer_Qa.zero_grad()
            loss.backward()
            self.optimizer_Qa.step()
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_Qa, self.qnetwork_Qb, TAU)
        else:
            
            # here we update Qb, using Qa for the target
            yj = self.qnetwork_Qa.forward(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * yj * (1.0 - dones)
 
            # Get expected Q values from local model
            Q_expected = self.qnetwork_Qb.forward(states).gather(1, actions)
            # Compute loss: Mean Square Error by element
            loss = F.mse_loss(Q_expected, Q_targets)
  
            # Minimize the loss
            self.optimizer_Qb.zero_grad()
            loss.backward()
            self.optimizer_Qb.step()
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_Qb, self.qnetwork_Qa, TAU)      
        
        

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
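
For reference, a minimal sketch of how this double-Q agent might be driven (not part of the original example). It assumes the classic Gym API (reset() returns the state, step() returns a 4-tuple) and that QNetwork, ddqn_ReplayBuffer, device and the constants LR, GAMMA, TAU, BUFFER_SIZE, BATCH_SIZE and UPDATE_EVERY are defined as the class expects; the environment name and epsilon schedule are illustrative only.

import gym

env = gym.make("LunarLander-v2")                 # any discrete-action env with a vector state
agent = ddqn_Agent(state_size=8, action_size=4, seed=0)

eps = 1.0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)           # randomly acts with Qa or Qb
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store transition, learn every UPDATE_EVERY steps
        state = next_state
    eps = max(0.01, 0.995 * eps)                 # simple epsilon decay
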
Example #2
class Agent():
    
    def __init__(self, state_size, action_size, seed):
        """
        
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        
        # Q-Network
        self.q_network_local = QNetwork(state_size, action_size, seed).to(device)
        self.q_network_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.q_network_local.parameters(), lr=LR)

        # Replay buffer
        self.memory = ReplayBuffer(BATCH_SIZE, action_size, BUFFER_SIZE, seed)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
    def soft_update(self, local_network, target_network, tau):
        """Soft update model parameters
        
        
        """
        for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
            target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data)


        
    def learn(self, experiences, gamma):
        """
        
        
        """
        states, actions, rewards, next_states, dones = experiences
        
        Q_expected = self.q_network_local.forward(states).gather(1, actions)
        Q_targets_next = self.q_network_target.forward(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + gamma*Q_targets_next*(1-dones)
        
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()
        
        # Update the target network
        self.soft_update(self.q_network_local, self.q_network_target, TAU)
        
        
        
    def act(self, state, epsilon=0.):
        
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_network_local.eval()
        with torch.no_grad():
            action_values = self.q_network_local(state)
        self.q_network_local.train()    
        
        do_exploration = (random.random()<epsilon)
        if do_exploration:
            action = np.random.randint(self.action_size)
        else:
            action = np.argmax(action_values.cpu().data.numpy())
            
        return action
        
    def step(self, state, action, reward, next_state, done):
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # update self.t_step
        self.t_step = (self.t_step+1) % UPDATE_EVERY
        
        # learn from this batch
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA) 
class DQN():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, config):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.config = config
        self.state_size = state_size
        self.action_size = action_size
        nodes = self.config.get("nodes", [128, 64])
        self.seed = self.config.get("seed", 0)
        lr = self.config.get("lr", 1e-4)
        memory_size = self.config.get("memory_size", 100000)
        self.batch_size = self.config.get("batch_size", 256)
        self.discount = self.config.get("discount", 0.9)
        self.tau = self.config.get("tau", 0.001)
        self.epsilon = self.config.get("epsilon", 0.1)
        self.epsilon_end = self.config.get("epsilon_end", 0.0001)
        self.epsilon_decay = self.config.get("epsilon_decay", 0.995)
        self.learn_every = self.config.get("learn_every", 4)
        self.dqn = self.config.get("dqn", "simple")
        self.per = self.config.get("per", False)

        np.random.seed(self.seed)
        random.seed(self.seed)
        torch.manual_seed(self.seed)
        # Q-Network
        if self.dqn == "dueling":
            self.qnetwork_local = Dueling_QNetwork(state_size, action_size,
                                                   self.seed).to(device)
            self.qnetwork_target = Dueling_QNetwork(state_size, action_size,
                                                    self.seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size,
                                           action_size,
                                           self.seed,
                                           nodes=nodes).to(device)
            self.qnetwork_target = QNetwork(state_size,
                                            action_size,
                                            self.seed,
                                            nodes=nodes).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        #self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr= lr)

        # Replay memory
        if self.per:
            self.memory = Memory(memory_size)
        else:
            self.memory = ReplayBuffer(memory_size, self.batch_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.scores = []

    def add_sample(self, state, action, reward, next_state, done):
        if not self.per:
            self.memory.add((state, action, reward, next_state, 1 * done))
        else:

            target = self.qnetwork_local(
                Variable(torch.FloatTensor(state)).to(device)).data
            old_val = target[action]
            target_val = self.qnetwork_target(
                Variable(torch.FloatTensor(next_state)).to(device)).data
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.discount * torch.max(target_val)

            error = abs(old_val - target[action])

            self.memory.add(error,
                            (state, action, reward, next_state, 1 * done))

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            add_noise (bool): if False, act greedily (epsilon is treated as 0)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()  #set to eval mode
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()  # set to training mode

        # Epsilon-greedy action selection
        eps = self.epsilon if add_noise else 0.0
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.add_sample(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.learn_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                self.learn()

    def learn(self):
        if self.per:
            mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
            mini_batch = np.array(mini_batch).transpose()
            states = torch.from_numpy(np.vstack(
                mini_batch[0])).float().to(device)
            actions = torch.from_numpy(np.vstack(
                mini_batch[1])).long().to(device)
            rewards = torch.from_numpy(np.vstack(
                mini_batch[2])).float().to(device)
            next_states = torch.from_numpy(np.vstack(
                mini_batch[3])).float().to(device)
            dones = torch.from_numpy(
                np.vstack(mini_batch[4]).astype(np.uint8)).float().to(device)

        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions.long())
        if self.dqn == "simple":
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        elif self.dqn == "double":  # Double DQN
            _, Q_targets_next = self.qnetwork_local(next_states).detach().max(
                1)  # Get argmax
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, Q_targets_next.unsqueeze(1))
        elif self.dqn == "dueling":  # Dueling
            _, Q_targets_next = self.qnetwork_local(next_states).detach().max(
                1)  # Get argmax
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, Q_targets_next.unsqueeze(1))
        else:
            raise ValueError(
                'Error in DQN: {}. Options: simple, double, dueling.'.format(
                    self.dqn))

        # Compute Q targets for current states
        Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones))
        """
        # update priority
        if self.per:
            error= abs(Q_expected - Q_targets)
            errors = error.data.cpu().numpy()
            for i in range(len(idxs)):
                idx = idxs[i]
                self.memory.update(idx, errors[i])
        """
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        #loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network -------------------
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def reset(self):
        pass

    def update(self, score):
        self.scores.append(score)
        self.epsilon = max(self.epsilon_end, self.epsilon_decay *
                           self.epsilon)  # decrease epsilon

    def save(self, filename):
        torch.save(self.qnetwork_local.state_dict(), filename)

    def load(self, filename):
        self.qnetwork_local.load_state_dict(torch.load(filename))
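
For reference, an illustrative configuration for the DQN class above (a sketch, not from the original source). The key names follow the config.get(...) calls in __init__; the values are example choices only, and QNetwork, Dueling_QNetwork, ReplayBuffer, Memory and device are assumed to be defined elsewhere as the class expects.

config = {
    "nodes": [128, 64],       # hidden layer sizes for the plain QNetwork
    "seed": 0,
    "lr": 1e-4,
    "memory_size": 100000,
    "batch_size": 256,
    "discount": 0.99,
    "tau": 0.001,
    "epsilon": 1.0,
    "epsilon_end": 0.0001,
    "epsilon_decay": 0.995,
    "learn_every": 4,
    "dqn": "double",          # "simple", "double" or "dueling"
    "per": False,             # True switches the replay buffer to the prioritized Memory
}

agent = DQN(state_size=8, action_size=4, config=config)   # state/action sizes are illustrative
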
Example #4
class PrioritizedAgent:
    '''Interact with and learn from the environment.
    The agent uses prioritized experience replay.
    '''
    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0

        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE,
                                                    seed)

    def act(self, state, mode, epsilon=None):
        '''Returns actions for given state as per current policy.
        
        Params
        ======
            state (array): current state
            mode (string): train or test
            epsilon (float): for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(
            device)  # shape of state (1, state)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if mode == 'test':
            action = np.argmax(action_values.cpu().data.numpy()
                               )  # pull action values from gpu to local cpu

        elif mode == 'train':
            if random.random() <= epsilon:  # random action
                action = random.choice(np.arange(self.action_size))
            else:  # greedy action
                action = np.argmax(action_values.cpu().data.numpy(
                ))  # pull action values from gpu to local cpu

        return action

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                idxes, experiences, is_weights = self.prioritized_memory.sample(
                    device)
                self.learn(experiences,
                           GAMMA,
                           is_weights=is_weights,
                           leaf_idxes=idxes)

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree

        """

        states, actions, rewards, next_states, dones = experiences

        q_local_chosen_action_values = self.qnetwork_local.forward(
            states).gather(1, actions)
        q_target_action_values = self.qnetwork_target.forward(
            next_states).detach()

        if self.is_double_q:
            q_local_next_actions = self.qnetwork_local.forward(
                next_states).detach().max(1)[1].unsqueeze(
                    1)  # shape (batch_size, 1)
            q_target_best_action_values = q_target_action_values.gather(
                1, q_local_next_actions)  # Double DQN

        else:
            q_target_best_action_values = q_target_action_values.max(
                1)[0].unsqueeze(1)  # shape (batch_size, 1)

        # squash rewards into [-1, 1] (the original DQN paper clips rewards to this range)
        rewards = rewards.tanh()
        q_target_values = rewards + gamma * q_target_best_action_values * (
            1 - dones)  # zero value for terminal state

        # squash TD errors into [-1, 1] (the original paper clips the error term to this range)
        td_errors = (q_target_values - q_local_chosen_action_values).tanh()
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(
            leaf_idxes, abs_errors)  # update priorities in SumTree

        loss = (is_weights * (td_errors**2)).mean(
        )  # adjust squared TD loss by Importance-Sampling Weights

        self.running_loss += float(loss.cpu().data.numpy())
        self.training_cnt += 1

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetwork_target)

    def update(self, local_network, target_network):
        """Hard update model parameters, as indicated in original paper.
        
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for local_param, target_param in zip(local_network.parameters(),
                                             target_network.parameters()):
            target_param.data.copy_(local_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())  # exploitation
        else:
            return random.choice(np.arange(self.action_size))  # exploration

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        """
        for i in range(BATCH_SIZE):
            if not dones[i]:
                max_val = self.qnetwork_target(next_states[i])
                best_val = max_val.argmax()
                target = rewards[i] + gamma*(max_val[best_val])
            else:
                target = rewards[i]
            current = self.qnetwork_local(states[i])[actions[i]]
            #current = self.qnetwork_local(states).gather(-1, actions.reshape(actions.size()[0], 1))
            self.loss = F.mse_loss(target, current)
            #self.loss.requires_grad = True
            self.optimizer.zero_grad()
            self.loss.backward()
            self.optimizer.step()
        """
        # Q(s, a) from the local network for the actions actually taken
        current = self.qnetwork_local(states).gather(
            -1, actions.reshape(actions.size()[0], 1))

        # greedy next-state value estimate from the local network (detached so no gradient flows through the target)
        target1 = self.qnetwork_local.forward(next_states).detach()
        max_val = target1.argmax(dim=-1)
        final = target1.gather(-1, max_val.reshape(max_val.shape[0], 1))

        # greedy next-state value estimate from the target network
        target2 = self.qnetwork_target.forward(next_states).detach()
        max_val = target2.argmax(dim=-1)
        final2 = target2.gather(-1, max_val.reshape(max_val.shape[0], 1))

        # bootstrap with the smaller of the two estimates (a clipped double-Q style target)
        data = torch.cat([final, final2], 1)
        min_val = data.argmin(dim=-1)
        final = data.gather(-1, min_val.reshape(min_val.shape[0], 1))
        target = rewards + gamma * final * (1 - dones)

        self.loss = F.mse_loss(current, target)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        self.qnetwork_local.train()

        #state_action_values = self.qnetwork_local.forward(states)

        #computing max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target.forward(
            next_states).detach().max(1)[0].unsqueeze(1)

        #compute targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        #computing best Q-action value (for each state) from local model
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        #loss = self.criterion(state_action_values, expected_state_action_values)
        loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #7
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        # initialize the target network with the local network's weights
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        curr_Q = self.qnetwork_local.forward(states).gather(1, actions)
        next_Q = self.qnetwork_target.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        max_next_Q = max_next_Q.view(max_next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * gamma * max_next_Q
        loss = F.mse_loss(curr_Q, expected_Q.detach())

        #         Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        #         Q_targets = rewards + (gamma * Q_targets_next * (1-dones))
        #         Q_expected = self.qnetwork_local(states).gather(1, actions)
        #         loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        #         self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
        for target_param, param in zip(self.qnetwork_target.parameters(),
                                       self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #8
class Agent():
    def __init__(self, state_size, action_size, seed):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed)
        self.qnetwork_target = QNetwork(state_size, action_size, seed)
        self.qnetwork_local.load_model("./dqn_LL_model data.pickle")
        self.qnetwork_target.load_model("./dqn_LL_model data.pickle")
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.loss = 0
        self.loss_list = []

    def step(self, state, action, reward, next_state, done, t_step):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = t_step
        if self.t_step % UPDATE_EVERY == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > 100 * BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
    
        """

        action_values = self.qnetwork_local.forward(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            
        """
        states, actions, rewards, next_states, dones = experiences

        for time in range(BATCH_SIZE):
            # compute Q_target from the target network inputing next_state
            Q_target_av = np.max(
                self.qnetwork_target.forward(next_states[time]))
            Q_target = rewards[time] + gamma * (Q_target_av) * (
                1 - dones[time])  # if done, the bootstrap term is not added
            # compute the Q_expected
            Q_expected = self.qnetwork_local.forward(
                states[time]
            )  # Q values for this state; the entry for the corresponding action is used below

            self.qnetwork_local.backward(Q_target, "MSE", actions[time])
            self.loss_list.append((Q_target - Q_expected[actions[time]])**2)
        self.loss = np.mean(self.loss_list)
        self.qnetwork_local.step()
        self.loss_list.clear()

        #  update target network #
        if self.t_step % UPDATE_FREQUENCY == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = tau*θ_local + (1 - tau)*θ_target
        """
        self.qnetwork_target.soft_update(local_model, tau)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Parameters:
        ==========
            state_size (int): This is the dimension of each state.
            action_size (int): This is the dimension of each action.
            seed (int): This is the random seed.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network (local and target one)
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        # mounting an Adam optimizer for the backward propagation
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # mounting an MSE Loss function
        self.criterion = nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps).
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory.
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn.
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Parameters:
        ==========
            state (array_like): The current state.
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.optimizer.zero_grad()

        # Forward and backward passes
        output = self.qnetwork_local.forward(states).gather(1, actions)
        loss = self.criterion(output,
                              self.targets(gamma, rewards, next_states, dones))

        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def targets(self, gamma, rewards, next_states, dones):

        with torch.no_grad():
            q = self.qnetwork_target.forward(next_states)

        # y = r + γ * max_a Q_target(s', a) * (1 - done), so terminal states keep their reward
        y = rewards + gamma * torch.max(q, dim=1, keepdim=True)[0] * (1 - dones)

        return y

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Parameters:
        ==========
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #10
class PrioritizedAgent:
    '''Interact with and learn from the environment.'''

    def __init__(self, state_size, action_size, seed, is_prioritized_sample=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0 # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)

        self.is_prioritized_sample = is_prioritized_sample

        self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, seed).to(device)
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        
        if not self.is_prioritized_sample:
            self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
        else:
            self.replay_memory = PrioritizedReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
            
    
    def act(self, state, epsilon=0.):
        '''Returns actions for given state as per current policy.
        
        Params
        ======
            state (array-like): current state
            epsilon (float): for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device) # shape of state (1, state)
        
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if random.random() <= epsilon: # random action
            action = random.choice(np.arange(self.action_size))
        else: # greedy action
            action = np.argmax(action_values.cpu().data.numpy()) # pull action values from gpu to local cpu
        
        return action
    
    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.replay_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.replay_memory) >= BUFFER_SIZE and not self.is_prioritized_sample:
                experiences = self.replay_memory.sample(device)
                self.learn(experiences, GAMMA)
            elif len(self.replay_memory) >= BUFFER_SIZE and self.is_prioritized_sample:
                batch_idx, experiences, batch_ISWeights = self.replay_memory.sample(device)
                self.learn(experiences, GAMMA, ISWeights=batch_ISWeights, leaf_idxes=batch_idx)

    
    def learn(self, experiences, gamma, ISWeights=None, leaf_idxes=None):
        """Update value parameters using given batch of experience tuples.

        If is_prioritized_sample, then weights update is adjusted by ISWeights. 
        In addition, Double DQN is optional.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            ISWeights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree

        """

        # compute and minimize the loss
        if not self.is_prioritized_sample:
            states, actions, rewards, next_states, dones = experiences

            q_local_chosen_action_values = self.qnetwork_local.forward(states).gather(1, actions)
            q_target_action_values = self.qnetwork_target.forward(next_states).detach() # detach from graph, don't backpropagate
            q_target_best_action_values = q_target_action_values.max(1)[0].unsqueeze(1) # shape (batch_size, 1)
            q_target_values = rewards + gamma * q_target_best_action_values * (1 - dones) # zero value for terminal state 
        
            loss = F.mse_loss(q_local_chosen_action_values, q_target_values)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        else:
            states, actions, rewards, next_states, dones = experiences
            
            q_local_chosen_action_values = self.qnetwork_local.forward(states).gather(1, actions)
            #q_local_next_actions = self.qnetwork_local.forward(next_states).detach().max(1)[1].unsqueeze(1) # shape (batch_size, 1)
            q_target_action_values = self.qnetwork_target.forward(next_states).detach()
            q_target_best_action_values = q_target_action_values.max(1)[0].unsqueeze(1) # shape (batch_size, 1)
            #q_target_best_action_values = q_target_action_values.gather(1, q_local_next_actions) # Double DQN
            q_target_values = rewards + gamma * q_target_best_action_values * (1 - dones) # zero value for terminal state

            abs_errors = torch.abs(q_target_values - q_local_chosen_action_values).cpu().data.numpy() # pull back to cpu
            self.replay_memory.batch_update(leaf_idxes, abs_errors) # update priorities in SumTree

            loss = F.mse_loss(q_local_chosen_action_values, q_target_values, reduction='none')
            loss = (ISWeights * loss).mean() # adjust element-wise TD loss by Importance-Sampling Weights
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetwork_target)
    
    def update(self, local_network, target_network):
        """Hard update model parameters, as indicated in original paper.
        
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to

        """
        for local_param, target_param in zip(local_network.parameters(), target_network.parameters()):
            target_param.data.copy_(local_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if torch.cuda.device_count() > 1:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            net = nn.DataParallel(self.qnetwork_local)

            if torch.cuda.is_available():
                print("using GPUs!")
                net.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # target net update
        # Get max predicted Q values (for next states) from target model
        qs_local = self.qnetwork_local.forward(states)
        qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long),
                             actions.reshape(BATCH_SIZE)]
        Q_expected = qsa_local.reshape((BATCH_SIZE, 1))

        qs_target = self.qnetwork_target.forward(next_states)
        _, qsa_local_argmax_a = torch.max(
            self.qnetwork_local.forward(next_states),
            dim=1)  # greedy action w.r.t. the local network (Double DQN)
        qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long),
                               qsa_local_argmax_a.reshape(BATCH_SIZE)]

        qsa_target = qsa_target * (
            1 - dones.reshape(BATCH_SIZE)
        )  #target qsa value is zero when episode is complete
        qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        Q_targets = rewards + gamma * qsa_target

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        #logger.info('mse: {}'.format(delta))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()  # gradient step on the local network
        #logger.info('avg reward: {} mse:{}'.format(delta, np.mean(experiences.rewards())))

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBufferWithPriority(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences_with_index = self.memory.sample()
                self.learn(experiences_with_index, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences_with_index, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences_with_index (Tuple[torch.Variable]): tuple of (s, a, r, s', done, index, weightsIS) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, index, weightsIS = experiences_with_index

        ## TODO: compute and minimize the loss

        # Get max predicted Q values (for next states) from target model
        ### Regular DQN
        # Q_targets_next = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1)
        ### Double DQN
        with torch.no_grad():
            estimated_action = self.qnetwork_local(next_states).argmax(dim=1, keepdim=True)
            Q_targets_next = self.qnetwork_target.forward(next_states).gather(1, estimated_action)
            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)
        
        # Compute importance-sampling weight
        
        # Compute loss
        loss_fn = nn.MSELoss(reduction='none')  # per-sample losses, needed for the importance-sampling weights
        loss = loss_fn(Q_expected, Q_targets)
        weighted_loss = torch.sum(torch.from_numpy(weightsIS).float().to(device) * loss)
        # Update priority according to TD error
        self.memory.update_priority(list(loss.detach().cpu().numpy().squeeze()**ALPHA+EPS), index)
        
        # Minimize the loss
        self.optimizer.zero_grad()
        weighted_loss.backward()
        self.optimizer.step()
        
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
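
The agent above delegates the priority bookkeeping to ReplayBufferWithPriority, whose code is not shown here. The following is a minimal sketch, and only a guess at what that buffer presumably does with the priorities passed to update_priority: turn them into sampling probabilities and importance-sampling weights of the usual (N * P(i))^(-beta) form. ALPHA, EPS and beta are illustrative values.

import numpy as np

ALPHA, EPS, beta = 0.6, 1e-5, 0.4

td_errors = np.array([0.5, 0.1, 2.0])              # per-sample TD errors
priorities = np.abs(td_errors) ** ALPHA + EPS      # roughly what update_priority would store

probs = priorities / priorities.sum()              # sampling probabilities P(i)
N = len(priorities)                                # buffer size in this toy example
weightsIS = (N * probs) ** (-beta)                 # importance-sampling weights
weightsIS = weightsIS / weightsIS.max()            # normalise so the largest weight is 1
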
Exemple #13
0
class Agent():
	""" Agent used to interact with and learns from the environment """

	def __init__(self, state_size, action_size, config):
		""" Initialize an Agent object """

		self.state_size = state_size
		self.action_size = action_size 
		self.config = config 

		# logging for this class 
		self.logger = logging.getLogger(self.__class__.__name__)

		# gpu support 
		self.device = pick_device(config, self.logger)

		## Q-Networks 
		self.qnetwork_local = QNetwork(state_size, action_size, config).to(self.device)
		self.qnetwork_target = QNetwork(state_size, action_size, config).to(self.device)

		## Get optimizer for local network 
		self.optimizer = getattr(optim, config["optimizer"]["optimizer_type"])(
			self.qnetwork_local.parameters(), 
			betas=tuple(config["optimizer"]["betas"]),
			**config["optimizer"]["optimizer_params"])

		## Replay memory
		self.memory = ReplayBuffer(
			config=config,
			action_size=action_size, 
			buffer_size=int(config["DQN"]["buffer_size"]), 
			batch_size=config["trainer"]["batch_size"]
			)

		## Initialize time step (for update every `update_every` steps)
		self.t_step = 0


	def step(self, state, action, reward, next_state, done):
		
		# Save experience in replay memory 
		self.memory.add(state, action, reward, next_state, done)

		# Learn every `update_every` time steps 
		self.t_step = (self.t_step + 1) % self.config["DQN"]["update_every"]
		if (self.t_step == 0):
			# If enough samples are available in memory, get random subset and learn
			if len(self.memory) > self.config["trainer"]["batch_size"]:
				experiences = self.memory.sample()
				self.learn(experiences, self.config["DQN"]["gamma"])



	def act(self, state, epsilon):
		""" Returns actions for given state as per current policy """
		# pdb.set_trace()

		# Convert state to tensor
		state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
		
		## Evaluation mode
		self.qnetwork_local.eval()
		with torch.no_grad():
			# Forward pass of local qnetwork 
			action_values = self.qnetwork_local.forward(state)
		
		## Training mode 
		self.qnetwork_local.train()
		# Epsilon-greedy action selection 
		if random.random() > epsilon:
			# Choose the best action (exploitation)
			return np.argmax(action_values.cpu().data.numpy())
		else:
			# Choose random action (exploration)
			return random.choice(np.arange(self.action_size))


	def learn(self, experiences, gamma):
		""" Update value parameters using given batch of experience tuples """
		
		states, actions, rewards, next_states, dones = experiences 

		## TD target
		# Get max predicted Q-values (for next states) from target model
		# Q_targets_next = torch.argmax(self.qnetwork_target(next_states).detach(), dim=1).unsqueeze(1)
		Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
		Q_targets_next = Q_targets_next.float()  # keep float32 without moving the tensor off its device

		# Compute Q-targets for current states 
		Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

		## old value
		# Get expected Q-values from local model 
		Q_expected = torch.gather(self.qnetwork_local(states), dim=1, index=actions)

		# Compute loss 
		loss = F.mse_loss(Q_expected, Q_targets)
		# Minimize loss 
		self.optimizer.zero_grad()
		loss.backward()
		self.optimizer.step()

		# update target network with a soft update
		self.soft_update(self.qnetwork_local, self.qnetwork_target, self.config["DQN"]["tau"])
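
None of the listings show the surrounding training loop. A hedged sketch of one, assuming a classic Gym-style environment (reset() returns a state array, step(action) returns (next_state, reward, done, info)); the function name and hyperparameters are illustrative:

def train(agent, env, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration between episodes
    return scores
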
Exemple #14
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 fc1_units=64,
                 fc2_units=64,
                 fc3_units=None,
                 double_q=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_q = double_q

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       fc1_units, fc2_units,
                                       fc3_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc1_units, fc2_units,
                                        fc3_units).to(device)
        if torch.cuda.is_available():
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()
        else:
            self.qnetwork_local.cpu()
            self.qnetwork_target.cpu()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=LR,
                                    weight_decay=WD)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def get_action(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)

        if random.random() <= eps:
            return np.random.choice(np.arange(self.action_size))
        else:
            return action_values.argmax().item()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Double DQN target (computed regardless of the double_q flag above)
        argmax_a = self.qnetwork_local.forward(next_states).detach().argmax(
            dim=1).unsqueeze(dim=1)
        a_val = self.qnetwork_target.forward(next_states).detach()
        Q_targets_next = a_val.gather(1, argmax_a)

        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)  # mask out terminal transitions
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
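
The Double DQN target in the example above selects the next action with the local network and evaluates it with the target network. A standalone illustration of that gather pattern on random tensors (batch size, action count and gamma are illustrative):

import torch

batch_size, n_actions, gamma = 4, 3, 0.99
q_local_next = torch.rand(batch_size, n_actions)    # local network outputs for next states
q_target_next = torch.rand(batch_size, n_actions)   # target network outputs for next states
rewards = torch.rand(batch_size, 1)
dones = torch.zeros(batch_size, 1)

argmax_a = q_local_next.argmax(dim=1, keepdim=True)   # action selection by the local network
q_next = q_target_next.gather(1, argmax_a)            # action evaluation by the target network
targets = rewards + gamma * q_next * (1 - dones)      # zero bootstrap at terminal states
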
Exemple #15
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Initialize both the target and the local Q networks
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, MINIBATCH_SIZE, seed)

        # The Optimizer used is Adam
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LEARNING_RATE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        # Used to determine when the agent starts learning
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # check if there are enough examples in memory
            if len(self.memory) > MINIBATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            # Aquire an action by passing the current state to the local network
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        # running forward on the target network on the set of experiences
        Q_targets_next = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)


        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def hard_update(self, local_model, target_model):
        """Hard update the bias and the weights from the local Network to the target network"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update the weights and biases from the local Network to the target network using the update factor tau"""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save_weights(self, path='./p1_weights'):
        """Save the trained weights of the current local Q network‚"""
        torch.save(self.qnetwork_local.state_dict(), path)

    def load_saved_weights(self, path='./p1_weights'):
        """"
        Load the weights to the local and target Q network
        """
        self.qnetwork_local.load_state_dict(torch.load(path))
        self.qnetwork_local.eval()

        self.qnetwork_target.load_state_dict(torch.load(path))
        self.qnetwork_target.eval()
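
A hedged usage sketch for the checkpoint helpers above; the state/action sizes and the path are illustrative, and QNetwork, device and the module-level hyperparameters the class relies on are assumed to already be in scope:

agent = Agent(state_size=37, action_size=4, seed=0)   # sizes are illustrative
# ... training happens here ...
agent.save_weights('./p1_weights')        # torch.save of the local network's state_dict
agent.load_saved_weights('./p1_weights')  # restores both local and target networks
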
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(
                state)  # same as self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        # "*** YOUR CODE HERE ***"
        qs_local = self.qnetwork_local.forward(states)
        qsa_local = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long),
                             actions.reshape(BATCH_SIZE)]
        qsa_local = qsa_local.reshape((BATCH_SIZE, 1))
        # print(qsa_local.shape)

        # # DQN Target
        # qs_target = self.qnetwork_target.forward(next_states)
        # qsa_target, _ = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target
        # #print(qsa_target.shape, TD_target.shape, rewards.shape)

        # # Double DQN Target ver 1
        # qs_target = self.qnetwork_target.forward(next_states)
        # if random.random() > 0.5:
        #     _, qsa_target_argmax_a = torch.max(qs_target, dim=1) #using the greedy policy (q-learning)
        #     qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_target_argmax_a.reshape(BATCH_SIZE)]
        # else:
        #     _, qsa_local_argmax_a = torch.max(qs_local, dim=1) #using the greedy policy (q-learning)
        #     #qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]
        #     ##qsa_target = qs_local[torch.arange(BATCH_SIZE, dtype=torch.long), qsa_local_argmax_a.reshape(BATCH_SIZE)]

        # qsa_target = qsa_target * (1 - dones.reshape(BATCH_SIZE)) #target qsa value is zero when episode is complete
        # qsa_target = qsa_target.reshape((BATCH_SIZE,1))
        # TD_target = rewards + gamma * qsa_target

        # Double DQN Target ver 2 (based upon double dqn paper)
        qs_target = self.qnetwork_target.forward(next_states).detach()  # no gradients through the target network
        _, qsa_local_argmax_a = torch.max(
            qs_local, dim=1)  # using the greedy policy (q-learning)
        qsa_target = qs_target[torch.arange(BATCH_SIZE, dtype=torch.long),
                               qsa_local_argmax_a.reshape(BATCH_SIZE)]

        qsa_target = qsa_target * (
            1 - dones.reshape(BATCH_SIZE)
        )  # target qsa value is zero when episode is complete
        qsa_target = qsa_target.reshape((BATCH_SIZE, 1))
        TD_target = rewards + gamma * qsa_target

        # print(qsa_target.shape, TD_target.shape, rewards.shape)

        # #Udacity's approach
        # # Get max predicted Q values (for next states) from target model
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # # Compute Q targets for current states
        # TD_target = rewards + (gamma * Q_targets_next * (1 - dones))
        # # Get expected Q values from local model
        # qsa_local = self.qnetwork_local(states).gather(1, actions)

        # diff = qsa_local - TD_target
        # loss = torch.matmul(torch.transpose(diff, dim0=0, dim1=1), diff) #loss is now a scalar
        loss = F.mse_loss(
            qsa_local, TD_target)  # much faster than the above loss function
        # print(loss)
        # minimize the loss
        self.optimizer.zero_grad()  # clears the gradients
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
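
The example above indexes Q-values with torch.arange instead of gather(). A standalone check on random tensors that the two forms agree (sizes are illustrative):

import torch

BATCH_SIZE, n_actions = 5, 4
qs = torch.rand(BATCH_SIZE, n_actions)
actions = torch.randint(0, n_actions, (BATCH_SIZE, 1))   # long dtype, shape (batch, 1)

via_index = qs[torch.arange(BATCH_SIZE, dtype=torch.long), actions.reshape(BATCH_SIZE)]
via_gather = qs.gather(1, actions).squeeze(1)
assert torch.allclose(via_index, via_gather)
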
class Agent():
    """Interacts with the environment , act and learn from the environment"""
    def __init__(self, state_size, action_size, seed):
        """Initializes Agent object ,
            1. agent variables,
            2. local and target QNetworks,
            3. Optimizer, 
            4. Replay Buffer"""

        self.state_size = state_size
        self.action_size = action_size
        self.seed = torch.manual_seed(seed)
        self.t_step = 0

        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, eps=0.):
        """To read state , pass it through local network and return action values as per the given policy. 
        Then from action values , based on eps gives argmax or chooses a random action from action values"""

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Perform a step which consists of,
                1. Add into Replay Buffer
                2. Learn      
        """
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % LEARN_EVERY

        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        """Calculate the MSE based on Expected Q value and Target Q value. 
        Use optimizer to learn from MSE and calculate target weights and then update those weights in the target Q network"""

        states, actions, rewards, next_states, done = experiences

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - done))

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Perform inplace copy of target parameters based on Tau"""

        for local_params, target_params in zip(local_model.parameters(),
                                               target_model.parameters()):
            target_params.data.copy_(tau * local_params.data +
                                     (1 - tau) * target_params.data)
Exemple #18
0
class Agent(object):
    def __init__(self, state_size, action_size, mem_length=100000, ddqn=True):
        self.gamma = 0.99
        self.batch_size = 64
        self.action_size = action_size
        self.ddqn = ddqn

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        if ddqn:
            self.model = DuelingQNetwork(state_size,
                                         action_size).to(self.device)
            self.target_model = DuelingQNetwork(state_size,
                                                action_size).to(self.device)
            self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
            self.experience = self.ddqn_experience
        else:
            self.model = QNetwork(state_size, action_size).to(self.device)
            self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
            self.experience = self.dqn_experience

        # replay memory
        self.memory = deque(maxlen=mem_length)

    def act(self, state, eps=0):
        # epsilon greedy
        if random.random() < eps:
            return random.choice(np.arange(self.action_size))

        # state to predict action from
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)

        self.model.train()
        return np.argmax(action_values.cpu().data.numpy())

    def ddqn_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) < self.batch_size:
            return

        # get random batch
        states, actions, rewards, next_states, terminals = self.get_batch()

        # Get expected Q values from local model
        expected = self.model(states).gather(1, actions)
        Q = self.model(next_states).detach()

        # Get max predicted Q values (for next states) from target model
        targets_next = self.target_model(next_states).detach()
        targets_next = targets_next.gather(1, Q.max(1)[1].unsqueeze(1))

        # Compute Q targets for current states
        targets = rewards + (self.gamma * targets_next * (1 - terminals))

        # compute loss
        loss = functional.mse_loss(expected, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft update of the target network (Polyak averaging)
        tau = 0.001
        for target_param, primary_param in zip(self.target_model.parameters(),
                                               self.model.parameters()):
            target_param.data.copy_(tau * primary_param.data +
                                    (1 - tau) * target_param.data)

    def dqn_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) < self.batch_size:
            return

        # get random batch
        states, actions, rewards, next_states, terminals = self.get_batch()

        Q = self.model.forward(states)
        Q = Q.gather(1, actions).squeeze(1)
        next_Q = self.model.forward(next_states).detach()  # no gradient through the bootstrap term
        max_next_Q = torch.max(next_Q, 1)[0]
        expected = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - terminals.squeeze(1))

        # update model
        loss = functional.mse_loss(Q, expected)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def get_batch(self):
        experiences = np.array(random.sample(self.memory, k=self.batch_size))
        experiences = [np.vstack(experiences[:, i]) for i in range(5)]

        # convert data to tensors
        states = torch.FloatTensor(experiences[0]).to(self.device)
        actions = torch.LongTensor(experiences[1]).to(self.device)
        rewards = torch.FloatTensor(experiences[2]).to(self.device)
        next_states = torch.FloatTensor(experiences[3]).to(self.device)
        terminals = torch.FloatTensor(experiences[4].astype(np.uint8)).to(
            self.device)

        return states, actions, rewards, next_states, terminals
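
get_batch() above stacks the sampled experience tuples through a single np.array and column slicing. An equivalent, slightly more explicit per-field stacking on toy data (all sizes are illustrative):

import random
import numpy as np

# toy replay memory of (state, action, reward, next_state, done) tuples
memory = [(np.random.rand(4), 1, 0.5, np.random.rand(4), False) for _ in range(10)]
batch = random.sample(memory, k=3)

states      = np.vstack([e[0] for e in batch])                    # shape (3, 4)
actions     = np.vstack([e[1] for e in batch])                    # shape (3, 1)
rewards     = np.vstack([e[2] for e in batch])                    # shape (3, 1)
next_states = np.vstack([e[3] for e in batch])                    # shape (3, 4)
dones       = np.vstack([e[4] for e in batch]).astype(np.uint8)   # shape (3, 1)
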
Exemple #19
0
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done, priority, B_P):

        self.memory.add(state, action, reward, next_state, done, priority)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(B_P)
                self.learn(experiences, GAMMA)

    def priority(self, states, actions, rewards, next_states):
        if len(states.shape) == 1:
            # only a single experience tuple to evaluate
            # need to format variables accordingly:
            states = torch.from_numpy(states).float().unsqueeze(0).to(device)
            next_states = torch.from_numpy(next_states).float().unsqueeze(
                0).to(device)
            rewards = torch.tensor([[rewards]], dtype=torch.float).to(
                device)  # scalar value
            actions = torch.tensor([[actions]], dtype=torch.uint8).to(
                device)  # scalar value

        action_local = self.qnetwork_local.forward(next_states).argmax(1)
        max_q = self.qnetwork_target.forward(next_states)[
            np.arange(action_local.shape[0]), action_local]
        delta = (rewards.squeeze() + GAMMA * max_q) - self.qnetwork_local(
            states)[np.arange(actions.shape[0]),
                    actions.byte().squeeze().cpu().numpy()]
        priority = torch.abs(delta) + E_p
        return priority.squeeze().tolist()

    def act(self, state, eps=0.):

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones, weights, experience_indices = experiences

        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = WeightedMSE(Q_expected, Q_targets, weights)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities in the replay buffer ------------------- #
        new_priorities = self.priority(states, actions, rewards, next_states)
        for count, idx in enumerate(experience_indices):
            self.memory.memory[idx] = self.memory.memory[idx]._replace(
                priority=new_priorities[count])

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
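
WeightedMSE (spelled WEightedMSE in the original listing, normalised above) is not defined anywhere in these examples; it is also used by the next example. A minimal sketch of what such a helper presumably computes, only a guess at the author's actual implementation:

import torch

def WeightedMSE(q_expected, q_target, weights):
    """Importance-sampling weighted mean of squared TD errors (assumed definition)."""
    return (weights * (q_expected - q_target) ** 2).mean()
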
Exemple #20
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def priority(self, states, actions, rewards, next_states):
        if len(states.shape) == 1:
            # only a single experience tuple to evaluate
            # need to format variables accordingly:
            states = torch.from_numpy(states).float().unsqueeze(0).to(device)
            next_states = torch.from_numpy(next_states).float().unsqueeze(
                0).to(device)
            rewards = torch.tensor([[rewards]], dtype=torch.float).to(
                device)  # scalar value
            actions = torch.tensor([[actions]], dtype=torch.uint8).to(
                device)  # scalar value

        action_local = self.qnetwork_local.forward(next_states).argmax(1)
        max_q = self.qnetwork_target.forward(next_states)[
            np.arange(action_local.shape[0]), action_local]
        delta = (rewards.squeeze() + GAMMA * max_q) - self.qnetwork_local(
            states)[np.arange(actions.shape[0]),
                    actions.byte().squeeze().cpu().numpy()]
        priority = torch.abs(delta) + E_PRIORITY
        return priority.squeeze().tolist()

    def step(self, state, action, reward, next_state, done, priority,
             b_priority):

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done, priority)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(
                    b_priority)  # needs b_priority to compute weights
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, experience_indices = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        actions_local = self.qnetwork_local.forward(next_states).argmax(1)
        max_q_values = self.qnetwork_target.forward(next_states)[
            np.arange(actions_local.shape[0]), actions_local]
        td_target = rewards.squeeze() + gamma * max_q_values * (
            1 - dones.squeeze())

        predicted_q_values = self.qnetwork_local.forward(states)
        predicted_q_values = predicted_q_values[
            np.arange(predicted_q_values.shape[0]),
            actions.squeeze()]

        self.optimizer.zero_grad()  # must zero the gradients each time, otherwise they get summed

        # Forward and backward passes
        loss = WeightedMSE(predicted_q_values, td_target, weights)
        loss.backward()  # backward pass to compute the gradients
        self.optimizer.step()  # take a step using the learning rate and computed gradient

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities in the replay buffer ------------------- #
        new_priorities = self.priority(states, actions, rewards, next_states)
        for count, idx in enumerate(experience_indices):
            self.memory.memory[idx] = self.memory.memory[idx]._replace(
                priority=new_priorities[count])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
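
The priority write-back at the end of learn() assumes the replay buffer stores namedtuples with a priority field in an index-addressable deque. A standalone sketch of that _replace() pattern on toy data (field names and values are illustrative):

from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'priority'])
memory = deque([Experience(state=0, action=1, reward=0.5, priority=1.0),
                Experience(state=1, action=0, reward=0.0, priority=1.0)])

new_priorities = [0.25, 0.75]
experience_indices = [0, 1]
for count, idx in enumerate(experience_indices):
    # namedtuples are immutable, so each stored entry is rebuilt with its new priority
    memory[idx] = memory[idx]._replace(priority=new_priorities[count])
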
Exemple #21
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        hidden_layers = [128,64]
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        
        # Initialise the target network with the same weights as the local network
        self.hard_update(self.qnetwork_local, self.qnetwork_target)
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
    
    
    def update(self):
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if np.random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        
        max_actions = self.qnetwork_local.forward(next_states).detach().max(1)[1].unsqueeze(1)
        output_target = self.qnetwork_target.forward(next_states).gather(1,max_actions)
        td_target = rewards + gamma*(output_target*(1-dones))
        output_local= self.qnetwork_local(states).gather(1, actions)
        
        loss = F.mse_loss(output_local,td_target)
        
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def hard_update(self,local_model,target_model):
        
        for target_param,local_param in zip(target_model.parameters(),local_model.parameters()):
            target_param.data.copy_(local_param)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, use_ddqn=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.use_ddqn = use_ddqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                if self.use_ddqn:  # Use double dqn for training if selected
                    self.learn_ddqn(experiences, GAMMA)
                else:
                    self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # compute and minimize the loss
        qvalue_target = self.qnetwork_target.forward(next_states).detach().max(
            1)[0].unsqueeze(1)
        y = rewards + gamma * qvalue_target * (1 - dones)

        qvalue = self.qnetwork_local.forward(states).gather(1, actions)
        #print("Best actions:")
        #print(qvalue)
        loss = F.mse_loss(y, qvalue)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        #for param in self.qnetwork_local.parameters():
        #    param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learn_ddqn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples uaing double dqn method.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get best actions from local model to use in the double DQN
        best_actions_local = self.qnetwork_local(next_states).detach().argmax(
            dim=1).unsqueeze(1)

        # Get predicted Q values (for next states) from target model using actions selected from double DQN using local model
        qvalue_target = self.qnetwork_target(next_states).detach().gather(
            1, best_actions_local)

        # compute and minimize the loss
        y = rewards + gamma * qvalue_target * (1 - dones)

        qvalue = self.qnetwork_local.forward(states).gather(1, actions)
        #print("Best actions:")
        #print(qvalue)
        loss = F.mse_loss(y, qvalue)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        #for param in self.qnetwork_local.parameters():
        #    param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        #self.t_step = (self.t_step + 1) % UPDATE_EVERY
        #if self.t_step == 0:
        self.t_step += 1
        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        # train loop
        # states and next_states (batch_size x num_states)
        # actions and rewards (batch_size x 1)

        # forward pass
        # use local network to compute q_est(s,w)[action]
        ps_local = self.qnetwork_local.forward(states).gather(1, actions)
        # use target network compute r + g*max(q_est[s',a, w-]), this tensor should be detached from backward computations
        ps_target = rewards + gamma * (
            1 - dones) * self.qnetwork_target.forward(
                next_states).detach().max(dim=1)[0].view(-1, 1)
        # compute loss
        loss = F.mse_loss(ps_local, ps_target)
        # backward pass
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if (self.t_step % UPDATE_EVERY) == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)