class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 Double_DQN=False,
                 Priority_Replay_Paras=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            Double_DQN (bool): whether to use double-DQN targets in the learning update
            Priority_Replay_Paras (tuple): (e, a, b) hyperparameters for prioritized experience replay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.BUFFER_SIZE = BUFFER_SIZE
        # optional extra techniques
        self.Double_DQN = Double_DQN
        # prioritized-replay hyperparameters: e (small constant), a (priority exponent), b (importance-sampling exponent)
        self.prio_e, self.prio_a, self.prio_b = Priority_Replay_Paras

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   Double_DQN, Priority_Replay_Paras)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences, experience_indexes, priorities = self.memory.sample()
                self.learn(experiences, experience_indexes, priorities, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, experience_indexes, priorities, gamma):
        """Update value parameters using given batch of experience tuples.
        
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## compute and minimize the loss

        # current Q(s, a) estimates from the local network for the actions taken
        Q_s_a = self.qnetwork_local(states).gather(1, actions)

        # Get max predicted Q values (for next states) from target model
        if self.Double_DQN:
            # double DQN uses the local network for selecting best action and evaluates it with target network
            best_actions = self.qnetwork_local(next_states).max(
                1)[1].unsqueeze(1)
            Q_s_next = self.qnetwork_target(next_states).gather(
                1, best_actions)
        else:
            Q_s_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)

        targets = rewards + gamma * Q_s_next * (1 - dones)

        # per-sample squared TD error
        losses = (Q_s_a - targets)**2

        # importance-sampling weights w_i = (1/N * 1/P(i))^b from the Prioritized Experience Replay paper
        importance_weights = (((1 / self.BUFFER_SIZE) *
                               (1 / priorities))**self.prio_b).unsqueeze(1)

        loss = (importance_weights * losses).mean()

        # calculate gradients and do a step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # calculate priorities and update them
        target_priorities = abs(Q_s_a -
                                targets).detach().cpu().numpy() + self.prio_e
        self.memory.update_priority(experience_indexes, target_priorities)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_beta(self, interpolation):
        """Update priority beta for unbiased Q updates.

        Params
        ======
            interpolation (float): number between 0 and 1 specifying how much to interpolate to beta = 1
        """
        self.prio_b += (1 - self.prio_b) * interpolation
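Every agent in these examples leans on module-level names that the snippets themselves never define: the hyperparameter constants, the torch `device`, and the usual imports. A minimal sketch of that shared setup, with purely illustrative values, might be:

import random
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Hyperparameter values below are illustrative assumptions, not taken from the original files
BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # soft-update interpolation factor
LR = 5e-4               # learning rate
UPDATE_EVERY = 4        # how many environment steps between learning updates

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")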
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        # Expected Q values for the actions actually taken, from the local model
        Qpredicted = torch.gather(self.qnetwork_local(states), 1, actions)
        # Max predicted Q values for the next states, from the target model
        Qnext = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Zero out the bootstrap term for terminal transitions
        Qactual = rewards + gamma * Qnext * (1 - dones)

        criterion = nn.MSELoss()
        loss = criterion(Qpredicted, Qactual)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
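The `QNetwork(state_size, action_size, seed)` constructed by these agents is not included in the snippets. A minimal fully connected sketch, with assumed hidden-layer widths, would be:

import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Simple MLP mapping a state to one Q-value per action (hidden sizes are assumptions)."""
    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)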
Example #3
class Agent():
    """Agent that interacts with and learns from the environment."""
    def __init__(self, state_size, action_size,
                 seed):  # earlier versions also took use_dueling=False, use_double=False
        """Initialize the Agent:
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step - this will need to be updated according to UPDATE_EVERY
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, np.int32(action), reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:  # Need enough to make a batch
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions as determined by the policy, and specific state agent is in.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update parameters from given batch.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
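The uniform `ReplayBuffer` is likewise only visible through its `add`, `sample`, and `__len__` calls. One sketch consistent with that interface (the internals are assumptions, and `device` is the module-level global from the setup sketch above) is:

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling (a sketch)."""
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)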
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, double_dqn=True):
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        self.t_step = 0

    def save(self, path, *data):
        torch.save(self.qnetwork_local.state_dict(),
                   path / "model_checkpoint.local")
        torch.save(self.qnetwork_target.state_dict(),
                   path / "model_checkpoint.target")
        torch.save(self.optimizer.state_dict(),
                   path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(
                torch.load(path / 'model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(
                torch.load(path / 'model_checkpoint.target'))
            self.optimizer.load_state_dict(
                torch.load(path / 'model_checkpoint.optimizer'))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except Exception:
            print("No checkpoint file was found")
            return defaults

    def step(self, state, action, reward, next_state, done, train=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if train and len(self.memory) > BATCH_SIZE and self.t_step == 0:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            Q_best_action = self.qnetwork_local(next_states).max(1)[1]
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, Q_best_action.unsqueeze(-1))
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
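As a usage sketch, any of the replay-based agents above can be driven by a standard epsilon-greedy training loop. The Gym-style environment API and the decay schedule below are assumptions, not part of the original code:

def train(agent, env, n_episodes=2000, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Hypothetical training loop: act, store, learn, and decay epsilon per episode."""
    scores = []
    eps = eps_start
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)        # classic 4-tuple Gym API (assumed)
            agent.step(state, action, reward, next_state, done)   # store transition, maybe learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                       # anneal exploration
    return scores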
Example #5
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s') tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next)

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def extractPolicy(self):
        policy = np.zeros((9, 9)) - 1
        for a in range(9):
            for h in range(9):
                state = torch.from_numpy(np.asarray([a, h])).float().unsqueeze(0).to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action_values = self.qnetwork_local(state)
                self.qnetwork_local.train()
                max_action = np.argmax(action_values.cpu().data.numpy())
                policy[a,h] = max_action
        return policy

    def processPolicy(self, policy):
        results = ''
        print(policy)
        for a in range(9):
            results += '{} & '.format(a)
            for h in range(9):
                action = policy[a, h]
                assert(action in [0, 1, 2])
                if action == 0:
                    results += '\\ag'
                elif action == 1:
                    results += '\\ob'
                else:
                    results += '\\wt'
                results += ' & '
            results = results[:-2]
            results += '\\\\ \n'
        print(results)
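A short usage sketch for the two policy-inspection helpers above, assuming an already trained agent on the 9x9 state grid they expect:

# Hypothetical usage: dump the greedy policy as rows of LaTeX macros
policy = agent.extractPolicy()   # 9x9 array of greedy action indices (0, 1, or 2)
agent.processPolicy(policy)      # prints each row using the \ag, \ob, \wt macros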
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(DEVICE)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(DEVICE)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory,
            # get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
        # Notify all layers to work in eval mode
        self.qnetwork_local.eval()
        # Deactivate autograd engine -> reduces memory & speeds up computation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # Re-enable train mode in all layers
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        criterion = torch.nn.MSELoss()

        ## Move input and label tensors to correct device
        self.qnetwork_local.to(DEVICE)
        self.qnetwork_target.to(DEVICE)
        inputs = next_states.to(DEVICE)

        ## Select max predicted Q value for next state using the target model
        with torch.no_grad():
            next_target = self.qnetwork_target(inputs)
            next_q_target = next_target.max(1)[0].unsqueeze(1)
        ## Calculate q targets
        target_q = rewards + (gamma * next_q_target * (1 - dones))

        ## Use local model to get the expected Q value
        expected_q = self.qnetwork_local(states).gather(1, actions)

        ## Compute and minimize the loss
        loss = criterion(expected_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, prioritized=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            prioritized (bool): whether to use proportional prioritized experience replay
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.prioritized = prioritized
        if self.prioritized:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)

        # Initialize time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, double=False):
        """Store this step's experience and learn from a sampled batch when due.

        Params
        ======
            state, action, reward, next_state, done: the transition to store
            double (bool): use double-DQN targets in the learning update
        """
        # Save experience to Replay Buffer
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) >= BATCH_SIZE:
                self.learn(self.memory.sample(), GAMMA, double)

    def act(self, state, epsilon=0.0):
        """Get action from on/off policy

        Params
        ======
            state (array_like): current state
            epsilon (float): for epsilon-greedy action selection
        """
        _state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(_state)
        self.qnetwork_local.train()

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, double=False):
        """Learn from sample experiences and update weights for both target and local models

        Params
        ======
            experiences (*array_like): (s, a, r, s', done)
            gamma (float): discount rate
        """
        if self.prioritized:
            states, actions, rewards, next_states, dones, is_weights, sample_idx = experiences
        else:
            states, actions, rewards, next_states, dones = experiences
        self.qnetwork_local.train()
        self.qnetwork_target.eval()
        q_expected = self.qnetwork_local(states).gather(1, actions)

        if double:
            self.qnetwork_local.eval()
            with torch.no_grad():
                _, next_actions = self.qnetwork_local(next_states).max(1)
                q_target_next = self.qnetwork_target(next_states).gather(
                    1,
                    next_actions.unsqueeze(1).long())
            self.qnetwork_local.train()
        else:
            q_target_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        q_target = rewards + gamma * q_target_next * (1 - dones)

        if self.prioritized:
            diff = q_target - q_expected
            loss = 0.5 * torch.pow(diff, 2)  # halved squared TD error
            loss = (is_weights * loss).mean()
            self.memory.update_priority(
                diff.abs().detach().squeeze(1).cpu().data.numpy(), sample_idx)
        else:
            loss = F.mse_loss(q_target, q_expected)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update(self.qnetwork_local, self.qnetwork_target, TAU)

    def update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """

        for local_params, target_params in zip(local_model.parameters(),
                                               target_model.parameters()):
            target_params.data.copy_(tau * local_params.data +
                                     (1. - tau) * target_params.data)

    def save(self, path):
        """Save model parameters.

        Params
        ======
            path (str): path to save a model, torch model with extension of ".pt" or ".pth"
        """
        torch.save(self.qnetwork_local.state_dict(), path)
        print("Model saved as {}".format(path))

    def load(self, path, device='cpu'):
        """Load model parameters.

        Params
        ======
            path (str): path to load a model, torch model with extension of ".pt" or ".pth"
        """
        self.qnetwork_local.load_state_dict(
            torch.load(path, map_location=device))
        print("Model loaded from {} on {}".format(path, device))
class Agent():
    """ An agent to interact with the environment and learn from it """
    def __init__(self, state_size, action_size, seed):
        """ Initialization function. 
        
        Params
        ======
            state_size (int): dim of each state
            action_size (int): dim of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # define Q-Network
        if USE_DUELING_NETWORK:
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed, 128, 32, 64,
                                                  32).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed, 128, 32, 64,
                                                   32).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if USE_PRIORITIZED_REPLAY:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=.6,
                                                  beta=.4,
                                                  beta_scheduler=1.)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed)
        # initial time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # save experience in memory replay
        self.memory.add(state, action, reward, next_state, done)
        # learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # when the memory is full enough
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """ Return actions for given state as per current policy
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.
        
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done, w) tuples
            gamma (float): discount factor
        """

        if USE_PRIORITIZED_REPLAY:
            states, actions, reward, next_states, dones, w = experiences
        else:
            states, actions, reward, next_states, dones = experiences

        with torch.no_grad():
            if USE_DOUBLE_DQN:
                Q_local_next = self.qnetwork_local(next_states).detach().max(
                    1)[1].unsqueeze(1)
                Q_target_next = self.qnetwork_target(next_states).gather(
                    1, Q_local_next)

            else:
                # get max predicted Q values (for next states) from target model
                Q_target_next = self.qnetwork_target(next_states).detach().max(
                    1)[0].unsqueeze(1)
            # compute Q targets for current states
            Q_target = reward + (gamma * Q_target_next * (1 - dones))

        # get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if USE_PRIORITIZED_REPLAY:
            # TD error drives both the importance-weighted loss and the updated priorities
            TD_error = Q_target - Q_expected
            with torch.no_grad():
                self.memory.update_priorities(TD_error.abs().squeeze())

            loss = (w * TD_error.pow(2).squeeze()).mean()
        else:
            # compute loss
            loss = F.mse_loss(Q_expected, Q_target)

        # minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ soft update model parameters.
        theta_target = tau*theta_local + (1 - tau)*theta_target
        
        Params
        ======
            local_model (pytorch model): weight will be copied from
            target-model (pytorch model): weight will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1. - tau) * target_param.data)
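The `DuelingQNetwork(state_size, action_size, seed, 128, 32, 64, 32)` above takes four hidden-layer widths whose exact roles are not visible in this snippet; the sketch below assumes they are the two value-stream and two advantage-stream sizes, which is a guess at the original layout rather than its actual definition:

import torch
import torch.nn as nn

class DuelingQNetwork(nn.Module):
    """Dueling head: Q(s,a) = V(s) + A(s,a) - mean_a A(s,a) (layer layout is an assumption)."""
    def __init__(self, state_size, action_size, seed,
                 val_fc1=128, val_fc2=32, adv_fc1=64, adv_fc2=32):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        # state-value stream V(s)
        self.value = nn.Sequential(
            nn.Linear(state_size, val_fc1), nn.ReLU(),
            nn.Linear(val_fc1, val_fc2), nn.ReLU(),
            nn.Linear(val_fc2, 1))
        # action-advantage stream A(s, a)
        self.advantage = nn.Sequential(
            nn.Linear(state_size, adv_fc1), nn.ReLU(),
            nn.Linear(adv_fc1, adv_fc2), nn.ReLU(),
            nn.Linear(adv_fc2, action_size))

    def forward(self, state):
        value = self.value(state)              # (batch, 1)
        advantage = self.advantage(state)      # (batch, action_size)
        return value + advantage - advantage.mean(dim=1, keepdim=True)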
Example #9
class AgentVanilla():
    """Interacts with and learns from the environment.

    This class implements the deep Q-Learning with experience replay.
    """
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=LR)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        experience = (state, action, reward, next_state, done)
        self.learn(experience, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork.eval()
        with torch.no_grad():
            action_values = self.qnetwork(state)
        self.qnetwork.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experience, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experience (Tuple[torch.Tensor]): (s, a, r, s', done)
            gamma (float): discount factor
        """
        state, action, reward, next_state, done = experience

        with torch.no_grad():
            Q_next = self.qnetwork(torch.Tensor([next_state]).to(device))

        Q_val = reward + gamma * (1 - done) * Q_next.max(dim=1)[0]

        Q1 = self.qnetwork(torch.Tensor([state]).to(device))
        Q_expected = Q1.squeeze()[action].unsqueeze(dim=0)
        loss = F.mse_loss(Q_expected, Q_val.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
Example #10
class AgentEr():
    """Interacts with and learns from the environment.

    This class implements the deep Q-Learning with experience replay.
    """
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork.eval()
        with torch.no_grad():
            action_values = self.qnetwork(state)
        self.qnetwork.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            Q_next = self.qnetwork(next_states)

        Q_vals = rewards + gamma * (1 - dones) * Q_next.max(dim=1)[0].unsqueeze(1)

        Q1 = self.qnetwork(states)
        Q_expected = Q1.gather(dim=1, index=actions)

        loss = F.mse_loss(Q_expected, Q_vals.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class Agent(object):
    def __init__(self, n_states, n_actions, hidden_dim):
        """Agent class that choose action and train

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """

        self.q_local = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)

        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=LEARNING_RATE)

        self.n_states = n_states
        self.n_actions = n_actions

        #  ReplayMemory: trajectory is saved here
        self.replay_memory = ReplayMemory(10000)

    def get_action(self, state, eps, check_eps=True):
        """Returns an action

        Args:
            state : 2-D tensor of shape (n, input_dim)
            eps (float): eps-greedy for exploration

        Returns: int: action index
        """
        global steps_done
        sample = random.random()

        if check_eps == False or sample > eps:
            with torch.no_grad():
                return self.q_local(
                    Variable(state).type(FloatTensor)).data.max(1)[1].view(
                        1, 1)
        else:
            ## return LongTensor([[random.randrange(2)]])
            return torch.tensor([[random.randrange(self.n_actions)]],
                                device=device)

    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): batch of `Transition`
        gamma (float): Discount rate of Q_target
        """

        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)

        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Compute Q(s_t, a): the model computes Q(s_t), then we select the
        # columns corresponding to the actions taken. These are the actions that
        # would have been taken for each batch state according to q_local (current estimate).
        Q_expected = self.q_local(states).gather(1, actions)

        Q_targets_next = self.q_target(next_states).detach().max(1)[0]

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        self.q_local.train(mode=True)
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
        # backpropagation of loss to NN
        loss.backward()
        self.optim.step()

    def soft_update(self, local_model, target_model, tau):
        """ tau (float): interpolation parameter"""

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local, target):
        for target_param, param in zip(target.parameters(),
                                       local.parameters()):
            target_param.data.copy_(param.data)
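The agent above depends on a `Transition` namedtuple and a `ReplayMemory` that are not shown. The field order below follows how the batch is unpacked in its `learn` method, and the `push` method is a hypothetical name for the insertion call, which never appears in the snippet:

import random
from collections import deque, namedtuple

# Field order inferred from Agent.learn above
Transition = namedtuple("Transition",
                        ("state", "action", "reward", "next_state", "done"))

class ReplayMemory:
    """Minimal cyclic store of Transition tuples (a sketch of the interface used above)."""
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))  # hypothetical insertion method

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)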
Example #12
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        #optimizer = tf.train.RMSPropOptimizer(learning_rate= LR)
        optimizer = tf.train.AdamOptimizer(learning_rate=LR)
        self.Qnetwork = QNetwork(state_size=state_size,
                                 action_size=action_size,
                                 optimizer=optimizer,
                                 gamma=GAMMA,
                                 tau=TAU,
                                 minibatch_size=BATCH_SIZE,
                                 neurons_of_layers=NEURONS_OF_LAYERS,
                                 with_bn=WITH_BN)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # ------------------- update target network ------------------- #
            self.Qnetwork.update_target_network()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        if len(state.shape) == 1:
            # make it batch-like
            state = state[np.newaxis, :]

        # Epsilon-greedy action selection
        if random.random() > eps:
            action_values = self.Qnetwork.get_action(state)
            return np.argmax(action_values)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        #states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        current_loss = self.Qnetwork.train(experiences)
Example #13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buf_size,
                 gamma,
                 tau,
                 update_t,
                 lr,
                 batch_size,
                 fc1_units,
                 fc2_units,
                 seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buf_size (int): replay buffer size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            update_t (int): how often to update the network
            lr (float): learning rate
            batch_size (int): minibatch size
            fc1_units (int): number of nodes in first hidden layer of Q network
            fc2_units (int): number of nodes in second hidden layer of Q network
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buf_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        self.batch_size = batch_size
        self.update_t = update_t
        self.gamma = gamma
        self.tau = tau
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_t
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_checkpoint(self, env_name, suffix="", ckpt_path=None):
        if not os.path.exists('checkpoints/'):
            os.makedirs('checkpoints/')
        if ckpt_path is None:
            ckpt_path = "checkpoints/sac_checkpoint_{}_{}".format(
                env_name, suffix)
        print('Saving models to {}'.format(ckpt_path))
        torch.save(
            {
                'policy_state_dict': self.policy.state_dict(),
                'critic_state_dict': self.critic.state_dict(),
                'critic_target_state_dict': self.critic_target.state_dict(),
                'critic_optimizer_state_dict': self.critic_optim.state_dict(),
                'policy_optimizer_state_dict': self.policy_optim.state_dict()
            }, ckpt_path)

    # Load model parameters
    def load_checkpoint(self, ckpt_path, evaluate=False):
        print('Loading models from {}'.format(ckpt_path))
        if ckpt_path is not None:
            checkpoint = torch.load(ckpt_path)
            self.policy.load_state_dict(checkpoint['policy_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.critic_target.load_state_dict(
                checkpoint['critic_target_state_dict'])
            self.critic_optim.load_state_dict(
                checkpoint['critic_optimizer_state_dict'])
            self.policy_optim.load_state_dict(
                checkpoint['policy_optimizer_state_dict'])

            if evaluate:
                self.policy.eval()
                self.critic.eval()
                self.critic_target.eval()
            else:
                self.policy.train()
                self.critic.train()
                self.critic_target.train()
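The SAC class above relies on hard_update and soft_update helpers that are defined elsewhere in the project and not shown in this excerpt. A minimal sketch of what they presumably look like (an assumption, consistent with the soft updates used by the other agents in this file):

def hard_update(target, source):
    """Hedged sketch: copy the source network's weights into the target verbatim."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(target, source, tau):
    """Hedged sketch: Polyak averaging, θ_target = τ*θ_source + (1 - τ)*θ_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)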
Example #15
class PrioritizedAgent:
    '''Interact with and learn from the environment.
    The agent uses prioritized experience replay.
    '''
    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0

        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE,
                                                    seed)

    def act(self, state, mode, epsilon=None):
        '''Returns actions for given state as per current policy.
        
        Params
        ======
            state (array): current state
            mode (string): train or test
            epsilon (float): for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(
            device)  # shape: (1, state_size)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local.forward(state)
        self.qnetwork_local.train()

        if mode == 'test':
            action = np.argmax(action_values.cpu().data.numpy()
                               )  # pull action values from gpu to local cpu

        elif mode == 'train':
            if random.random() <= epsilon:  # random action
                action = random.choice(np.arange(self.action_size))
            else:  # greedy action
                action = np.argmax(action_values.cpu().data.numpy(
                ))  # pull action values from gpu to local cpu

        return action

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                idxes, experiences, is_weights = self.prioritized_memory.sample(
                    device)
                self.learn(experiences,
                           GAMMA,
                           is_weights=is_weights,
                           leaf_idxes=idxes)

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree

        """

        states, actions, rewards, next_states, dones = experiences

        q_local_chosen_action_values = self.qnetwork_local.forward(
            states).gather(1, actions)
        q_target_action_values = self.qnetwork_target.forward(
            next_states).detach()

        if self.is_double_q:
            # Double DQN: select actions with the local network ...
            q_local_next_actions = self.qnetwork_local.forward(
                next_states).detach().max(1)[1].unsqueeze(
                    1)  # shape (batch_size, 1)
            # ... and evaluate them with the target network
            q_target_best_action_values = q_target_action_values.gather(
                1, q_local_next_actions)
        else:
            q_target_best_action_values = q_target_action_values.max(
                1)[0].unsqueeze(1)  # shape (batch_size, 1)

        rewards = rewards.tanh(
        )  # squash rewards into [-1, 1] with tanh (a soft version of the reward clipping in the original paper)
        q_target_values = rewards + gamma * q_target_best_action_values * (
            1 - dones)  # zero value for terminal states

        td_errors = (q_target_values - q_local_chosen_action_values).tanh(
        )  # squash TD errors into [-1, 1] with tanh (a soft version of the error clipping in the original paper)
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(
            leaf_idxes, abs_errors)  # update priorities in SumTree

        loss = (is_weights * (td_errors**2)).mean(
        )  # adjust squared TD loss by Importance-Sampling Weights

        self.running_loss += float(loss.cpu().data.numpy())
        self.training_cnt += 1

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.t_step % UPDATE_EVERY_STEP == 0:
            self.update(self.qnetwork_local, self.qnetwork_target)

    def update(self, local_network, target_network):
        """Hard update model parameters, as indicated in the original paper.

        Params
        ======
            local_network (PyTorch model): weights will be copied from
            target_network (PyTorch model): weights will be copied to
        """
        for local_param, target_param in zip(local_network.parameters(),
                                             target_network.parameters()):
            target_param.data.copy_(local_param.data)
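The PrioritizedMemory / SumTree used by the agent above is not shown in this excerpt. As a rough sketch of the usual proportional prioritized-replay bookkeeping (an assumption, not the project's actual implementation): priorities are p_i = (|δ_i| + ε)^α and the importance-sampling weights are w_i = (N·P(i))^(−β), normalized by their maximum.

import numpy as np

def priorities_from_td_errors(td_errors, eps=1e-2, alpha=0.6):
    """Hedged sketch: proportional priorities p_i = (|delta_i| + eps)^alpha (illustrative constants)."""
    return (np.abs(td_errors) + eps) ** alpha

def importance_sampling_weights(sampled_priorities, total_priority, buffer_len, beta=0.4):
    """Hedged sketch: w_i = (N * P(i))^(-beta), normalized by the maximum weight."""
    probs = sampled_priorities / total_priority     # sampling probabilities P(i)
    weights = (buffer_len * probs) ** (-beta)
    return weights / weights.max()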
Example #16
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS,
                 a=A, initial_beta=INIT_BETA, clip=CLIP, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            clip (float): gradient norm clipping (`None` to disable)
        """
        if kwds != {}:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        if clip: assert isinstance(clip, (int, float)) and clip >= 0

        self.state_size          = state_size
        self.action_size         = action_size
        self.seed                = random.seed(seed)
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.priority_eps        = priority_eps
        self.a                   = a
        self.beta                = initial_beta
        self.clip                = clip

        # Q-Network
        self.qnetwork_local  = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, a, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, self.gamma)
                self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            target = rewards + gamma * (1 - dones) * self.qnetwork_target(next_states)\
                                                         .gather(dim=1, index=self.qnetwork_local(next_states)\
                                                                                  .argmax(dim=1, keepdim=True))

        pred = self.qnetwork_local(states)

        diff = target.sub(pred.gather(dim=1, index=actions))
        new_priorities = diff.detach().abs().add(self.priority_eps).cpu().numpy().reshape((-1,))
        loss = diff.pow(2).mul(is_weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
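Note that self.beta above is passed to memory.sample() but never annealed inside the class; the surrounding training loop presumably raises it toward 1.0 over the course of training. A minimal sketch of such a schedule (a hypothetical helper, not part of the original code):

def linear_beta_schedule(initial_beta, step, total_steps):
    """Hedged sketch: linearly anneal the importance-sampling exponent toward 1.0."""
    fraction = min(1.0, step / float(total_steps))
    return initial_beta + fraction * (1.0 - initial_beta)

# illustrative use in a driver loop (hypothetical):
#   agent.beta = linear_beta_schedule(INIT_BETA, step, total_steps)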
Example #17
class Agent():
    def __init__(self, state_size, action_size, behavior_name, index_player, replay_memory_size=1e4, batch_size=128, gamma=0.99,
                 learning_rate = 1e-3, target_tau=1e-3, update_rate=4, seed=0):
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0

        """
        Now we define two models: 
        (a) one netwoek will be updated every (step % update_rate == 0),
        (b) A target network, with weights updated to equal to equal to the network (a) at a slower (target_tau) rate.
        """

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network =  QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr= self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        # Initialize time step ( for updating every UPDATE_EVERY steps)
        self.t_step = 0
    def load_model(self, path_model, path_target=None):
        # load weights for the online network and, optionally, the target network
        self.network.load_state_dict(torch.load(path_model))
        if path_target is not None:
            self.target_network.load_state_dict(torch.load(path_target))
    def model_step(self, state, action, reward, next_state):
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        # learn every self.update_rate time steps (the counter wraps at 1003,
        # which also drives the periodic target update inside learn())
        self.t_step = (self.t_step + 1) % 1003
        if self.t_step % self.update_rate == 0:
            # if enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def choose_action(self, state, eps = 0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()) # return a number from 0 to action_size
        else:
            return random.choice(np.arange(self.action_size)) # return a number from 0 to action_size

    def learn(self, experiences, gamma,stp):
        states, actions, rewards, next_states = experiences

        # Get Q values from current observations (s,a) using model network
        # get max Q values for (s', a') from target model
        self.network.train()
        Q_sa = self.network(states).gather(1, actions)
        #print(Q_sa)
        Q_sa_prime_target_values = self.target_network(next_states).max(1)[0].to(device).float().detach()
        #Q_sa_prime_targets = Q_sa_prime_target_values.max(1)[0].unsqueeze(1)
        #print(Q_sa_prime_target_values)

        # compute Q targets for current states

        Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1)
        # Compute loss (error)
        criterion = torch.nn.MSELoss(reduction='sum')
        loss = criterion(Q_sa,Q_sa_targets)#F.mse_loss(Q_sa, Q_sa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if(stp%1000==0):
            self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """"
        local_model (PyTorch model): weights will be copied from
        target_model (PyTorch model): weights will be copied to
        tau (float): interpolation parameter
        """

        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
    def Read(self):
        decision_steps, terminal_steps = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(sensor_front_sig(decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(sensor_back_sig(decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8
            self.current_state = np.concatenate((signal_front, signal_back), axis=1)
            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1
            for i in range(len(signal_front[0])):
                if signal_front[0][i][0] == 1.0:  # ball seen by front ray i
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(0.3 * (1 - signal_front[0][i][7]), self.rew_for_ball_dist)
                    if signal_front[0][i][7] <= 0.02:  # close enough to count as a touch
                        count_touch_ball += 1
                if signal_front[0][i][1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if signal_front[0][i][2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            for i in range(len(signal_back[0])):
                if signal_back[0][i][0] == 1.0:  # ball seen by back ray i
                    count_back_close += 1
                    if signal_back[0][i][7] <= 0.03:
                        count_back_touch += 1
            self.back_touch = 1 if count_back_touch > 0 else 0
            self.back_close = 1 if count_back_close > 0 else 0
            # add reward if we kick the ball
            self.touch_ball_reward = 2.5 if count_touch_ball > 0 else 0
            # penalize if the ball is not in view, or is behind us
            self.close_ball_reward = -0.15 if count_close_to_ball == 0 else 0.2
            if count_back_close > 0:
                self.close_ball_reward = -0.15

            return self.current_state
        except Exception:
            self.touch_ball_reward = 0
            self.close_ball_reward = 0
        return self.current_state
    def upd_after_goal(self, n_upds):
        self.memory.upd_goal(n_upds)
    def we_goll(self):
        self.memory.we_goll()
    def us_goll(self):
        self.memory.us_goll()
Example #18
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, action_size, seed, state_size, visual):
        """Initialize an Agent object.
        
        Params
        ======
            action_size (int): dimension of each action
            seed (int): random seed
            state_size (int): dimension of each state. Note this can be None if visual is true
            visual (bool): whether to train the agent on visual pixels or vector observations
        """
        if not visual:
            self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed)
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed)
        self.qnetwork_target = self.qnetwork_target.to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.beta_start = 0.4

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, GAMMA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.batch_no = 0
        self.beta_batch_nos = 50_000
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                beta = min(1.0, self.beta_start + (self.batch_no / self.beta_batch_nos) * (1 - self.beta_start))
                self.batch_no += 1
                experiences = self.memory.sample(beta)
                self._learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def _learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, sample_indices, weight_update_weights = experiences

        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0]
        # Compute Q targets for current states 
        q_targets = rewards.squeeze(1) + (gamma * q_targets_next * (1 - dones.squeeze(1)))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions).squeeze(1)
        # Compute loss
        loss = (q_expected - q_targets.detach()).pow(2) * weight_update_weights
        prios = loss + 1e-5
        loss = loss.mean()
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()

        self.memory.update_priorities(prios.data.cpu().numpy(), sample_indices)

        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #19
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.criterion = torch.nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # print('obtaining experiences')
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        self.optimizer.zero_grad()

        # obtain the estimated q-values of the states from the local network
        # Then obtain the q_value corresponding to the action taken
        estimated_q = self.qnetwork_local(states)[
            range(BATCH_SIZE), actions.view(-1)].view(
                -1, 1
            )  # there is a simpler way to do this with the .gather() method; see the sketch after this class
        # estimated_q = self.qnetwork_local(states).gather(1, actions)

        # print(estimated_q)
        # print(estimated_q.size(0) == states.size(0))

        # now compute the target q-value using the target qnetwork in eval mode
        self.qnetwork_target.eval()
        with torch.no_grad():
            next_q_max = torch.max(self.qnetwork_target(next_states).detach(),
                                   axis=1).values.view(-1, 1)

        self.qnetwork_target.train()

        # if done then next_q_max should be zero
        # print(dones.size())
        # print(next_q_max.size())
        next_q_max *= (1 - dones)

        # target value is the reward plus next_q_max discounted by gamma
        targets = rewards + gamma * next_q_max

        # now compute the loss
        loss = self.criterion(estimated_q, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
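As the comment in learn above notes, the fancy-indexing expression is equivalent to gather. A small, self-contained sketch demonstrating the equivalence with dummy shapes (the values and sizes are illustrative):

import torch

q = torch.randn(4, 3)                          # e.g. a batch of 4 states, 3 actions
actions = torch.tensor([[0], [2], [1], [2]])   # shape (4, 1), like a replay batch
via_indexing = q[range(4), actions.view(-1)].view(-1, 1)
via_gather = q.gather(1, actions)              # picks Q(s, a) for the taken action per row
assert torch.equal(via_indexing, via_gather)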
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"

        # Get max predicted Q values for next states from the target model (frozen weights)
        #
        #    next_states is 64x8
        #    self.qnetwork_target(next_states) is 64x4
        #    detach() returns a tensor copy detached from the graph (no gradient)
        #    max(1)[0] returns the max value along the given dim (max(1)[1] would give the argmax indexes)
        #    => this returns an array of 64 values
        #    unsqueeze(1) inserts a new dimension of size one at the given position
        #    => this returns a 64x1 tensor (a short sanity check follows this class)
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model (being trained)
        # x.gather(1, actions) returns a tensor (on the current device) that picks, for each row,
        # the Q value at the index of the action that was actually taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)    
        
        
        

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
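The shape walk-through in the comments of learn above can be checked with a throwaway tensor; the 64x4 sizes below simply mirror the example given in those comments and are otherwise arbitrary:

import torch

q_next = torch.randn(64, 4)             # batch of 64 next-states, 4 actions (illustrative sizes)
max_q = q_next.detach().max(1)[0]       # max over the action dimension -> shape (64,)
q_targets_next = max_q.unsqueeze(1)     # insert a column dimension    -> shape (64, 1)
assert q_targets_next.shape == (64, 1)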
Example #21
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              self.seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Minimum priority
        self.eps = 0.0001

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        start_time = time.time()
        self.memory.add(state, action, reward, next_state, done)
        # print("Sample add time {:.4f}".format(start_time - time.time()))

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if len(self.memory) > 0 and (self.t_step % UPDATE_EVERY == 0):
            # start_time = time.time()
            experiences = self.memory.sample()
            # print("Sample time {:.4f}".format(start_time - time.time()))
            self.learn(experiences, GAMMA, self.t_step)
            self.memory.updateBeta()

        if self.t_step % 1e3 == 0:
            self.memory.reorder()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, t_step):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices, weights = experiences

        # Get best actions from local network
        target_actions = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)
        # And use them to evaluate the target network
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, target_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # TD error (moved to CPU before converting to numpy for the priority update)
        td_error = (Q_targets - Q_expected).abs().detach().cpu().numpy() + self.eps
        start_time = time.time()
        self.memory.setPriority(indices, td_error)
        # print("Update time {:.4f}".format(start_time - time.time()))

        start_time = time.time()
        # Compute loss
        loss = F.mse_loss(weights * Q_expected, weights * Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # print("Backprop time {:.4f}".format(start_time - time.time()))

        if (t_step % UPDATE_EVERY) == 0:
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
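For comparison with the double-DQN target used above (actions chosen by the local network, values taken from the target network), here is a hedged side-by-side sketch of both target computations on detached tensors; the function names are illustrative, not from the source:

def dqn_target(q_target_next, rewards, gamma, dones):
    """Hedged sketch: plain DQN target, max over the target network's own estimates."""
    max_q = q_target_next.max(1)[0].unsqueeze(1)
    return rewards + gamma * max_q * (1 - dones)

def double_dqn_target(q_local_next, q_target_next, rewards, gamma, dones):
    """Hedged sketch: double DQN target, argmax from the local net, value from the target net."""
    best_actions = q_local_next.argmax(dim=1, keepdim=True)
    return rewards + gamma * q_target_next.gather(1, best_actions) * (1 - dones)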
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss = torch.nn.MSELoss()
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        # note: each element of memory is a single (s, a, r, s', done) transition, not a sequence of them
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
#                 print(experiences)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
#         print("states", states.shape)
#         print("actions", actions)
#         print("rewards", rewards.shape)
#         print("next_states", next_states.shape)
        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        # note: we train the local network; the target network (whose parameters are only
        # updated slowly via soft_update) provides the bootstrap targets, keeping learning stable
        best_action_q_value_target, _ = torch.max(self.qnetwork_target(next_states), 1)
        best_action_q_value_target = best_action_q_value_target.view(-1, 1)
        action_q_values_local = self.qnetwork_local(states).gather(1, actions.view(-1, 1))
        loss = self.loss(action_q_values_local,
                         rewards + gamma * best_action_q_value_target * (1 - dones))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #23
class AgentPR:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBufferPR(action_size, BUFFER_SIZE, BATCH_SIZE,
                                     seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def priority(self, state, action, reward, next_state, done, gamma, alpha):
        """
        Description:
            Calculates priority of a given (state action, reward, next_state, done)-tuple.
            The priority if made non-zero given a positive constant EPS and the significance of the TD error is controlled with the alpha paramteter.
        
        Input:
            state:
            action:
            reward:
            next_state:
            done: 
            gamma: discount factor
            alpha: Weighting factor of the experience replays. I.e. how much should we care about the priorities. alpha=0: not to much. alpha=1: quite a bit.
        """

        # target values
        state = torch.from_numpy(np.vstack([state])).float().to(device)
        action = torch.from_numpy(np.vstack([action])).long().to(device)
        reward = torch.from_numpy(np.vstack([reward])).float().to(device)
        next_state = torch.from_numpy(np.vstack([next_state
                                                 ])).float().to(device)
        done = torch.from_numpy(np.vstack([done]).astype(
            np.uint8)).float().to(device)

        qs_target = self.qnetwork_target(next_state).detach()
        qmax, qmax_index = torch.max(qs_target, axis=1)
        qmax = qmax.unsqueeze(1)
        y = reward + gamma * qmax * (1 - done)
        y_hat = self.qnetwork_local(state).gather(1, action)
        # delta = TD error (used for prioritized replay)
        delta = y - y_hat
        return ((abs(delta) + EPS).detach()**alpha).item()

    def step(self, state, action, reward, next_state, done, beta):

        # Calculate priority of the new sample
        priority = self.priority(state, action, reward, next_state, done,
                                 GAMMA, ALPHA)

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done, priority)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.prioritized_sample()
                #experiences = self.memory.prioritized_sample(ALPHA, EPS)
                self.learn(experiences, GAMMA, ALPHA, beta)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(
                np.int32)  # must be int32 to work with Unity
        else:
            return random.choice(np.arange(self.action_size)).astype(
                np.int32)  # must be int32 to work with Unity

    def loss_function(self, y, y_hat, imp_w):
        """
            imp_w: importance sampling weights for that sample
        """
        return torch.sum(
            imp_w * torch.clamp((y - y_hat).pow(2), 0, 1)
        )  # Following the clipping approach suggested in: https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf

    def learn(self, experiences, gamma, alpha, beta):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            alpha (float): priority exponent (how strongly priorities affect sampling)
            beta (float): bias correction of the importance-sampling weights. beta = 1: full correction; beta = 0: none.
        """
        states, actions, rewards, next_states, dones, priorities, idxs = experiences

        # target values
        qs_target = self.qnetwork_target(next_states).detach()
        qmax, qmax_index = torch.max(qs_target, axis=1)
        qmax = qmax.unsqueeze(1)

        y = rewards + gamma * qmax * (1 - dones)

        y_hat = self.qnetwork_local(states).gather(1, actions)

        # delta = TD error (used for prioritized replay)
        delta = y - y_hat

        # Importance sampling weights
        if self.memory.max_priority:
            imp_w = (BUFFER_SIZE *
                     priorities)**(-beta) / self.memory.max_priority
            #imp_w = (priorities) ** (- BETA) / self.memory.max_priority
        else:
            imp_w = torch.ones_like(y)  # fall back to uniform weights

        imp_w = imp_w.to(device)

        # Set gradients to zero
        self.optimizer.zero_grad()
        # Calculate the loss between target and estimate,
        # adjusted according to the importance-sampling distribution
        loss = self.loss_function(y, y_hat, imp_w)
        # Backpropagation with loss function
        loss.backward()
        # Update weights
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
            
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
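# A minimal sketch (not from the original code) of the standard prioritized
# experience replay importance-sampling weights from Schaul et al. (2016):
#     w_i = (N * P(i)) ** (-beta) / max_j w_j,   P(i) = p_i**alpha / sum_k p_k**alpha
# The learn() method above approximates this by dividing by the maximum priority.
# `priorities` and `all_priorities` are assumed to be 1-D NumPy arrays of positive priorities.
def per_importance_weights(priorities, all_priorities, alpha=0.6, beta=0.4):
    """Return normalized importance-sampling weights for a sampled batch."""
    denom = (all_priorities ** alpha).sum()
    probs = (priorities ** alpha) / denom                     # P(i) for the sampled transitions
    weights = (len(all_priorities) * probs) ** (-beta)
    # normalize by the largest possible weight (the one belonging to the
    # smallest sampling probability) so the weights only scale updates down
    max_weight = (len(all_priorities) * (all_priorities ** alpha).min() / denom) ** (-beta)
    return weights / max_weight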
Example #24
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, learning_rate=LR, update_every=UPDATE_EVERY, discount_factor=GAMMA):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        #saving hyperparams
        self.update_every = update_every
        self.discount_factor = discount_factor

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, 64, 128).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, 64, 128).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

        # Replay memory
        self.memory = PrioretizedReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.loss_track = []

    def eval_action_values(self, state, qnetwork):
        """Helper method: evaluate the given network on a state and return its action values.

        Params
        ==== 
            state (numpy array) - current env state
            qnetwork (QNetwork) - one of the Q networks (qnetwork_local, qnetwork_target)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        qnetwork.eval()  # set the model to inference mode
        with torch.no_grad():
            action_values = qnetwork(state)
        qnetwork.train()  # set the model back to training mode
        return action_values
    
    def load_model_weights(self, weights_file):
        state_dict = torch.load(weights_file)
        self.qnetwork_local.load_state_dict(state_dict)

    def step(self, state, action, reward, next_state, done):

        # calculate the TD error so the experience is stored with the correct priority in the PrioretizedReplayBuffer
        Q_target_vals = self.eval_action_values(state, self.qnetwork_target).cpu().numpy()
        Q_vals = self.eval_action_values(state, self.qnetwork_local).cpu().numpy()[0]
        # terminal transitions have no bootstrap term
        td_error = reward - Q_vals[action] if done else reward + GAMMA * np.max(Q_target_vals) - Q_vals[action]

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done, td_error)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, self.discount_factor)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
#         loss_fn = self.loss_fn

        ## no_grad was used in the commented-out variant below because qnetwork_target's weights are not being trained
#         with torch.no_grad():
#             Q_target = rewards + gamma * (torch.max(self.qnetwork_target(next_states), dim=1)[0].view(64,1))*(1 - dones)
#             Q_target[dones == True] = rewards[dones == True]
#         Q_pred = torch.max(self.qnetwork_local(states), dim=1)[0].view(64,1)

        ## Double Q-Learning implementation
        # Select the highest-value action for each S' using the network under training (argmax on qnetwork_local)
        best_actions_by_local_nn = torch.max(self.qnetwork_local(next_states).detach(), dim=1)[1].unsqueeze(1)
        # Then evaluate each (S', best_action) pair with the target network, whose estimates should be less noisy than qnetwork_local's
        action_values_by_target_nn = self.qnetwork_target(next_states).detach().gather(1, best_actions_by_local_nn)
        # Combine reward and discounted action value into the TD target
        Q_target = rewards + gamma * action_values_by_target_nn * (1 - dones)

        Q_pred = self.qnetwork_local(states).gather(1, actions)

        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_pred, Q_target)

        self.loss_track.append(loss.item())

        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
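# A hedged sketch (not from the original code): instead of computing a TD error
# for every new transition as step() does above, the PER paper inserts new
# transitions with the current maximum priority so each one is replayed at least
# once before its priority is refined. The `priorities` attribute on the buffer
# is an assumption for illustration; only the add() signature mirrors the code above.
def add_with_max_priority(buffer, state, action, reward, next_state, done):
    max_priority = max(buffer.priorities) if len(buffer.priorities) > 0 else 1.0
    buffer.add(state, action, reward, next_state, done, max_priority)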
Example #25
class AgentDQN():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBufferDQN(action_size, BUFFER_SIZE, BATCH_SIZE,
                                      seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                #experiences = self.memory.prioritized_sample(ALPHA, EPS)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(
                np.int32)  # must be int32 to work with Unity
        else:
            return random.choice(np.arange(self.action_size)).astype(
                np.int32)  # must be int32 to work with Unity

    def loss_function(self, y, y_hat):
        """Sum of squared errors between TD targets y and estimates y_hat."""
        return torch.sum((y - y_hat).pow(2))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # target values
        qs_target = self.qnetwork_target(next_states).detach()
        qmax, _ = torch.max(qs_target, dim=1)
        qmax = qmax.unsqueeze(1)

        y = rewards + gamma * qmax * (1 - dones)

        # current estimate for the actions actually taken
        y_hat = self.qnetwork_local(states).gather(1, actions)

        # Set gradients to zero
        self.optimizer.zero_grad()
        # Calculate the loss between target and estimate
        loss = self.loss_function(y, y_hat)
        # Backpropagation with loss function
        loss.backward()
        # Update weights
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
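# A hedged usage sketch (not from the original code): a minimal epsilon-greedy
# training loop around AgentDQN. The `env` object and its reset()/step()
# interface are illustrative assumptions; only act() and step() come from the
# agent class above.
def train(agent, env, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)                       # epsilon-greedy action
            next_state, reward, done = env.step(action)          # assumed env interface
            agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
            state = next_state
            score += reward
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                      # decay exploration rate
    return scores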
Example #26
class ddqn_Agent():
    """Interacts with and learns from the environment."""

    
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.wQ = 0
        self.wQ1 = 0
        self.wQ2 = 0
        # Q-Network
        self.qnetwork_Qa  = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_Qb  = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer_Qa = optim.Adam(self.qnetwork_Qa.parameters(), lr=LR)
        self.optimizer_Qb = optim.Adam(self.qnetwork_Qb.parameters(), lr=LR)

        # Replay memory
        self.memory = ddqn_ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
    def print_result(self):
        somme = self.wQ1 + self.wQ2 + 0.0000001
        print("qQ1=", self.wQ1 / somme, " qQ2=", self.wQ2 / somme)
        
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

                
    def act_a(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    
        self.qnetwork_Qa.eval()
        with torch.no_grad():
            action_values = self.qnetwork_Qa(state)
        self.qnetwork_Qa.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
 

    def act_b(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    
        self.qnetwork_Qb.eval()
        with torch.no_grad():
            action_values = self.qnetwork_Qb(state)
        self.qnetwork_Qb.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        

    def act(self, state, eps=0.):
        # randomly pick which of the two value estimates drives action selection this step
        self.wQ = np.random.choice([0, 1])
        if self.wQ:
            return self.act_a(state, eps)
        else:
            return self.act_b(state, eps)


    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # wQ is 0 or 1, chosen uniformly at random in act(); it decides which
        # network is updated and which one provides the bootstrap target.
        if self.wQ:
            yj = self.qnetwork_Qb(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * yj * (1.0 - dones)

            # Get expected Q values from local model
            Q_expected = self.qnetwork_Qa(states).gather(1, actions)
            # Compute loss: Mean Square Error by element
            loss = F.mse_loss(Q_expected, Q_targets)
  
            # Minimize the loss
            self.optimizer_Qa.zero_grad()
            loss.backward()
            self.optimizer_Qa.step()
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_Qa, self.qnetwork_Qb, TAU)
        else:
            yj = self.qnetwork_Qa(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + gamma * yj * (1.0 - dones)

            # Get expected Q values from local model
            Q_expected = self.qnetwork_Qb(states).gather(1, actions)
            # Compute loss: Mean Square Error by element
            loss = F.mse_loss(Q_expected, Q_targets)
  
            # Minimize the loss
            self.optimizer_Qb.zero_grad()
            loss.backward()
            self.optimizer_Qb.step()
            # ------------------- update target network ------------------- #
            self.soft_update(self.qnetwork_Qb, self.qnetwork_Qa, TAU)      
        
        

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
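# A hedged note (not from the original code): learn() above bootstraps with
# max_a Q_b(s', a) when updating Q_a (and vice versa). The canonical Double
# Q-learning target (van Hasselt, 2010) instead selects the action with the
# network being updated and evaluates it with the other network:
#     y = r + gamma * Q_b(s', argmax_a Q_a(s', a))
# The helper below is an illustrative sketch of that target, assuming the same
# (batch, 1)-shaped rewards and dones tensors as in the class above.
def double_q_target(q_select_net, q_eval_net, rewards, next_states, dones, gamma):
    best_actions = q_select_net(next_states).detach().argmax(1).unsqueeze(1)  # action selection
    q_values = q_eval_net(next_states).detach().gather(1, best_actions)       # action evaluation
    return rewards + gamma * q_values * (1.0 - dones)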
Example #27
class DQNAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double=False,
                 use_dueling=False,
                 use_priority=False,
                 use_noise=False,
                 seed=42):
        """Deep Q-Network Agent
        
        Args:
            state_size (int)
            action_size (int)
            buffer_size (int): Experience Replay buffer size
            batch_size (int)
            gamma (float):
                discount factor, used to balance immediate and future reward
            tau (float): interpolation parameter for the soft update of the target network
            lr (float): neural network learning rate
            update_every (int): how often (in steps) the agent samples from memory and learns
            use_double (bool): whether or not to use double networks improvement
            use_dueling (bool): whether or not to use dueling network improvement
            use_priority (bool): whether or not to use priority experience replay
            use_noise (bool): whether or not to use noisy nets for exploration
            seed (int)
        """

        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double = use_double
        self.use_dueling = use_dueling
        self.use_priority = use_priority
        self.use_noise = use_noise

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # Q-Network (dueling architecture is optional)
        network_cls = DuelingQNetwork if use_dueling else QNetwork
        self.qn_local = network_cls(state_size, action_size,
                                    noisy=use_noise).to(device)
        self.qn_target = network_cls(state_size, action_size,
                                     noisy=use_noise).to(device)

        # Initialize target model parameters with local model parameters
        self.soft_update(1.0)

        # TODO: make the optimizer configurable
        self.optimizer = optim.Adam(self.qn_local.parameters(), lr=lr)

        if use_priority:
            self.memory = PrioritizedReplayBuffer(buffer_size, batch_size)
        else:
            self.memory = ReplayBuffer(buffer_size, batch_size)

        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Step performed by the agent 
        after interacting with the environment and receiving feedback
        
        Args:
            state (int)
            action (int)
            reward (float)
            next_state (int)
            done (bool)
        """

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:

                if self.use_priority:
                    experiences, indices, weights = self.memory.sample()
                    self.learn(experiences, indices, weights)
                else:
                    experiences = self.memory.sample()
                    self.learn(experiences)

    def act(self, state, eps=0.):
        """Given a state what's the next action to take
        
        Args:
            state (int)
            eps (float):
                probability of taking a random exploratory action instead of the greedy one
        
        Returns:
            int: action to take
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qn_local.eval()
        with torch.no_grad():
            action_values = self.qn_local(state)
        self.qn_local.train()

        if self.use_noise:
            return np.argmax(action_values.cpu().numpy())
        else:
            # Epsilon-greedy action selection
            if random.random() > eps:
                return np.argmax(action_values.cpu().numpy())
            else:
                return random.choice(np.arange(self.action_size))

    def learn(self, experiences, indices=None, weights=None):
        """Use a batch of experiences to calculate TD errors and update Q networks
        
        Args:
            experiences: tuple with state, action, reward, next_state and done
            indices (Numpy array): 
                array of indices to update priorities (only used with PER)
            weights (Numpy array): 
                importance-sampling weights (only used with PER)
        """

        states = torch.from_numpy(
                np.vstack([e.state for e in experiences if e is not None]))\
                .float().to(device)
        actions = torch.from_numpy(
                np.vstack([e.action for e in experiences if e is not None]))\
                .long().to(device)
        rewards = torch.from_numpy(
                np.vstack([e.reward for e in experiences if e is not None]))\
                .float().to(device)
        next_states = torch.from_numpy(
                np.vstack([e.next_state for e in experiences if e is not None]))\
                .float().to(device)
        dones = torch.from_numpy(
                np.vstack([e.done for e in experiences if e is not None])\
                .astype(np.uint8)).float().to(device)

        if self.use_priority:
            weights = torch.from_numpy(np.vstack(weights)).float().to(device)

        if self.use_double:  # uses Double Deep Q-Network

            # Get the best action using local model
            best_action = self.qn_local(next_states).argmax(-1, keepdim=True)

            # Evaluate the action using target model
            max_q = self.qn_target(next_states).detach().gather(
                -1, best_action)

        else:  # normal Deep Q-Network

            # Get max predicted Q value (for next states) from target model
            max_q = self.qn_target(next_states).detach().max(-1,
                                                             keepdim=True)[0]

        # Compute Q targets for current states
        q_targets = rewards + (self.gamma * max_q * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qn_local(states).gather(-1, actions)

        # Compute loss...
        if self.use_priority:
            # Calculate TD error to update priorities
            weighted_td_errors = weights * (q_targets - q_expected)**2
            loss = weighted_td_errors.mean()
        else:
            loss = F.mse_loss(q_expected, q_targets)

        # ...and minimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.use_priority:
            self.memory.update(indices,
                               weighted_td_errors.detach().cpu().numpy())

        # Update target network
        self.soft_update(self.tau)

    def soft_update(self, tau):
        """Soft update model parameters:
            θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            tau (float): interpolation parameter; tau=1.0 copies the local
                weights into the target network (used at initialization)
        """

        for target_param, local_param in zip(self.qn_target.parameters(),
                                             self.qn_local.parameters()):
            target_param.data.copy_(tau * local_param +
                                    (1.0 - tau) * target_param)

    def make_filename(self, filename):
        filename = 'noisy_' + filename if self.use_noise else filename
        filename = 'dueling_' + filename if self.use_dueling else filename
        filename = 'double_' + filename if self.use_double else filename
        filename = 'prioritized_' + filename if self.use_priority else filename

        return filename

    def save_weights(self, filename='local_weights.pth', path='weights'):
        filename = self.make_filename(filename)
        torch.save(self.qn_local.state_dict(), '{}/{}'.format(path, filename))

    def load_weights(self, filename='local_weights.pth', path='weights'):
        self.qn_local.load_state_dict(
            torch.load('{}/{}'.format(path, filename)))

    def summary(self):
        print('DQNAgent:')
        print('========')
        print('')
        print('Using Double:', self.use_double)
        print('Using Dueling:', self.use_dueling)
        print('Using Priority:', self.use_priority)
        print('Using Noise:', self.use_noise)
        print('')
        print(self.qn_local)
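# A hedged usage sketch (not from the original code): building a DQNAgent with
# the optional improvements switched on. The function name and the state/action
# sizes are illustrative only.
def build_full_dqn_agent(state_size=37, action_size=4):
    agent = DQNAgent(state_size, action_size,
                     use_double=True, use_dueling=True, use_priority=True)
    agent.summary()   # prints which improvements are active and the local network
    return agent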
Example #28
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.loss = None
        self.loss_list = None
        self.exp = None

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
#         self.memory.add2(state, action, reward, next_state, done,None)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            #### Original
#             if len(self.memory) > BATCH_SIZE:
#                 experiences = self.memory.sample()
#                 self.learn(experiences, GAMMA)
            #### Testing
            if len(self.memory.memory2) > BATCH_SIZE:
                experiences = self.memory.sample2()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.loss = loss
        # per-sample absolute TD error, stored back with each experience as its priority
        loss_list = ((Q_expected - Q_targets) ** 2) ** 0.5
        self.loss_list = loss_list
        for i in range(len(states)):
            self.memory.add2(states[i], actions[i], rewards[i], next_states[i],
                             dones[i], loss_list[i].detach().cpu().numpy())


        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:

            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## compute and minimize the loss (standard DQN update, mirroring the other agents in this file)
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Minimize the mean squared TD error
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #31
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        hidden_layers = [128,64]
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        
        # Initialise the target and local networks with the same weights
        self.hard_update(self.qnetwork_local, self.qnetwork_target)
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
    
    
    def update(self):
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if np.random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## Double DQN: select the greedy action with the local network, evaluate it with the target network
        
        max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        output_target = self.qnetwork_target(next_states).detach().gather(1, max_actions)
        td_target = rewards + gamma * (output_target * (1 - dones))
        output_local = self.qnetwork_local(states).gather(1, actions)
        
        loss = F.mse_loss(output_local,td_target)
        
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def hard_update(self, local_model, target_model):
        """Copy the local network's parameters into the target network."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param)
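    # A hedged sketch (not from the original code): an equivalent hard update
    # that also copies any registered buffers (e.g. batch-norm running stats,
    # if the network had them), using load_state_dict.
    def hard_update_via_state_dict(self, local_model, target_model):
        """Copy all parameters and buffers from local_model into target_model."""
        target_model.load_state_dict(local_model.state_dict())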