Example 1
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN
        # Local network picks action
        next_action = self.qnetwork_local(next_states).detach().argmax(
            1).unsqueeze(1)
        # Target network estimates the value of said action
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, next_action)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example 2
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_episodes, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            num_episodes (int): number of training episodes
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.anneal_beta = (1. - BETA) / num_episodes

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t_learning_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def update_weights(self):
        self.memory.anneal_beta(self.anneal_beta)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idxs, weights = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update priorities
        updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy()
        self.memory.update_priorities(idxs, updates)

        # Compute per-sample loss so the importance-sampling weights apply element-wise
        loss = F.l1_loss(Q_expected, Q_targets, reduction='none')

        # Minimize the weighted loss
        self.optimizer.zero_grad()
        (loss * weights.view(-1, 1)).mean().backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.t_learning_step += 1
        if self.t_learning_step % UPDATE_TARGET_STEPS == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            # PyTorch in-place copy: destination.data.copy_(source.data)
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
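These snippets also rely on module-level constants and a global `device` defined outside the classes. The exact values are project-specific; the following is only a plausible configuration, shown so the examples can be read in context.

import torch

BUFFER_SIZE = int(1e5)       # replay buffer size
BATCH_SIZE = 64              # minibatch size
GAMMA = 0.99                 # discount factor
TAU = 1e-3                   # soft-update interpolation factor
LR = 5e-4                    # Adam learning rate
UPDATE_EVERY = 4             # environment steps between learning updates
UPDATE_TARGET_STEPS = 4      # learning steps between target updates (Example 2)
ALPHA = 0.6                  # PER: priority exponent
BETA = 0.4                   # PER: initial importance-sampling exponent
BETA_START = 0.4             # PER: starting beta (Example 4)
BETA_STEPS = 25000           # PER: annealing steps for beta (Example 4)
EPSILON = 1e-5               # PER: small constant added to priorities

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")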
Example 3
class DDDQNPolicy(Policy):
    """Dueling Double DQN policy"""
    def __init__(self,
                 state_size,
                 action_size,
                 parameters,
                 evaluation_mode=False):
        self.evaluation_mode = evaluation_mode

        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = True
        self.hidsize = 1

        if not evaluation_mode:
            self.hidsize = parameters.hidden_size
            self.buffer_size = parameters.buffer_size
            self.batch_size = parameters.batch_size
            self.update_every = parameters.update_every
            self.learning_rate = parameters.learning_rate
            self.tau = parameters.tau
            self.gamma = parameters.gamma
            self.buffer_min_size = parameters.buffer_min_size

        # Device
        if parameters.use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda:0")
            print(" Using GPU")
            print(" GPU")

        else:
            self.device = torch.device("cpu")
            print(" Using CPU")

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size,
                                              action_size,
                                              hidsize1=self.hidsize,
                                              hidsize2=self.hidsize).to(
                                                  self.device)

        if not evaluation_mode:
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.learning_rate)
            self.memory = ReplayBuffer(action_size, self.buffer_size,
                                       self.batch_size, self.device)

            self.t_step = 0
            self.loss = 0.0

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        assert not self.evaluation_mode, "Policy has been initialized for evaluation only."

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.buffer_min_size and len(
                    self.memory) > self.batch_size:
                self._learn()

    def _learn(self):
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN
            q_best_action = self.qnetwork_local(next_states).max(1)[1]
            q_targets_next = self.qnetwork_target(next_states).gather(
                1, q_best_action.unsqueeze(-1))
        else:
            # DQN
            q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        # Compute loss
        self.loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # Update target network
        self._soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def _soft_update(self, local_model, target_model, tau):
        # Soft update model parameters.
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, filename):
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(
                torch.load(filename + ".local",
                           map_location=torch.device('cpu')))
            print('local')
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(
                torch.load(filename + ".target",
                           map_location=torch.device('cpu')))
            print('target')
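A brief usage sketch for the save/load helpers above. The parameter values, state/action sizes, and filename are illustrative assumptions; the sketch relies on the same externally defined DuelingQNetwork and ReplayBuffer as the class itself.

import numpy as np
from types import SimpleNamespace

params = SimpleNamespace(hidden_size=128, buffer_size=int(1e5), batch_size=64,
                         update_every=4, learning_rate=5e-4, tau=1e-3,
                         gamma=0.99, buffer_min_size=0, use_gpu=False)

policy = DDDQNPolicy(state_size=231, action_size=5, parameters=params)
# ... training ...
policy.save("run_001")               # writes run_001.local and run_001.target

restored = DDDQNPolicy(state_size=231, action_size=5, parameters=params)
restored.load("run_001")             # restores both networks if the files exist
action = restored.act(np.zeros(231, dtype=np.float32), eps=0.0)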
Example 4
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):  #, writer):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # TODO: Swap ReplayBuffer for PER buffer
        # Replay memory
        #         self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.memory = PrioritisedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, ALPHA, EPSILON)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.beta = BETA_START


        # self.writer = writer

    def step(self, state, action, reward, next_state, done):
        # calculate error, and store experience in replay buffer accordingly

        #         next_actions = self.qnetwork_local(next_states).max(1).indices.unsqueeze(1)
        #         Q_targets_next = self.qnetwork_target(next_states).detach().max(1).values.unsqueeze(1) # << [64,1] of max Q values
        #         Q_targets_next = self.qnetwork_target(next_states).gather(1, next_actions)
        #         Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        #         Q_expected = self.qnetwork_local(states).gather(1, actions) # gather uses the actions as indices to select the Qs
        # get next action from qnetwork_local, using next_state
        # get next reward using next action, from qnetwork_target
        # calc. target: reward + (gamma * next reward * done mask)
        # get expected from qnetwork_local, for current state/action
        s = torch.tensor([state]).float().to(device)
        ns = torch.tensor([next_state]).float().to(device)

        next_action = self.qnetwork_local(ns).max(1).indices.unsqueeze(1)
        next_reward = self.qnetwork_target(ns).detach()[0, next_action]

        target = reward + (GAMMA * next_reward * (1 - done))
        expected = self.qnetwork_local(s).detach()[0, action]
        error = torch.abs(target - expected).cpu().detach()

        self.memory.add(error, state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1

        #         self.writer.add_scalar('Timestep Error', error, self.t_step)

        if (self.t_step % UPDATE_EVERY) == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.beta += (
                    (1 - self.beta) / BETA_STEPS
                )  # anneal the beta, from a starting value towards 1.0
                self.beta = np.min([1., self.beta])

                experiences = self.memory.sample(self.beta)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        #         states, actions, rewards, next_states, dones = experiences
        states, actions, rewards, next_states, dones, weights, idxs = experiences

        next_actions = self.qnetwork_local(next_states).max(
            1).indices.unsqueeze(1)

        #         Q_targets_next = self.qnetwork_target(next_states).detach().max(1).values.unsqueeze(1) # << [64,1] of max Q values
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(
            1, actions)  # gather uses the actions as indices to select the Qs

        # refresh errors in replay buffer
        errors = torch.abs(Q_expected - Q_targets).cpu().detach()
        for (idx, error) in zip(idxs, errors):
            self.memory.update(idx, error)

        loss = (weights.detach().view(-1, 1) *
                F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()  # importance-weighted loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
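For context, agents like the ones above are usually driven by an epsilon-greedy training loop of roughly the following shape. The `env` interface (gym-style reset/step) and the decay schedule are assumptions, not part of the example.

def train(agent, env, n_episodes=2000, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Generic training loop: act, store/learn via agent.step, decay epsilon per episode."""
    scores = []
    eps = eps_start
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)                   # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)   # gym-style 4-tuple
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                  # anneal exploration
    return scores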
Example 5
class Agent():
    """ Interacts with and learns from the environment.

    This agent implements a few improvements over the vanilla DQN, making it
    a Double Dueling Deep Q-Learning Network with Prioritized Experience Replay.

    * Deep Q-Learning Network:  RL where a deep learning network
      is used for the Q-network estimate.
    * Double DQN:  The local network from DQN is used to select the
      optimal action during learning, but the policy estimate for
      that action is computed using the target network.
    * Dueling DQN:  The deep learning network explicitly estimates
      the value function and the advantage functions separately.
    * DQN-PER:  Experiences are associated with a probability weight
      based upon the absolute error between the estimated Q-value
      and the target Q-value at time of estimation -- prioritizing
      experiences that help learn more.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learn_rate=5e-4,
                 update_every=4,
                 per_epsilon=1e-5,
                 per_alpha=0.6,
                 per_beta=0.9,
                 device=DEFAULT_DEVICE,
                 seed=0):
        """ Initialize an object.

        :param state_size:  (int) Dimension of each state
        :param action_size:  (int) Dimension of each action
        :param buffer_size:  (int) Replay buffer size
        :param batch_size:  (int) Minibatch size used during learning
        :param gamma:  (float) Discount factor
        :param tau:  (float) Scaling parameter for soft update
        :param learn_rate:  (float) Learning rate used by optimizer
        :param update_every:  (int) Steps between updates of target network
        :param per_epsilon:  (float) PER hyperparameter, constant added to each error
        :param per_alpha:  (float) PER hyperparameter, exponent applied to each probability
        :param per_beta:  (float) PER hyperparameter, bias correction exponent for probability weight
        :param device:  (torch.device)  Object representing the device where to allocate tensors
        :param seed:  (int) Seed used for PRNG
        """
        # Save copy of model parameters
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device

        # Save copy of hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.learn_rate = learn_rate
        self.update_every = update_every
        self.per_epsilon = per_epsilon
        self.per_alpha = per_alpha
        self.per_beta = per_beta

        # Q networks
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=learn_rate)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(memory_size=buffer_size,
                                              device=device,
                                              update_every=update_every,
                                              seed=seed)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0
        self.episode = 0

    def step(self, state, action, reward, next_state, done):
        """ Store a single agent step, learning every N steps

        :param state: (array-like) Initial state on the visit
        :param action: (int) Action on the visit
        :param reward: (float) Reward received on the visit
        :param next_state:  (array-like) State reached after the visit
        :param done:  (bool) Flag whether the next state is a terminal state
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample(batch_size=self.batch_size,
                                                 alpha=self.per_alpha,
                                                 beta=self.per_beta)
                self.learn(experiences)

        # Keep track of episode number
        if done:
            self.episode += 1

    def act(self, state, eps=0.):
        """ Returns the selected action for the given state according to the current policy

        :param state: (array_like) Current state
        :param eps: (float) Epsilon, for epsilon-greedy action selection
        :return: action (int)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        # Convert types to np.int32 for compatibility with environment
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int32)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int32)

    def learn(self, experiences):
        """ Update value parameters using given batch of indexed experience tuples

        :param experiences:  (Tuple[torch.Tensor, np.array]) (s, a, r, s', done, index) tuples
        """
        states, actions, rewards, next_states, dones, indexes = experiences

        # Get max predicted Q values (for next states) from target model

        # Double DQN: use local network to select action with maximum value,
        # then use target network to get Q value for that action
        Q_next_indices = self.qnetwork_local(next_states).detach().argmax(
            1).unsqueeze(1)
        Q_next_values = self.qnetwork_target(next_states).detach()
        Q_targets_next = Q_next_values.gather(1, Q_next_indices)

        # Compute Q target for current states
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute estimation error (for Prioritized Experience Replay) and update weights
        Q_error = (torch.abs(Q_expected.detach() - Q_targets.detach()) +
                   self.per_epsilon).squeeze()
        self.memory.update(indexes, Q_error)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
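Example 5 delegates the prioritisation math to `PrioritizedReplayBuffer`, which is not shown. For reference, a proportional-PER buffer of this kind samples index i with probability P(i) = p_i^α / Σ_k p_k^α, where p_i = |δ_i| + ε, and corrects the bias with importance-sampling weights w_i = (N · P(i))^(−β) normalised by their maximum. A standalone NumPy sketch of that sampling step (assumed behaviour, not the buffer's actual code):

import numpy as np

def per_sample(priorities, batch_size, alpha=0.6, beta=0.9, seed=0):
    """Sample indices proportionally to priority and return normalised IS weights."""
    rng = np.random.default_rng(seed)
    p = np.asarray(priorities, dtype=np.float64) ** alpha
    probs = p / p.sum()                                    # P(i) = p_i^alpha / sum_k p_k^alpha
    idx = rng.choice(len(probs), size=batch_size, p=probs)
    weights = (len(probs) * probs[idx]) ** (-beta)         # (N * P(i))^-beta
    weights /= weights.max()                               # normalise so the largest weight is 1
    return idx, weights

# priorities would be |TD error| + per_epsilon for each stored transition
idx, w = per_sample([0.5, 1.2, 0.1, 2.0], batch_size=2)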
Example 6
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_sizes=[64, 64],
                 flavor='plain'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_sizes (list): list of neurons in each layer
            flavor (str): flavor of the network - plain, double, dueling, double-dueling
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.hidden_sizes = hidden_sizes
        self.flavor = flavor

        # Q-Network
        if self.flavor == 'plain' or self.flavor == 'double':
            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           hidden_sizes).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            hidden_sizes).to(device)
        # Dueling Q-Network
        if self.flavor == 'dueling' or self.flavor == 'double-dueling':
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed,
                                                  hidden_sizes).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed,
                                                   hidden_sizes).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def show_network(self):
        x = torch.randn(1, self.state_size).to(device)
        y = self.qnetwork_local(x)
        return make_dot(y,
                        params=dict(
                            list(self.qnetwork_local.named_parameters())))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Best actions ...
        if self.flavor == 'plain' or self.flavor == 'dueling':
            # ... according to target model for standard DQN
            best_actions = self.qnetwork_target(next_states).detach().argmax(
                dim=1).unsqueeze(1)
        if self.flavor == 'double' or self.flavor == 'double-dueling':
            # ... according to local model for Double DQN
            best_actions = self.qnetwork_local(next_states).detach().argmax(
                dim=1).unsqueeze(1)
        # Maximal predicted Q value for next state from target model
        Q_t_max = self.qnetwork_target(next_states).gather(
            1, best_actions).detach()
        # Q targets for current state
        Q_t = rewards + (1 - dones) * gamma * Q_t_max
        # Expected Q values of local model
        Q_exp = self.qnetwork_local(states).gather(1, actions)
        # Loss function
        loss = F.mse_loss(Q_exp, Q_t)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
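Example 6 also refers to a plain `QNetwork` for the 'plain' and 'double' flavors. A minimal sketch compatible with the `(state_size, action_size, seed, hidden_sizes)` signature used above (layer choices are assumptions):

import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Plain feed-forward Q-network: maps a state to one Q-value per action."""
    def __init__(self, state_size, action_size, seed, hidden_sizes=[64, 64]):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        sizes = [state_size] + list(hidden_sizes)
        self.hidden = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes[:-1], sizes[1:])])
        self.out = nn.Linear(sizes[-1], action_size)

    def forward(self, state):
        x = state
        for layer in self.hidden:
            x = F.relu(layer(x))
        return self.out(x)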
Example 7
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # Initialize learning step for updating beta
        self.learn_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get prioritized subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, BETA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Choose action values according to local model
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, beta):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            beta (float): reliance of importance sampling weight on priortization
        """

        # Beta will reach 1 after 25,000 training steps (~325 episodes)
        b = min(1.0, beta + self.learn_step * (1.0 - beta) / 25000)
        self.learn_step += 1
        
        states, actions, rewards, next_states, dones, probabilities, indices = experiences

#         # Get max predicted actions (for next states) from local model
#         next_local_actions = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
#         # Evaluate the max predicted actions from the local model on the target model
#         # based on Double DQN
#         Q_targets_next_values = self.qnetwork_target(next_states).detach().gather(1, next_local_actions)
#         # Compute Q targets for current states
#         Q_targets = rewards + (gamma * Q_targets_next_values * (1 - dones))

#         # Get expected Q values from local
#         Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        # Double DQN
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        next_actions = self.qnetwork_local(next_states).argmax(-1, keepdim=True)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(-1, next_actions)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)

        # Compute and update new priorities
        new_priorities = (abs(Q_expected - Q_targets) + 0.2).detach()
        self.memory.update_priority(new_priorities, indices)

        # Compute importance sampling weights, normalised by the maximum weight
        ISweights = (((1 / len(self.memory)) * (1 / probabilities)) ** b)
        ISweights /= torch.max(ISweights)

        # Compute loss, applying the IS weights once to the per-sample errors
        loss = (ISweights * F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        self.last_loss = loss
        
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.model = DuelingQNetwork(state_size, action_size, seed).to(device)
#         self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
#         for target_param, param in zip(self.qnetwork_local.parameters(),self.qnetwork_target.parameters()):
#             target_param.data.copy_(param)
            
        self.optimizer = optim.Adam(self.model.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)
        self.model.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)
        
#         curr_Q = self.qnetwork_local.forward(states).gather(1, actions)
#         next_Q = self.qnetwork_target.forward(next_states)
#         max_next_Q = torch.max(next_Q, 1)[0]
#         max_next_Q = max_next_Q.view(max_next_Q.size(0), 1)
#         expected_Q = rewards + (1 - dones) * gamma * max_next_Q
#         loss = F.mse_loss(curr_Q, expected_Q.detach())
        
        curr_Q = self.model(states).gather(1, actions)
        next_Q = self.model(next_states).detach()
        max_next_Q = torch.max(next_Q, 1)[0].unsqueeze(1)
        expected_Q = rewards + (1 - dones) * gamma * max_next_Q
        loss = F.mse_loss(curr_Q, expected_Q)


        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
            
#         self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)   
#         for target_param, param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
#             target_param.data.copy_(TAU * param + (1 - TAU) * target_param)


    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
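Several of the examples construct `ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)` without showing it. A minimal uniform-sampling buffer matching that call signature (field layout and tensor conversion are assumptions) could look like this:

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)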