Example #1
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-2,
                 lr_critic=1e-2,
                 fc1_units=128,
                 fc2_units=128,
                 buffer_size=int(1e6),
                 batch_size=50,
                 gamma=0.95,
                 tau=1e-2,
                 max_norm=1.0,
                 learn_period=100,
                 learn_sampling_num=50,
                 adam_critic_weight_decay=0.0,
                 name=None,
                 exploration_mu=0.0,
                 exploration_sigma=0.2,
                 exploration_theta=0.15,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay=0.99):
        """Initialize an Agent object.
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        self.actor_local = DDPGActorVersion1(state_size,
                                             action_size,
                                             random_seed,
                                             fc1_units=fc1_units,
                                             fc2_units=fc2_units).to(device)

        self.actor_target = DDPGActorVersion1(state_size,
                                              action_size,
                                              random_seed,
                                              fc1_units=fc1_units,
                                              fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCriticVersion1(state_size,
                                               action_size,
                                               random_seed,
                                               fcs1_units=fc1_units,
                                               fc2_units=fc2_units).to(device)

        self.critic_target = DDPGCriticVersion1(state_size,
                                                action_size,
                                                random_seed,
                                                fcs1_units=fc1_units,
                                                fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=adam_critic_weight_decay)
        # Ornstein-Uhlenbeck noise process for action exploration

        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)

        self.noise = OUNoise(action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory

        #self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        self.memory = PrioritizedReplayBuffer(action_size, buffer_size,
                                              batch_size, random_seed, device)

        # Prioritized Replay Buffer Params
        #self.a, self.b = 0.7, 0.5   # rank-based variant
        self.a, self.b = 0.6, 0.4  # proportional variant

        self.e = 1e-3  # 0.01 * (reward of each time step) = 0.01 * 0.1

        # parameter of discounted reward
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

        self.name = name

        self.time_step = 0
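
The constructor above builds its exploration noise with OUNoise(action_size, mu, theta, sigma). A minimal sketch of such an Ornstein-Uhlenbeck noise process follows; only the constructor signature is taken from the call above, the internals are an assumption.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; internals are assumed)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state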
Example #2
    def train(self,
              episode=50000,
              batch_size=32,
              episode_step=10000,
              random_step=50000,
              min_greedy=0.0,
              max_greedy=0.9,
              greedy_step=1000000,
              test_step=1000,
              update_period=10000,
              train_frequency=4,
              test_eps_greedy=0.05,
              test_period=10):

        self._buffer = PrioritizedReplayBuffer([1],
                                               self._state_size,
                                               self._imsize,
                                               0,
                                               episode,
                                               buffer_size=self.buffer_size)

        LOG_EVERY_N_STEPS = 1000
        logger = Logger(self.log_dir)

        state = self.env.reset()
        steps = 0

        eps_greedy = min_greedy
        g_step = (max_greedy - min_greedy) / greedy_step

        for i in range(random_step):
            action = self.env.action_space.sample()
            next_state, reward, terminal, _ = self.env.step(action)
            self._buffer.store(state, np.array(action), np.array(reward),
                               next_state, np.array(terminal))
            state = next_state

            if terminal:
                state = self.env.reset()

        for e in range(episode):
            loss = 0
            total_reward = 0
            state = self.env.reset()

            train_one_episode_reward = []
            train_each_episode_reward = []
            test_one_episode_reward = []
            test_each_episode_reward = []
            bar = tqdm()

            for j in range(episode_step):
                if np.random.rand() < eps_greedy:
                    img = Image.fromarray(state).convert('L')
                    x = np.array(img.resize(self._imsize))
                    x = torch.from_numpy(
                        np.expand_dims(x[None, ...].astype(np.float64),
                                       axis=3).transpose(
                                           (0, 3, 1, 2))).type(dtype)
                    action = self.q_net(x / 255.).max(1)[1].item()
                    # take an action maximizing Q function
                else:
                    action = self.env.action_space.sample()

                eps_greedy += g_step
                eps_greedy = np.clip(eps_greedy, min_greedy, max_greedy)

                next_state, reward, terminal, _ = self.env.step(action)

                total_reward += reward
                train_each_episode_reward.append(reward)
                self._buffer.store(state, np.array(action), np.array(reward),
                                   next_state, np.array(terminal))
                train_one_episode_reward.append(reward)
                state = next_state

                if len(self._buffer) > batch_size and j % train_frequency == 0:
                    train_prestate, train_action, train_reward, train_state, train_terminal, _ = self._buffer.get_minibatch(
                        batch_size)

                    ## Double DQN target: the online network selects the
                    ## next action, the target network evaluates it
                    x = torch.from_numpy(
                        np.expand_dims(train_state, axis=3).transpose(
                            (0, 3, 1, 2)) / 255.).type(dtype)
                    prior_action = self.q_net(x).max(1)[1].detach()
                    next_q_value = self.target_q_net(x)
                    next_q_value = next_q_value.gather(
                        1,
                        prior_action.view(-1, 1).type(dlongtype)).view(-1)
                    non_terminal = torch.from_numpy(
                        (~train_terminal).astype(np.float64)).type(dtype)
                    target = torch.from_numpy(train_reward).type(
                        dtype) + next_q_value * self.gamma * non_terminal
                    target = target.detach()
                    # -----

                    train_prestate = torch.from_numpy(
                        np.expand_dims(train_prestate, axis=3).transpose(
                            (0, 3, 1, 2)) / 255.).type(dtype)
                    z = self.q_net(
                        Variable(train_prestate,
                                 requires_grad=True).type(dtype))
                    z = z.gather(1, torch.Tensor(train_action).type(dlongtype))
                    l = self.loss_func(
                        z,
                        Variable(torch.tensor(target.reshape(
                            (-1, 1)))).type(dtype))
                    self.opt.zero_grad()
                    l.backward()
                    for param in self.q_net.parameters():
                        param.grad.data.clamp_(-1, 1)
                    self.opt.step()

                    loss += l.cpu().detach().numpy()

                    msg = "episode {:03d} each step reward:{:5.3f}".format(
                        e, total_reward)
                    bar.set_description(msg)
                    bar.update(1)

                if steps % update_period == 0:
                    self.update_params()

                info = {
                    'reward_per_1000_steps': total_reward,
                }
                if steps % LOG_EVERY_N_STEPS == 0:
                    for tag, value in info.items():
                        logger.scalar_summary(tag, value, steps + 1)
                if terminal:
                    break
                steps += 1

            state = self.env.reset()
            self.train_reward_list.append(total_reward)
            self.train_error_list.append(float(loss) / (j + 1))
            best_reward = np.max(self.train_reward_list)
            if len(self.train_reward_list) > 100:
                mean_reward = np.mean(self.train_reward_list[-100:])
                if (best_reward != -float('inf')):
                    info = {
                        'mean_episode_reward_last_100': mean_reward,
                        'best_mean_episode_reward': best_reward
                    }

                    for tag, value in info.items():
                        logger.scalar_summary(tag, value, e + 1)
            else:
                mean_reward = np.mean(self.train_reward_list)

            msg = (
                "episode {:03d} avg_loss:{:6.3f} total_reward [train:{:5.3f} test:-] average_reward {:5.3f} best_reward {:5.3f} e-greedy:{:5.3f}"
                .format(e,
                        float(loss) / max(1, (j + 1) // train_frequency),
                        total_reward, mean_reward, best_reward, eps_greedy))

            if e % 1000 == 0:
                torch.save(
                    self.q_net,
                    os.path.join(self.weight_dir, "model_{:d}.h5".format(e)))

            bar.set_description(msg)
            bar.update(0)
            bar.refresh()
            bar.close()

            sleep(0.05)
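
The target computed inside the training loop above follows the Double DQN rule: the online network chooses the next action and the target network evaluates it. A standalone sketch of the same computation (the function name and tensor arguments are illustrative, not part of the original code):

import torch


def double_dqn_target(q_net, target_q_net, next_states, rewards, terminals, gamma):
    """Double DQN bootstrap target (sketch).

    next_states: float tensor of shape (batch, C, H, W)
    rewards:     float tensor of shape (batch,)
    terminals:   bool tensor of shape (batch,)
    """
    with torch.no_grad():
        # online network selects the greedy next action
        next_actions = q_net(next_states).max(1)[1].view(-1, 1)
        # target network evaluates that action
        next_q = target_q_net(next_states).gather(1, next_actions).view(-1)
        # zero out bootstrap values for terminal transitions
        return rewards + gamma * next_q * (~terminals).float()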
Example #3
class DDPGAgentVersion5(BaseAgent):
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-2,
                 lr_critic=1e-2,
                 fc1_units=128,
                 fc2_units=128,
                 buffer_size=int(1e6),
                 batch_size=50,
                 gamma=0.95,
                 tau=1e-2,
                 max_norm=1.0,
                 learn_period=100,
                 learn_sampling_num=50,
                 adam_critic_weight_decay=0.0,
                 name=None,
                 exploration_mu=0.0,
                 exploration_sigma=0.2,
                 exploration_theta=0.15,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay=0.99):
        """Initialize an Agent object.
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed

        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        self.actor_local = DDPGActorVersion1(state_size,
                                             action_size,
                                             random_seed,
                                             fc1_units=fc1_units,
                                             fc2_units=fc2_units).to(device)

        self.actor_target = DDPGActorVersion1(state_size,
                                              action_size,
                                              random_seed,
                                              fc1_units=fc1_units,
                                              fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCriticVersion1(state_size,
                                               action_size,
                                               random_seed,
                                               fcs1_units=fc1_units,
                                               fc2_units=fc2_units).to(device)

        self.critic_target = DDPGCriticVersion1(state_size,
                                                action_size,
                                                random_seed,
                                                fcs1_units=fc1_units,
                                                fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=adam_critic_weight_decay)
        # Ornstein-Uhlenbeck noise process for action exploration

        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)

        self.noise = OUNoise(action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory

        #self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        self.memory = PrioritizedReplayBuffer(action_size, buffer_size,
                                              batch_size, random_seed, device)

        # Prioritized Replay Buffer Params
        #self.a, self.b = 0.7, 0.5   # rank-based variant
        self.a, self.b = 0.6, 0.4  # proportional variant

        self.e = 1e-3  # 0.01 * (reward of each time step) = 0.01 * 0.1

        # parameter of discounted reward
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

        self.name = name

        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step %
                                                     self.learn_period == 0):

            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices, probs = experiences
        # train critic
        # loss function = Q_target (TD 1-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)

        # element-wise loss (no reduction) so importance-sampling weights apply per sample
        critic_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')

        # compute td error (delta) for updating prioritized replay buffer
        abs_td_error = torch.abs(Q_targets - Q_expected)

        # Calculate importance sampling weight
        if probs:
            # w_i = (N * P(i))^(-b), normalized by the maximum weight
            weights = (np.array(probs).reshape(-1, 1) *
                       len(self.memory))**(-self.b)
            weights /= np.max(weights)
        else:
            weights = np.ones(critic_loss.shape, dtype=np.float64)

        # Calculate weighted loss
        weighted_critic_loss = torch.mean(
            torch.from_numpy(weights).float().to(device) * critic_loss)
        self.critic_optimizer.zero_grad()
        weighted_critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       self.max_norm)
        self.critic_optimizer.step()

        if indices:
            # convert errors to priorities and update them
            self.memory.update(
                indices,
                list(
                    abs_td_error.detach().to('cpu').numpy().squeeze()**self.a +
                    self.e))

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        #------ update noise ---#
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {
            'agent_{}_actor'.format(self.name): self.actor_target,
            'agent_{}_critic'.format(self.name): self.critic_target
        }
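
A minimal sketch of how an agent exposing this reset()/act()/step() interface is typically driven. The environment, episode count, and reward bookkeeping are assumptions for illustration, not part of the listing; the loop assumes the classic gym API (plain observation from reset, 4-tuple from step) used elsewhere in these examples.

import gym

# Hypothetical driver loop; Pendulum-v1 is only an example of a
# continuous-control task.
env = gym.make('Pendulum-v1')
agent = DDPGAgentVersion5(state_size=env.observation_space.shape[0],
                          action_size=env.action_space.shape[0],
                          random_seed=0,
                          name='demo')

for episode in range(200):                 # episode count is illustrative
    state = env.reset()
    agent.reset()                          # reset the OU noise process
    score, done = 0.0, False
    while not done:
        action = agent.act(state)                       # noisy action from the local actor
        next_state, reward, done, _ = env.step(action)  # classic 4-tuple gym API
        agent.step(state, action, reward, next_state, done)
        state, score = next_state, score + reward
    print('episode {:d} reward {:.2f}'.format(episode, score))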
Example #4
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 double_DQN=False,
                 prioritized_replay=False,
                 dueling_networks=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_DQN (bool): use double DQN
            prioritized_replay (bool): use prioritized replay
            dueling_networks (bool): use dueling network architecture
        """

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.tau = TAU
        self.double_DQN = double_DQN
        self.prioritized_replay = prioritized_replay
        self.dueling_networks = dueling_networks

        if self.dueling_networks:
            # Q-Networks - Local, Target Neural Nets
            self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                                   seed).to(device)
            self.qnetwork_target.eval()

        else:
            # Q-Networks - Local, Target Neural Nets
            self.qnetwork_local = QNetwork(state_size, action_size,
                                           seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size,
                                            seed).to(device)
            self.qnetwork_target.eval()

        # Use optimizer to update the "local" neural net
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if self.prioritized_replay:
            prioritized_params = {
                'a': 0.6,
                'b': 0.4,
                'b_inc_rate': 1.001,
                'e': 0.01
            }
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, device,
                                                  prioritized_params)
        else:
            # Replay memory
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
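
This constructor depends on module-level constants (BUFFER_SIZE, BATCH_SIZE, LR, TAU, UPDATE_EVERY) and a device object that are not shown in the listing. A sketch of typical definitions follows; the names match the constants used above, but the values are common defaults and are assumptions.

import torch

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 64          # minibatch size
LR = 5e-4                # Adam learning rate for the local network
TAU = 1e-3               # soft-update interpolation parameter
UPDATE_EVERY = 4         # learn every UPDATE_EVERY environment steps

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')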