Example #1
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
from time import time, sleep

# ReplayBuffer and BoxModel are project-specific classes assumed to be
# defined elsewhere in the codebase (they are not shown in this example).


class DQNAgent:
    def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
                 train_step, begin_copy, copy_step, epsilon_delta,
                 epsilon_start, epsilon_end, load_model, path_to_load,
                 path_to_save, episode_steps, episode_to_save, max_buffer_len):

        # Epsilon

        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Main Params

        self.minibatch = minibatch
        self.action_number = action_number
        self.gamma = gamma

        # Episode Params

        self.begin_train = begin_train
        self.begin_copy = begin_copy
        self.copy_step = copy_step
        self.train_step = train_step
        self.episodes = episodes
        self.episode_steps = episode_steps
        self.episode_to_save = episode_to_save

        # I/O params

        self.path_to_load = path_to_load
        self.path_to_save = path_to_save
        self.load_model = load_model

        # Model Fields

        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        # self.device = torch.device('cpu')
        self.model = BoxModel((150, 100, 1), action_number).to(self.device)
        if self.load_model:
            self.model.load_state_dict(torch.load(self.path_to_load))

        # Rewards

        self.rewards_white, self.rewards_black, self.rewards = [], [], []

    def reduce_epsilon(self, episode):
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        # With probability epsilon take a random (exploratory) action ...
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            # ... otherwise act greedily with respect to the current Q-network.
            state = torch.autograd.Variable(
                torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(observation):
        # Crop to the 150x100 play area and scale pixel values to [0, 1].
        rgb = observation[30:180, 30:130] / 255
        r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return gray.reshape(1, 150, 100)

    def transition_process(self, o_state, o_act, o_reward, o_next_state,
                           o_done):

        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state)
        q_next = self.model(o_next_state)
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        loss = (q.gather(1, o_act.unsqueeze(1)).squeeze(1) -
                y_hat.detach()).pow(2).mean()
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()

    def print(self, episode, reward_black, reward_white, epsilon):
        print(f"For episode {episode} reward white - "
              f"{reward_white} and black - {reward_black},"
              f"epsilon - {epsilon}")

    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        start = time()
        print("Begin to Train")

        for episode in range(self.episodes):
            observation = env.reset()
            self.state = self.preprocess_observation(observation)
            reward_black, reward_white, total_reward = 0, 0, 0
            for episode_steps in range(self.episode_steps):
                action = self.epsilon_greedy()
                next_observation, reward, done, _ = env.step(action)
                reward_black += (reward < 0) * abs(reward)
                reward_white += (reward > 0) * reward
                total_reward += reward
                next_state = self.preprocess_observation(next_observation)
                self.replay_buffer.push(self.state, action, reward, next_state,
                                        done)
                # Advance to the next state; without this the agent would keep
                # acting from the initial observation of the episode.
                self.state = next_state
                if len(self.replay_buffer) >= self.begin_train:
                    self.train_model()
                if done:
                    break

            self.reduce_epsilon(episode)
            if episode != 0 and episode % self.episode_to_save == 0:
                torch.save(self.model.state_dict(), self.path_to_save)
                plt.plot(self.rewards)
                plt.show()

            self.rewards_black.append(reward_black)
            self.rewards_white.append(reward_white)
            self.rewards.append(total_reward)
            self.print(episode,
                       reward_black=reward_black,
                       reward_white=reward_white,
                       epsilon=self.epsilon)
            print(f"Elapsed time: {time() - start:.1f} s")

    def play(self, env: gym.wrappers.time_limit.TimeLimit):
        observation = env.reset()
        reward_black, reward_white, total_reward = 0, 0, 0
        for episode_steps in range(self.episode_steps):
            state = self.preprocess_observation(observation)
            state = torch.autograd.Variable(
                torch.FloatTensor(state).to(self.device).unsqueeze(0))
            # Print the network's Q-values for the current state (debug output).
            print(self.model(state))
            action = self.model(state).max(1)[1].item()
            observation, reward, done, _ = env.step(action)
            reward_black += (reward < 0) * abs(reward)
            reward_white += (reward > 0) * reward
            total_reward += reward
            sleep(0.01)
            env.render()
            if done:
                break
        print(total_reward)
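
Both examples rely on a ReplayBuffer class that is not shown. The following is a minimal sketch consistent with the calls made above (push, sample(minibatch) and len()); the deque-backed storage and the column-wise regrouping in sample() are assumptions, not the original implementation.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, max_len):
        # Fixed-size FIFO storage for (state, action, reward, next_state, done).
        self.buffer = deque(maxlen=max_len)

    def push(self, state, action, reward, next_state, done):
        # Store one transition; the oldest one is evicted once max_len is reached.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Draw a random minibatch and regroup it column-wise, matching the
        # five-way unpacking done in transition_process().
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)

The dones column is returned as floats so that the (1 - o_done) term in train_model works directly. The training code also assumes that the network class (BoxModel here) exposes its optimizer as an attribute, since train_model calls self.model.optimizer.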
Example #2
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
from skimage.color import rgb2gray
from skimage.transform import resize

# GeneralAgent, ReplayBuffer and the network classes passed in as model_type
# are project-specific and assumed to be defined elsewhere in the codebase.


class DDQNAgentCnn(GeneralAgent):
    def __init__(self,
                 gamma,
                 action_number,
                 minibatch,
                 episodes,
                 begin_train,
                 copy_step,
                 epsilon_delta,
                 epsilon_start,
                 epsilon_end,
                 load_model,
                 path_to_load,
                 path_to_save,
                 plots_to_save,
                 episode_steps,
                 episode_to_save,
                 max_buffer_len,
                 model_type
                 ):

        super().__init__(gamma=gamma,
                         action_number=action_number,
                         path_to_load=path_to_load,
                         path_to_save=path_to_save,
                         plots_to_save=plots_to_save,
                         load_model=load_model,
                         episode_to_save=episode_to_save,
                         episodes=episodes,
                         model_type=model_type)
        # Epsilon

        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Main Params

        self.minibatch = minibatch

        # Episode Params

        self.begin_train = begin_train
        self.copy_step = copy_step
        self.episode_steps = episode_steps

        # Model Fields

        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model
        self.target_model = model_type(action_number).to(self.device)
        self.update_target()

        # Rewards

        self.rewards_white, self.rewards_black, self.rewards = [], [], []
        self.losses = []
        self.periodic_reward = 0
        self.periodic_rewards = []

    def update_target(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def reduce_epsilon(self, episode):
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        # With probability epsilon take a random (exploratory) action ...
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            # ... otherwise act greedily with respect to the online Q-network.
            state = torch.autograd.Variable(
                torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(obs):
        # Crop the frame, convert to grayscale and downsample to 28x28.
        img = resize(rgb2gray(obs[0:188, 23:136, :]), (28, 28), mode='constant')
        img = img.reshape(1, 28, 28)
        return img

    def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done):

        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1)
        q_next = self.target_model(o_next_state)
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        loss = (q - y_hat.detach()).pow(2).mean()

        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()
        # Return a plain float so the logged losses can be plotted later.
        return loss.item()

    def init_new_episode(self, env):
        observation = env.reset()
        self.state = self.preprocess_observation(observation)

    def episode_check(self, episode, loss):

        if episode % self.copy_step == 0:
            self.losses.append(loss)
            self.update_target()

        if episode % self.episode_steps == 0:
            self.periodic_rewards.append(self.periodic_reward / self.episode_steps)
            self.periodic_reward = 0

        if episode % self.episode_to_save == 0:
            torch.save(self.model.state_dict(), self.path_to_save)
            fig = plt.figure()
            plt.plot(self.rewards)
            fig.savefig(self.plots_to_save + '_reward.png')
            plt.close(fig)
            fig = plt.figure()
            plt.plot(self.losses)
            fig.savefig(self.plots_to_save + '_loss.png')
            plt.close(fig)
            fig = plt.figure()
            plt.plot(self.periodic_rewards)
            fig.savefig(self.plots_to_save + '_periodic_reward.png')
            plt.close(fig)
            
    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        self.init_new_episode(env)
        total_reward = 0
        episode_reward = 0
        loss = 0
        # self.trangle is assumed to be a tqdm progress iterator created in
        # GeneralAgent; each iteration below corresponds to one environment step.
        for episode in self.trangle:
            self.trangle.set_description(
                f"Episode: {episode} | Episode Reward {episode_reward} | Periodic reward "
                f"{self.periodic_reward / self.episode_steps} | Average Reward {total_reward / (episode + 1)}"
            )
            self.trangle.refresh()
            action = self.epsilon_greedy()
            next_observation, reward, done, _ = env.step(action)
            total_reward += reward
            episode_reward += reward
            self.periodic_reward += reward
            next_state = self.preprocess_observation(next_observation)
            self.replay_buffer.push(self.state, action, reward, next_state, done)
            self.state = next_state
            if len(self.replay_buffer) >= self.begin_train:
                loss = self.train_model()

            self.reduce_epsilon(episode)
            self.episode_check(episode, loss)

            if done:
                self.init_new_episode(env)
                self.rewards.append(episode_reward)
                episode_reward = 0
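
A note on the name: although the class in Example #2 is called DDQNAgentCnn, the target in train_model is the standard DQN target with a separate target network, since q_next.max(1)[0] lets the target network both select and evaluate the next action. A Double DQN target would instead select the next action with the online network and evaluate it with the target network. The sketch below shows how the target computation inside train_model could be changed, reusing the variable names from the example; it is an illustration, not the original code.

# Online network selects the greedy next action ...
next_actions = self.model(o_next_state).max(1)[1].unsqueeze(1)
# ... and the target network evaluates that action.
q_next = self.target_model(o_next_state).gather(1, next_actions).squeeze(1)
y_hat = o_reward + self.gamma * q_next * (1 - o_done)
loss = (q - y_hat.detach()).pow(2).mean()

The optimizer step in train_model and the periodic update_target() call in episode_check stay the same.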