Example #1
# Imports assumed by this example; DeepSeaTreasure, DQNAgent, PolEnum and
# GROUP_NUM come from the surrounding project rather than any public package.
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
from scipy.interpolate import interp1d

class DeepSeaTreasureGraphicalDDQN(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False,
                                   frame_stack=2)
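        # The (84, 84, 2) stateShape matches the environment's graphical
        # frames stacked two deep (frame_stack=2); numPicks is presumably
        # the minibatch size drawn from replay memory.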
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0

        lossSum = 0
        qSums = [0] * self.numRewards
        actions = 1  # starts at 1 to avoid division by zero when averaging

        state = self.env.reset()
        maxHeight = -1  # never updated in this variant; logged as a placeholder

        while not done:
            action, qs = self.agent.selectAction(state)
            # selectAction returns a -100000 sentinel in place of Q-values
            # for random (epsilon-greedy) actions; only accumulate real ones,
            # element-wise per reward (the original `qSums += qs` would
            # concatenate the lists instead of summing them).
            if qs != -100000:
                qSums = [qSums[i] + qs[i] for i in range(self.numRewards)]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(
                (state, action, (reward[0] + reward[1]), nextState, done))
            state = nextState

            loss = self.agent.trainDQN()
            lossSum += loss

        # Every `sync` episodes, copy the online (train) network's weights
        # into the target network (the periodic target update).
        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.plot()

        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1], self.episode_height[-1],
            self.episode_loss[-1], self.episode_qs[-1]))

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std can
        # operate on a rectangular array.
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 avg_spl(spline_x) - std_spl(spline_x),
                                 avg_spl(spline_x) + std_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')
        '''
        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies)*2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [policy[i * GROUP_NUM:(i + 1) * GROUP_NUM] for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)]
                # Pad for weird numpy error for now
                ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i*GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups, ep_avgs, kind='cubic', fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups, ep_std, kind='cubic', fill_value="extrapolate")
                self.ax[1].plot(spline_x, avg_spl(spline_x), lw=0.7, c=colors[i], label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x, avg_spl(spline_x)-std_spl(spline_x), avg_spl(spline_x)+std_spl(spline_x), alpha=0.5, facecolor=colors[-1-i], interpolate=True)

        self.ax[1].legend()
        '''
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
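

# A minimal driver sketch for the trainer above; this helper is not part of
# the original example and the episode count of 300 is an arbitrary choice.
def run_graphical_ddqn_example(episodes=300):
    trainer = DeepSeaTreasureGraphicalDDQN(episodes=episodes)
    trainer.train()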


class DeepSeaTreasureBaselineDQN(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(figsize=(5, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = DeepSeaTreasure(width=5,
                                   speed=1000,
                                   graphical_state=False,
                                   render=True,
                                   is_debug=True)
        self.agent = DQNAgent(stateShape=(2, ),
                              actionSpace=self.env.get_action_space(),
                              numPicks=64,
                              memorySize=2000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1
        plt.show(block=True)

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1  # starts at 1 to avoid division by zero when averaging
        lossSum = 0

        state = self.env.reset().reshape(1, 2)
        maxHeight = -10000

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:  # sentinel returned for random (epsilon) actions
                qSum += q
                qActions += 1

            obs, reward, done, _ = self.env.step_all(action)
            # env.render()

            reward = reward[0] + reward[1]
            '''
            maxHeight = max(obs[0], maxHeight)
            if obs[0] >= 0.5:
                reward += 10
            '''
            nextState = obs.reshape(1, 2)
            rewardsSum = np.add(rewardsSum, reward)

            loss = self.agent.trainDQN()
            self.agent.addMemory((state, action, reward, nextState, done))
            state = nextState
            lossSum += loss

        self.agent.terminal()
        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std can
        # operate on a rectangular array.
        ep_groups[-1] = np.append(ep_groups[-1], [ep_groups[-1][-1]] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 avg_spl(spline_x) - std_spl(spline_x),
                                 avg_spl(spline_x) + std_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')

        plt.show(block=False)
        plt.pause(.001)
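

# A standalone sketch of the smoothing both plot() methods use: bucket the
# scores into GROUP_NUM-sized groups, pad the last group so the array is
# rectangular, then fit cubic splines through the per-group means and
# standard deviations. `smooth_scores` is a hypothetical helper written for
# illustration; cubic interpolation needs at least four points, which is why
# plot() guards with `len(x_groups) > 5`.
def smooth_scores(scores, group_num):
    groups = [scores[i * group_num:(i + 1) * group_num]
              for i in range((len(scores) + group_num - 1) // group_num)]
    # Pad the final group with its own mean to keep the array rectangular.
    groups[-1] = np.append(groups[-1], [np.mean(groups[-1])] *
                           (group_num - len(groups[-1])))
    xs = [i * group_num for i in range(len(groups))]
    avg_spl = interp1d(xs, np.mean(groups, 1), kind='cubic',
                       fill_value="extrapolate")
    std_spl = interp1d(xs, np.std(groups, 1), kind='cubic',
                       fill_value="extrapolate")
    return avg_spl, std_spl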


class DeepSeaGraphicalWAgent(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False)
        self.agent = DQNAgent(stateShape=(64, 64, 1),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0

        lossSums = [0] * self.numRewards
        policies = [0] * self.numRewards
        qSums = [0] * self.numRewards
        wSums = [0] * self.numRewards
        actions = 1  # starts at 1 to avoid division by zero when averaging

        state = self.process_state(self.env.reset())
        maxHeight = -1  # never updated in this variant; logged as a placeholder

        while not done:
            # selectAction returns (action, winning policy index, per-policy
            # Q values, per-policy W values, was_random); renamed from
            # `random` to avoid shadowing the stdlib module.
            action, policy, qs, ws, was_random = self.agent.selectAction(state)
            if not was_random:
                policies[policy] += 1
                qSums = [qSums[i] + qs[i] for i in range(len(policies))]
                wSums = [wSums[i] + ws[i] for i in range(len(policies))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            # The next state is the difference between the previous processed
            # frame and the newly observed one, apparently encoding motion in
            # a single grayscale channel.
            nextState = state - self.process_state(obs)
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(
                (state, action, policy, reward, nextState, done))
            state = nextState

            loss = self.agent.trainDQN()
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(policies))]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSums,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSums)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])
        self.plot()

        print(
            "Report: \nrewardSum:{}\nheight:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}"
            .format(self.episode_score[-1], self.episode_height[-1],
                    self.episode_loss[-1], self.episode_policies[-1],
                    self.episode_qs[-1], self.episode_ws[-1]))

    def process_state(self, state):
        state = cv2.resize(state.astype('float32'), (64, 64),
                           interpolation=cv2.INTER_AREA)
        state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
        return np.expand_dims(state, 2)
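
    # Hypothetical shape note: process_state turns an (H, W, 3) RGB frame
    # into a (64, 64, 1) grayscale tensor, matching the agent's stateShape
    # of (64, 64, 1) above.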

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std can
        # operate on a rectangular array.
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[g * GROUP_NUM:(g + 1) * GROUP_NUM]
                    for g in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the final group to GROUP_NUM entries so np.mean/np.std
                # can operate on a rectangular array.
                ep_groups[-1] = np.append(ep_groups[-1],
                                          [np.mean(ep_groups[-1])] *
                                          (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [g * GROUP_NUM for g in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x,
                                avg_spl(spline_x),
                                lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()

        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
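

# A hypothetical skeleton of the multi-objective agent interface that
# DeepSeaGraphicalWAgent relies on, for illustration only; the project's
# real DQNAgent defines the actual behavior.
class MultiObjectiveAgentSkeleton(object):
    def selectAction(self, state):
        # Expected to return (action, winning policy index, per-policy Q
        # values, per-policy W values, was_random flag).
        raise NotImplementedError

    def addMemory(self, transition):
        # transition is (state, action, policy, reward vector, nextState,
        # done), matching the tuple stored by episode() above.
        raise NotImplementedError

    def trainDQN(self):
        # Expected to return one loss entry per reward/policy head.
        raise NotImplementedError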