class MountainCarModelTester(object):
    def __init__(self, model):
        self.model = model
        self.env = MountainCar(speed=1e8, graphical_state=False,
                               render=True, is_debug=False, random_starts=True)
        self.agent = DQNAgentTester(self.model)

    def test(self):
        # Evaluate the loaded model indefinitely, printing a running mean score.
        scores = np.array([])
        while True:
            done = False
            state = self.env.reset().reshape(1, 2)

            while not done:
                action = self.agent.selectAction(state)
                obs, reward, done, totalScore = self.env.step_all(action)
                state = obs.reshape(1, 2)

            # Episodes that exceed 200 steps never reached the goal ("did not finish").
            if totalScore > 200:
                print("DNF")

            scores = np.append(scores, totalScore)
            print("Average Score: {}".format(scores.mean()))

    def train(self):
        for reward in range(3):
            self.current_episode = 0

            self.episode_score = []
            self.episode_qs = []
            self.episode_height = []
            self.episode_loss = []
            self.episode_policies = []

            self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
            self.fig.canvas.draw()

            self.env = MountainCar(speed=10000,
                                   graphical_state=True,
                                   render=False,
                                   is_debug=False,
                                   frame_stack=4)
            self.agent = DQNAgent(stateShape=(84, 84, 4),
                                  actionSpace=self.env.get_action_space(),
                                  numPicks=32,
                                  memorySize=100000)

            for _ in range(self.episodes):
                self.episode(reward)
                self.current_episode += 1
            self.agent.save(reward)
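
# Minimal usage sketch (hedged: the checkpoint path and loader below are
# hypothetical; it assumes DQNAgentTester wraps a previously saved Keras model):
# model = tf.keras.models.load_model("mountaincar_dqn.h5")
# MountainCarModelTester(model).test()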
Example no. 3
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = MountainCar(speed=1e8, graphical_state=False,
                               render=False, is_debug=True, random_starts=True)
        self.agent = DQNAgent(stateShape=(
            2,), actionSpace=self.env.get_action_space(), numPicks=32, memorySize=10000)
Example no. 4
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.tight_layout()
        self.fig.canvas.draw()

        self.env = MountainCar(speed=1e8,
                               graphical_state=True,
                               render=False,
                               is_debug=True,
                               frame_stack=3)
        self.numRewards = self.env.get_num_of_objectives()
Example no. 5
class MultiObjectiveMountainCarGraphical(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(2, 2)
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = MountainCar(speed=1e8,
                               graphical_state=True,
                               render=True,
                               is_debug=True)
        self.agent = DQNAgent(stateShape=(40, 40, 3),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

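        # The network input is the difference between consecutive processed frames,
        # so a single state also encodes the cart's apparent motion (frame differencing).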
        currState = self.process_screen(self.env.reset())
        lastState = currState
        state = currState - lastState
        state = tf.expand_dims(tf.constant(state), 0)

        maxHeight = -10000

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:
                qSum += q
                qActions += 1

            obs, reward, done, info = self.env.step_all(action)

            reward = reward[0]

            maxHeight = max(info[0], maxHeight)
            if info[0] >= 0.5:
                reward += 10

            lastState = currState
            currState = self.process_screen(obs)

            nextState = currState - lastState
            nextState = tf.expand_dims(tf.constant(nextState), 0)
            rewardsSum = np.add(rewardsSum, reward)

            loss = self.agent.trainDQN()
            self.agent.addMemory((state, action, reward, nextState, done))
            state = nextState
            lossSum += loss

        if rewardsSum != -202:
            self.agent.save()

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)

    def process_screen(self, observation):
        # Returned screen requested by gym is 400x600x3, but is sometimes larger
        # such as 800x1200x3. Transpose it into torch order (CHW).
        screen = observation
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        # Resize, and add a batch dimension (BCHW)
        resize = T.Compose([
            T.ToPILImage(),
            T.Resize((40, 40), interpolation=Image.BICUBIC),
            T.ToTensor()
        ])

        return resize(screen).numpy().transpose((2, 1, 0))

    def plot(self):
        self.ax[0][0].title.set_text('Training Score')
        self.ax[0][0].set_xlabel('Episode')
        self.ax[0][0].set_ylabel('Score')
        self.ax[0][0].plot(self.episode_score, 'b')

        self.ax[0][1].title.set_text('Training Height')
        self.ax[0][1].set_xlabel('Episode')
        self.ax[0][1].set_ylabel('Height')
        self.ax[0][1].plot(self.episode_height, 'g')

        self.ax[1][0].title.set_text('Training Loss')
        self.ax[1][0].set_xlabel('Episode')
        self.ax[1][0].set_ylabel('Loss')
        self.ax[1][0].plot(self.episode_loss, 'r')

        self.ax[1][1].title.set_text('Training Q Vals')
        self.ax[1][1].set_xlabel('Episode')
        self.ax[1][1].set_ylabel('Qs')
        self.ax[1][1].plot(self.episode_qs, 'c')
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
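
# Usage sketch (the episode count is illustrative):
# MultiObjectiveMountainCarGraphical(episodes=1000).train()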
Example no. 6
        means = torch.cat((torch.zeros(99), means))
        ax1.plot(means.numpy(), 'm')
        means = heights_t[:, 2].unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        ax1.plot(means.numpy(), 'y')

    ax2.title.set_text('Training Loss...')
    ax2.set_xlabel('Episode')
    ax2.set_ylabel('Loss')
    ax2.plot(episode_loss, 'r')
    fig.canvas.draw()
    plt.show(block=False)
    plt.pause(.001)


env = MountainCar(speed=1e8, graphical_state=False, render=True, is_debug=True)
agent = DQNAgent(stateShape=env.get_state_space().get_max(),
                 actionSpace=env.get_action_space(),
                 numPicks=64,
                 memorySize=10000)


def episode():
    done = False
    rewardsSum = [0, 0, 0]
    lossSum = 0

    env.reset()
    state = Variable(FloatTensor([env.get_state()]))

    maxScore = -100000
Example no. 7
class MultiObjectiveMountainCarDDQN(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = MountainCar(speed=60,
                               graphical_state=False,
                               render=True,
                               is_debug=True,
                               random_starts=True)
        self.agent = DQNAgent(stateShape=(2, ),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1
        plt.show(block=True)

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = self.env.reset().reshape(1, 2)
        maxHeight = -10000
        win = False

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:
                qSum += q
                qActions += 1

            obs, reward, done, _ = self.env.step_all(action)
            # env.render()

            reward = reward[0]

            maxHeight = max(obs[0], maxHeight)
            if obs[0] >= 0.5:
                win = True
                reward += 10

            nextState = obs.reshape(1, 2)
            rewardsSum = np.add(rewardsSum, reward)

            loss = 0  # self.agent.trainDQN() is disabled in this example; loss stays a placeholder
            self.agent.addMemory((state, action, reward, nextState, done))
            state = nextState
            lossSum += loss

        if win:
            self.agent.save()

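        # Linear epsilon decay once per episode, floored at epsilon_min
        # (epsilon-greedy exploration schedule).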
        self.agent.epsilon = max(self.agent.epsilon - self.agent.epsilon_decay,
                                 self.agent.epsilon_min)
        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)

    def plot(self):
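        # Scores and heights are binned into GROUP_NUM-episode groups; the group means
        # and standard deviations are cubic-spline interpolated (scipy's interp1d) to
        # draw a smoothed curve with a +/- 1 std band. GROUP_NUM is assumed to be a
        # module-level constant defined elsewhere in the file.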
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM so the group array stays rectangular for np.mean/np.std
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        ep_heights = np.array(self.episode_height)
        ep_groups = [
            ep_heights[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_heights) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM so the group array stays rectangular for np.mean/np.std
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[1].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[1].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[1].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[1].title.set_text('Training Height')
        self.ax[1].set_xlabel('Episode')
        self.ax[1].set_ylabel('Height')

        plt.show(block=False)
        plt.pause(.001)
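
# Usage sketch (the episode count is illustrative):
# MultiObjectiveMountainCarDDQN(episodes=500).train()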
Example no. 8
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.plot(heights_t.numpy())
    # Take 100 episode averages and plot them too
    if len(heights_t) >= 100:
        means = heights_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())


env = MountainCar(speed=1e8, graphical_state=True, render=True, is_debug=True)
agent = DQNAgent(stateShape=(3, 84, 84),
                 actionSpace=env.get_action_space(),
                 numPicks=128,
                 memorySize=10000)


def episode():
    done = False
    rewardSum = 0
    lossSum = 0

    env.reset()
    currState = process_screen(env.get_state())
    lastState = currState
    state = currState - lastState
Example no. 9
    def __init__(self, model):
        self.model = model
        self.env = MountainCar(speed=1e8, graphical_state=False,
                               render=True, is_debug=False, random_starts=True)
        self.agent = DQNAgentTester(self.model)
Example no. 10
class MultiObjectiveWMountainCar(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.tight_layout()
        self.fig.canvas.draw()

        self.env = MountainCar(speed=1e8,
                               graphical_state=True,
                               render=False,
                               is_debug=True,
                               frame_stack=3)
        self.numRewards = self.env.get_num_of_objectives()

    def train(self):
        self.agent = DQNAgent(stateShape=(84, 84, 3),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards,
                              optim=keras.optimizers.Adam(learning_rate=0.0001))

        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        self.plot()

    def episode(self):
        done = False
        rewardsSum = 0
        lossSums = [0] * (self.numRewards)
        policies = [0] * (self.numRewards + 1)

        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()

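        # selectAction is assumed to arbitrate between one policy per objective
        # (W-learning style): 'policy' is the winning objective's index, 'qs' its
        # Q-value, 'ws' the per-objective W-values, and 'random' flags an
        # exploratory (random) choice.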
        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            policies[policy] += 1
            if not random:
                qSums[policy] += qs
                wSums = [wSums[i] + ws[i] for i in range(len(wSums))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(state, action, policy, reward, nextState,
                                 done)
            loss = self.agent.trainDQN()
            state = nextState
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(lossSums))]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSums,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_loss.append(lossSums)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])

        print(
            "Report: \nrewardSum:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}".
            format(self.episode_score[-1], self.episode_loss[-1],
                   self.episode_policies[-1], self.episode_qs[-1],
                   self.episode_ws[-1]))
        print("memory len:" + str(len(self.agent.replayMemory[0])))
        print("memory used:" + str(psutil.virtual_memory().used // 1e6))
        tf.keras.backend.clear_session()
        gc.collect()

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM so the group array stays rectangular for np.mean/np.std
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the final group to GROUP_NUM so the group array stays rectangular for np.mean/np.std
                ep_groups[-1] = np.append(ep_groups[-1],
                                          [np.mean(ep_groups[-1])] *
                                          (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x,
                                avg_spl(spline_x),
                                lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()

        self.fig.canvas.draw()
        plt.savefig("momc_w_pddqn_{}.png".format(self.current_episode))

    def plot_compare(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_adam_scores = np.array(self.adam_scores)
        ep_rms_scores = np.array(self.rms_scores)
        ep_adam_groups = [
            ep_adam_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_adam_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        ep_rms_groups = [
            ep_rms_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_rms_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final groups to GROUP_NUM so the group arrays stay rectangular for np.mean/np.std
        ep_adam_groups[-1] = np.append(ep_adam_groups[-1],
                                       [np.mean(ep_adam_groups[-1])] *
                                       (GROUP_NUM - len(ep_adam_groups[-1])))
        ep_rms_groups[-1] = np.append(ep_rms_groups[-1],
                                      [np.mean(ep_rms_groups[-1])] *
                                      (GROUP_NUM - len(ep_rms_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_adam_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_adam_avgs = np.mean(ep_adam_groups, 1)
            ep_rms_avgs = np.mean(ep_rms_groups, 1)

            avg_adam_spl = interp1d(x_groups,
                                    ep_adam_avgs,
                                    kind='cubic',
                                    fill_value="extrapolate")
            avg_rms_spl = interp1d(x_groups,
                                   ep_rms_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")

            ep_adam_std = np.std(ep_adam_groups, 1)
            ep_rms_std = np.std(ep_rms_groups, 1)

            std_adam_spl = interp1d(x_groups,
                                    ep_adam_std,
                                    kind='cubic',
                                    fill_value="extrapolate")
            std_rms_spl = interp1d(x_groups,
                                   ep_rms_std,
                                   kind='cubic',
                                   fill_value="extrapolate")

            self.ax.plot(spline_x,
                         avg_adam_spl(spline_x),
                         lw=0.7,
                         c="blue",
                         label="Adam")
            self.ax.fill_between(
                spline_x,
                avg_adam_spl(spline_x) - std_adam_spl(spline_x),
                avg_adam_spl(spline_x) + std_adam_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True)

            self.ax.plot(spline_x,
                         avg_rms_spl(spline_x),
                         lw=0.7,
                         c="orange",
                         label="RMSProp")
            self.ax.fill_between(spline_x,
                                 avg_rms_spl(spline_x) - std_rms_spl(spline_x),
                                 avg_rms_spl(spline_x) + std_rms_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="green",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')
        self.ax.legend()
        plt.show(block=True)
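
# Usage sketch (the episode count is illustrative):
# MultiObjectiveWMountainCar(episodes=1000).train()
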
class MountainCarGraphicalDDQN(object):
    def __init__(self, episodes):
        self.episodes = episodes

    def train(self):
        for reward in range(3):
            self.current_episode = 0

            self.episode_score = []
            self.episode_qs = []
            self.episode_height = []
            self.episode_loss = []
            self.episode_policies = []

            self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
            self.fig.canvas.draw()

            self.env = MountainCar(speed=10000,
                                   graphical_state=True,
                                   render=False,
                                   is_debug=False,
                                   frame_stack=4)
            self.agent = DQNAgent(stateShape=(84, 84, 4),
                                  actionSpace=self.env.get_action_space(),
                                  numPicks=32,
                                  memorySize=100000)

            for _ in range(self.episodes):
                self.episode(reward)
                self.current_episode += 1
            self.agent.save(reward)

    def episode(self, r):
        done = False
        rewardsSum = 0

        lossSum = 0
        qSums = 0
        actions = 1

        state = self.env.reset()
        maxHeight = -1

        while not done:
            action, qs = self.agent.selectAction(state)
            if qs != -100000:
                qSums += qs
                actions += 1

            obs, reward, done, height = self.env.step_all(action)
            maxHeight = max(height, maxHeight)

            rewardsSum = np.add(rewardsSum, sum(reward))

            reward[r] += height
            if height >= 0.5:
                for i in range(len(reward)):
                    reward[i] += 10

            nextState = obs

            self.agent.addMemory(state, action, reward[r], nextState, done)
            state = nextState

            loss = self.agent.trainDQN()
            lossSum += loss

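        # Epsilon is decayed only every 10th episode; the target network is synced to
        # the online network every agent.sync episodes (DDQN-style target update).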
        if self.current_episode % 10 == 0:
            self.agent.epsilon = max(
                self.agent.epsilon - self.agent.epsilon_decay,
                self.agent.epsilon_min)
        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)
        self.episode_qs.append([qSums / actions])

        if self.current_episode % 100 == 0:
            self.plot(r)

        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1], self.episode_height[-1],
            self.episode_loss[-1], self.episode_qs[-1]))

        tf.keras.backend.clear_session()
        gc.collect()

    def plot(self, r):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM so the group array stays rectangular for np.mean/np.std
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[1].clear()
        self.ax[1].title.set_text('Training Height')
        self.ax[1].set_xlabel('Episode')
        self.ax[1].set_ylabel('Height')
        if len(x_groups) > 5:
            ep_heights = np.array(self.episode_height)
            ep_groups = [
                ep_heights[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                for i in range((len(ep_heights) + GROUP_NUM - 1) // GROUP_NUM)
            ]
            # Pad the final group to GROUP_NUM so the group array stays rectangular for np.mean/np.std
            ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                      (GROUP_NUM - len(ep_groups[-1])))
            x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[1].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[1].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.fig.canvas.draw()
        plt.savefig("graphical_momc_perddqn_{}_{}.png".format(
            r, self.current_episode))
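
# Usage sketch (the episode count is illustrative):
# MountainCarGraphicalDDQN(episodes=3000).train()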