Example #1
class MarioBaseline(object):
    def __init__(self, episodes, checkpoint, current_episode, epsilon):
        self.current_episode = current_episode
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []

        self.env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)

        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(4, 84, 84),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=20000,
                              epsilon=epsilon,
                              checkpoint=checkpoint)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = np.array(self.env.reset())
        maxDistance = -1000000

        while not done:
            action, q = self.agent.selectAction(state)
            '''
            if q != -100000:
                qSum += q
                qActions += 1
            '''
            obs, reward, done, info = self.env.step(action)

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            next_state = np.array(obs)
            rewardsSum = np.add(rewardsSum, reward)

            self.agent.addMemory(FloatTensor(state), LongTensor([action]),
                                 FloatTensor([reward]),
                                 FloatTensor(next_state), LongTensor([done]))
            loss = self.agent.trainDQN()
            state = next_state
            lossSum += loss

            # Periodically copy the online network weights into the target network
            if self.agent.step % self.agent.sync == 0:
                self.agent.targetNetwork.load_state_dict(
                    self.agent.trainNetwork.state_dict())

            # Anneal epsilon exponentially per step toward epsilon_min
            self.agent.epsilon = self.agent.epsilon_min + (
                self.agent.epsilon_start - self.agent.epsilon_min) * math.exp(
                    -1 * ((self.agent.step + 1) / self.agent.epsilon_decay))

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)

        print(
            "now epsilon is {}, the reward is {} with loss {} in episode {}, step {}, dist {}"
            .format(self.agent.epsilon, rewardsSum, lossSum,
                    self.current_episode, self.agent.step, maxDistance))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_distance.append(maxDistance)
        self.episode_loss.append(lossSum)
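All three Mario examples apply a project-local SkipFrame wrapper that is not part of Gym. A minimal sketch, assuming the usual frame-skipping behavior (repeat the chosen action for skip frames and accumulate the reward), could look like this:

import gym

class SkipFrame(gym.Wrapper):
    """Repeat each chosen action for `skip` frames and sum the rewards."""

    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info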
Example #2
class MarioBaseline(object):
    def __init__(self, episodes, checkpoint, current_episode, epsilon):
        self.current_episode = current_episode
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(12, 4))
        self.fig.canvas.draw()

        self.env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        # Apply Observation Wrappers
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        # Apply Control Wrappers
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env = NoopResetEnv(self.env)
        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(4, 84, 84),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=20000,
                              numRewards=4,
                              epsilon=epsilon,
                              checkpoint=checkpoint)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0
        # Per-policy selection counts and per-reward-head loss accumulators
        policies = [0] * (4 + 1)
        lossSums = [0] * 4

        state = np.array(self.env.reset())
        maxDistance = -1000000
        lastX = 0
        lastT = 0
        lastC = 0

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            policies[policy] += 1
            obs, _, done, info = self.env.step(action)
            #self.env.render()

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']
            # Decompose the reward into x-progress, time penalty, coins, and death penalty
            rewardX = info['x_pos'] - lastX
            lastX = info['x_pos']
            rewardT = info['time'] - lastT
            if rewardT > 0:
                rewardT = 0
            lastT = info['time']
            rewardC = info['coins'] - lastC
            lastC = info['coins']
            rewardD = self.env.unwrapped._death_penalty

            next_state = np.array(obs)
            rewardsSum = np.add(rewardsSum, rewardX)
            rewardsSum = np.add(rewardsSum, rewardT)
            rewardsSum = np.add(rewardsSum, rewardC)
            rewardsSum = np.add(rewardsSum, rewardD)

            self.agent.addMemory(state, action, policy,
                                 [rewardX, rewardT, rewardC, rewardD],
                                 next_state, done)
            loss = self.agent.trainDQN()
            state = next_state
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(lossSums))]

        # Anneal epsilon exponentially toward epsilon_min, driven by the agent's step count
        self.agent.epsilon = self.agent.epsilon_min + (
            1 - self.agent.epsilon_min) * math.exp(-1 * (
                (self.agent.step + 1) / self.agent.epsilon_decay))

        print(
            "now epsilon is {}, the reward is {} with loss {} in episode {}, step {}, dist {}"
            .format(self.agent.epsilon, rewardsSum, lossSums,
                    self.current_episode, self.agent.step, maxDistance))

        self.episode_score.append(rewardsSum)
        self.episode_policies.append(policies)

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)
            self.plot()

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM so np.mean/np.std get a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax[0].title.set_text("Training Score")
        self.ax[0].set_xlabel("Episode")
        self.ax[0].set_ylabel("Score")

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text("Policy Choices")
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the last group to GROUP_NUM so np.mean/np.std get a rectangular array
                ep_groups[-1] = np.append(
                    ep_groups[-1],
                    [np.mean(ep_groups[-1])] *
                    (GROUP_NUM - len(ep_groups[-1])),
                )
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind="cubic",
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind="cubic",
                                   fill_value="extrapolate")
                self.ax[1].plot(
                    spline_x,
                    avg_spl(spline_x),
                    lw=0.7,
                    c=colors[i],
                    label="{} policy".format(PolEnum(i).name),
                )
                self.ax[1].fill_between(
                    spline_x,
                    avg_spl(spline_x) - std_spl(spline_x),
                    avg_spl(spline_x) + std_spl(spline_x),
                    alpha=0.5,
                    facecolor=colors[-1 - i],
                    interpolate=True,
                )

        self.ax[1].legend()

        self.fig.canvas.draw()
        plt.savefig("mario_w_pddqn_{}.png".format(self.current_episode))
Example #3
class MarioBaseline(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(2, 2)
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        # Apply Observation Wrappers
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        # Apply Control Wrappers
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env = NoopResetEnv(self.env)
        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(84, 84, 4),
                              actionSpace=self.env.action_space, numPicks=32, memorySize=100000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = np.array(self.env.reset()).transpose(3, 1, 2, 0)
        maxDistance = -1000000
        lastX = 0

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:
                qSum += q
                qActions += 1

            obs, reward, done, info = self.env.step(action)
            self.env.render()

            # Reward shaping: penalize moving backwards, bonus for reaching the flag
            if info['x_pos'] < lastX:
                reward -= 1
            lastX = info['x_pos']
            if info['flag_get']:
                reward += 10

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            nextState = np.array(obs).transpose(3, 1, 2, 0)
            rewardsSum = np.add(rewardsSum, reward)

            self.agent.addMemory((state, action, reward, nextState, done))
            loss = self.agent.trainDQN()
            state = nextState
            lossSum += loss

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".format(
            self.agent.epsilon, rewardsSum, lossSum, self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum/qActions)
        self.episode_distance.append(maxDistance)
        self.episode_loss.append(lossSum)

    def plot(self):
        self.ax[0][0].title.set_text('Training Score')
        self.ax[0][0].set_xlabel('Episode')
        self.ax[0][0].set_ylabel('Score')
        self.ax[0][0].plot(self.episode_score, 'b')

        self.ax[0][1].title.set_text('Training Distance')
        self.ax[0][1].set_xlabel('Episode')
        self.ax[0][1].set_ylabel('Distance')
        self.ax[0][1].plot(self.episode_distance, 'g')

        self.ax[1][0].title.set_text('Training Loss')
        self.ax[1][0].set_xlabel('Episode')
        self.ax[1][0].set_ylabel('Loss')
        self.ax[1][0].plot(self.episode_loss, 'r')

        self.ax[1][1].title.set_text('Training Q Vals')
        self.ax[1][1].set_xlabel('Episode')
        self.ax[1][1].set_ylabel('Qs')
        self.ax[1][1].plot(self.episode_qs, 'c')
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
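None of the examples include their entry point. Given Example #3's constructor signature, a typical driver (the episode count is illustrative) would be:

if __name__ == '__main__':
    trainer = MarioBaseline(episodes=10000)
    trainer.train()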
Example #4
        file.write(f"Prioritized Replay:\t {PRIORITIZED_REPLAY}")

    # Environment
    env = FrameStack(AtariPreprocessing(gym.make(ENV_NAME)), FRAMES_NUMBER)

    # Dimensions of observations
    obs_dim = env.observation_space.shape

    # Amount of actions
    n_outputs = env.action_space.n

    # Neural networks
    policy_net = DuelingDQN(obs_dim, n_outputs, device).to(device) if DUELING_DQN \
            else DQN(obs_dim, n_outputs, device).to(device)
    target_net = DuelingDQN(obs_dim, n_outputs, device).to(device) if DUELING_DQN \
            else DQN(obs_dim, n_outputs, device).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    # Optimizer
    optimizer = optim.Adam(policy_net.parameters(), lr=LEARNING_RATE)

    # Replay memory
    memory = PER(MEMORY_SIZE) if PRIORITIZED_REPLAY \
        else ExperienceReplay(MEMORY_SIZE)

    # Train policy network
    train()

    # Close environment
    env.close()
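Example #4 relies on project-local DQN, DuelingDQN, PER, and ExperienceReplay classes. A minimal sketch of a uniform replay buffer compatible with the ExperienceReplay(MEMORY_SIZE) call above (the push/sample method names are assumptions, not the project's API):

import random
from collections import deque

class ExperienceReplay:
    """Uniform-sampling replay buffer; method names are illustrative."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store one transition, evicting the oldest when the buffer is full
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Draw a uniformly random mini-batch of stored transitions
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)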