Example 1
def test_frame_stack(env_id, num_stack, lz4_compress):
    env = gym.make(env_id)
    shape = env.observation_space.shape
    env = FrameStack(env, num_stack, lz4_compress)
    assert env.observation_space.shape == (num_stack, ) + shape
    assert env.observation_space.dtype == env.env.observation_space.dtype

    # After reset, the stack should be filled with copies of the initial observation.
    obs = env.reset()
    obs = np.asarray(obs)
    assert obs.shape == (num_stack, ) + shape
    for i in range(1, num_stack):
        assert np.allclose(obs[i - 1], obs[i])

    # After one step, only the newest (last) frame should differ from the rest.
    obs, _, _, _ = env.step(env.action_space.sample())
    obs = np.asarray(obs)
    assert obs.shape == (num_stack, ) + shape
    for i in range(1, num_stack - 1):
        assert np.allclose(obs[i - 1], obs[i])
    assert not np.allclose(obs[-1], obs[-2])

    obs, _, _, _ = env.step(env.action_space.sample())
    assert len(obs) == num_stack
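For reference, a minimal sketch of the behaviour this test exercises, assuming the classic gym API used above (reset() returning just the observation and step() returning a 4-tuple); the CartPole-v1 env id is only a placeholder:

import gym
import numpy as np
from gym.wrappers import FrameStack

# Stack the last 4 observations; the wrapper returns LazyFrames objects.
env = FrameStack(gym.make("CartPole-v1"), num_stack=4)
obs = np.asarray(env.reset())      # shape: (4,) + original shape, all copies of the first frame
obs, reward, done, info = env.step(env.action_space.sample())
obs = np.asarray(obs)              # oldest frame first, newest frame last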
Example 2
def test_frame_stack(env_id, num_stack, lz4_compress):
    env = gym.make(env_id)
    shape = env.observation_space.shape
    env = FrameStack(env, num_stack, lz4_compress)
    assert env.observation_space.shape == (num_stack, ) + shape
    assert env.observation_space.dtype == env.env.observation_space.dtype

    # A second, unwrapped copy of the environment, stepped in lockstep:
    # the newest stacked frame must always match its raw observation.
    dup = gym.make(env_id)

    obs = env.reset(seed=0)
    dup_obs = dup.reset(seed=0)
    assert np.allclose(obs[-1], dup_obs)

    for _ in range(num_stack**2):
        action = env.action_space.sample()
        dup_obs, _, _, _ = dup.step(action)
        obs, _, _, _ = env.step(action)
        assert np.allclose(obs[-1], dup_obs)

    assert len(obs) == num_stack
Example 3
File: train.py Project: nik-sm/dqn
class Agent:
    def __init__(self,
                 game: str,
                 replay_buffer_capacity: int,
                 replay_start_size: int,
                 batch_size: int,
                 discount_factor: float,
                 lr: float,
                 device: str = 'cuda:0',
                 env_seed: int = 0,
                 frame_buffer_size: int = 4,
                 print_self=True):

        self.device = device
        self.discount_factor = discount_factor
        self.game = game
        self.batch_size = batch_size

        self.replay_buf = ReplayBuffer(capacity=replay_buffer_capacity)

        self.env = FrameStack(
            AtariPreprocessing(
                gym.make(self.game),
                # noop_max=0,
                # terminal_on_life_loss=True,
                scale_obs=False),
            num_stack=frame_buffer_size)
        self.env.seed(env_seed)
        self.reset()

        self.n_action = self.env.action_space.n
        self.policy_net = DQN(self.n_action).to(self.device)
        self.target_net = DQN(self.n_action).to(self.device).eval()
        self.optimizer = RMSprop(
            self.policy_net.parameters(),
            alpha=0.95,
            # momentum=0.95,
            eps=0.01)

        if print_self:
            print(self)
        self._fill_replay_buf(replay_start_size)

    def __repr__(self):
        return '\n'.join([
            'Agent:', f'Game: {self.game}', f'Device: {self.device}',
            f'Policy net: {self.policy_net}', f'Target net: {self.target_net}',
            f'Replay buf: {self.replay_buf}'
        ])

    def _fill_replay_buf(self, replay_start_size):
        for _ in trange(replay_start_size,
                        desc='Fill replay_buf randomly',
                        leave=True):
            self.step(1.0)

    def reset(self):
        """Reset the end, pre-populate self.frame_buf and self.state"""
        self.state = self.env.reset()

    @torch.no_grad()
    def step(self, epsilon, clip_reward=True):
        """
        Choose an action based on current state and epsilon-greedy policy
        """
        # Choose action
        if random.random() <= epsilon:
            q_values = None
            action = self.env.action_space.sample()
        else:
            torch_state = torch.tensor(self.state,
                                       dtype=torch.float32,
                                       device=self.device).unsqueeze(0) / 255.0
            q_values = self.policy_net(torch_state)
            action = int(q_values.argmax(dim=1).item())

        # Apply action
        next_state, reward, done, _ = self.env.step(action)
        if clip_reward:
            reward = max(-1.0, min(reward, 1.0))

        # Store into replay buffer
        self.replay_buf.append(
            (torch.tensor(
                np.array(self.state), dtype=torch.float32, device="cpu") /
             255., action, reward,
             torch.tensor(
                 np.array(next_state), dtype=torch.float32, device="cpu") /
             255., done))

        # Advance to next state
        self.state = next_state
        if done:
            self.reset()

        return reward, q_values, done

    def q_update(self):
        self.optimizer.zero_grad()
        states, actions, rewards, next_states, dones = [
            x.to(self.device) for x in self.replay_buf.sample(self.batch_size)
        ]

        with torch.no_grad():
            y = torch.where(
                dones, rewards, rewards +
                self.discount_factor * self.target_net(next_states).max(1)[0])

        predicted_values = self.policy_net(states).gather(
            1, actions.unsqueeze(-1)).squeeze(-1)
        loss = huber(y, predicted_values, 2.)
        loss.backward()
        self.optimizer.step()
        return (y - predicted_values).abs().mean()
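A hedged sketch of how this Agent class might be driven in a training loop; the game id, hyperparameters, and the linear epsilon schedule below are illustrative assumptions, not taken from the nik-sm/dqn project, and the periodic target-network sync (not shown in the excerpt above) is likewise omitted:

# Hypothetical driver loop; every value here is a placeholder.
agent = Agent(game="PongNoFrameskip-v4",
              replay_buffer_capacity=100_000,
              replay_start_size=10_000,
              batch_size=32,
              discount_factor=0.99,
              lr=2.5e-4)
eps_start, eps_end, eps_frames = 1.0, 0.1, 1_000_000
for frame in range(eps_frames):
    epsilon = max(eps_end, eps_start - (eps_start - eps_end) * frame / eps_frames)
    reward, q_values, done = agent.step(epsilon)   # act, clip reward, store the transition
    td_error = agent.q_update()                    # one gradient step on a sampled minibatch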
Example 4
class MarioBaseline(object):
    def __init__(self, episodes, checkpoint, current_episode, epsilon):
        self.current_episode = current_episode
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []

        self.env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)

        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(4, 84, 84),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=20000,
                              epsilon=epsilon,
                              checkpoint=checkpoint)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = np.array(self.env.reset())
        maxDistance = -1000000

        while not done:
            action, q = self.agent.selectAction(state)
            # if q != -100000:
            #     qSum += q
            #     qActions += 1
            obs, reward, done, info = self.env.step(action)

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            next_state = np.array(obs)
            rewardsSum = np.add(rewardsSum, reward)

            self.agent.addMemory(FloatTensor(state), LongTensor([action]),
                                 FloatTensor([reward]),
                                 FloatTensor(next_state), LongTensor([done]))
            loss = self.agent.trainDQN()
            state = next_state
            lossSum += loss

            if self.agent.step % self.agent.sync == 0:
                self.agent.targetNetwork.load_state_dict(
                    self.agent.trainNetwork.state_dict())

            self.agent.epsilon = self.agent.epsilon_min + (
                self.agent.epsilon_start - self.agent.epsilon_min) * math.exp(
                    -1 * ((self.agent.step + 1) / self.agent.epsilon_decay))

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)

        print(
            "now epsilon is {}, the reward is {} with loss {} in episode {}, step {}, dist {}"
            .format(self.agent.epsilon, rewardsSum, lossSum,
                    self.current_episode, self.agent.step, maxDistance))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_distance.append(maxDistance)
        self.episode_loss.append(lossSum)
Example 5
class MarioBaseline(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(2, 2)
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        # Apply Observation Wrappers
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        # Apply Control Wrappers
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env = NoopResetEnv(self.env)
        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(84, 84, 4),
                              actionSpace=self.env.action_space, numPicks=32, memorySize=100000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = np.array(self.env.reset()).transpose(3, 1, 2, 0)
        maxDistance = -1000000
        lastX = 0

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:
                qSum += q
                qActions += 1

            obs, reward, done, info = self.env.step(action)
            self.env.render()

            if info['x_pos'] < lastX:
                reward -= 1
            if info['flag_get']:
                reward += 10

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            nextState = np.array(obs).transpose(3, 1, 2, 0)
            rewardsSum = np.add(rewardsSum, reward)

            self.agent.addMemory((state, action, reward, nextState, done))
            loss = self.agent.trainDQN()
            state = nextState
            lossSum += loss

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".format(
            self.agent.epsilon, rewardsSum, lossSum, self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum/qActions)
        self.episode_distance.append(maxDistance)
        self.episode_loss.append(lossSum)

    def plot(self):
        self.ax[0][0].title.set_text('Training Score')
        self.ax[0][0].set_xlabel('Episode')
        self.ax[0][0].set_ylabel('Score')
        self.ax[0][0].plot(self.episode_score, 'b')

        self.ax[0][1].title.set_text('Training Distance')
        self.ax[0][1].set_xlabel('Episode')
        self.ax[0][1].set_ylabel('Distance')
        self.ax[0][1].plot(self.episode_distance, 'g')

        self.ax[1][0].title.set_text('Training Loss')
        self.ax[1][0].set_xlabel('Episode')
        self.ax[1][0].set_ylabel('Loss')
        self.ax[1][0].plot(self.episode_loss, 'r')

        self.ax[1][1].title.set_text('Training Q Vals')
        self.ax[1][1].set_xlabel('Episode')
        self.ax[1][1].set_ylabel('Qs')
        self.ax[1][1].plot(self.episode_qs, 'c')
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
Example 6
class MarioBaseline(object):
    def __init__(self, episodes, checkpoint, current_episode, epsilon):
        self.current_episode = current_episode
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(12, 4))
        self.fig.canvas.draw()

        self.env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        # Apply Observation Wrappers
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        # Apply Control Wrappers
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env = NoopResetEnv(self.env)
        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(4, 84, 84),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=20000,
                              numRewards=4,
                              epsilon=epsilon,
                              checkpoint=checkpoint)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0
        policies = [0] * (4 + 1)
        lossSums = [0] * (4)

        state = np.array(self.env.reset())
        maxDistance = -1000000
        lastX = 0
        lastT = 0
        lastC = 0

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            policies[policy] += 1
            obs, _, done, info = self.env.step(action)
            #self.env.render()

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']
            rewardX = info['x_pos'] - lastX
            lastX = info['x_pos']
            rewardT = info['time'] - lastT
            if rewardT > 0: rewardT = 0
            lastT = info['time']
            rewardC = info['coins'] - lastC
            lastC = info['coins']
            rewardD = self.env.unwrapped._death_penalty

            next_state = np.array(obs)
            rewardsSum = np.add(rewardsSum, rewardX)
            rewardsSum = np.add(rewardsSum, rewardT)
            rewardsSum = np.add(rewardsSum, rewardC)
            rewardsSum = np.add(rewardsSum, rewardD)

            self.agent.addMemory(state, action, policy,
                                 [rewardX, rewardT, rewardC, rewardD],
                                 next_state, done)
            loss = self.agent.trainDQN()
            state = next_state
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(lossSums))]

        self.agent.epsilon = self.agent.epsilon_min + (
            1 - self.agent.epsilon_min) * math.exp(-1 * (
                (self.agent.step + 1) / self.agent.epsilon_decay))

        print(
            "now epsilon is {}, the reward is {} with loss {} in episode {}, step {}, dist {}"
            .format(self.agent.epsilon, rewardsSum, lossSums,
                    self.current_episode, self.agent.step, maxDistance))

        self.episode_score.append(rewardsSum)
        self.episode_policies.append(policies)

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)
            self.plot()

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last (possibly short) group with its own mean so the groups
        # form a rectangular array for np.mean/np.std below.
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax[0].title.set_text("Training Score")
        self.ax[0].set_xlabel("Episode")
        self.ax[0].set_ylabel("Score")

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text("Policy Choices")
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the last (possibly short) group with its own mean so the
                # groups form a rectangular array for np.mean/np.std below.
                ep_groups[-1] = np.append(
                    ep_groups[-1],
                    [np.mean(ep_groups[-1])] *
                    (GROUP_NUM - len(ep_groups[-1])),
                )
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind="cubic",
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind="cubic",
                                   fill_value="extrapolate")
                self.ax[1].plot(
                    spline_x,
                    avg_spl(spline_x),
                    lw=0.7,
                    c=colors[i],
                    label="{} policy".format(PolEnum(i).name),
                )
                self.ax[1].fill_between(
                    spline_x,
                    avg_spl(spline_x) - std_spl(spline_x),
                    avg_spl(spline_x) + std_spl(spline_x),
                    alpha=0.5,
                    facecolor=colors[-1 - i],
                    interpolate=True,
                )

        self.ax[1].legend()

        self.fig.canvas.draw()
        plt.savefig("mario_w_pddqn_{}.png".format(self.current_episode))