Example #1
import gym
import numpy as np
import torch
import visdom
from tqdm import tqdm

# Project-local helpers used below (their modules are not part of this excerpt):
# wrap_deepmind, ReplayBuffer, DQNet, change_to_tensor


class DQN_trainer:
    def __init__(self, config):
        # env info
        self.env = wrap_deepmind(gym.make('Breakout-v0'), skip=config.action_repeat, no_op_max=config.no_op_max)
        if config.is_monitor:
            self.env = gym.wrappers.Monitor(self.env, 'recording')
        self.action_num = self.env.action_space.n
        self.obs_shape = self.env.observation_space.shape  # [h, w, c]
        self.last_obs = self.env.reset()

        # replay buffer
        self.reply_buffer = ReplayBuffer(config.replay_memory_size, config.agent_history_length)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # build the online (eval) and target networks; input is m stacked frames: [batch_size, m*c, h, w]
        self.eval_model = DQNet(self.obs_shape[2] * config.agent_history_length, self.action_num).to(self.device)
        self.target_model = DQNet(self.obs_shape[2] * config.agent_history_length, self.action_num).to(self.device)

        # train param
        self.exploration = np.linspace(config.initial_exploration, config.final_exploration,
                                       config.final_exploration_frame)
        self.final_exploration_frame = config.final_exploration_frame
        self.discount_factor = config.discount_factor
        self.max_epoch = config.max_epoch
        self.learning_starts = config.learning_starts
        self.update_freq = config.update_freq
        self.target_update_freq = config.target_update_freq
        self.batch_size = config.batch_size

        self.model_path = config.model_path
        self.load_model_freq = config.load_model_freq

        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.eval_model.parameters(), lr=config.learning_rate)

        self.viz = visdom.Visdom(env="DQN_train", log_to_filename="./logs/viz_dqn_train.log")
        self.log_freq = config.log_freq

    def collect_memories(self):
        """
        before DQN begins to learn, collect adequate memories.
        :return:
        """
        print("-------------------collect memories------------------------")
        for step in tqdm(range(self.learning_starts)):
            # store observation
            cur_index = self.reply_buffer.store_memory_obs(self.last_obs)
            # (optional) the stacked observation could be visualized here, as in train() below

            # choose action randomly
            action = self.env.action_space.sample()
            # interact with env
            obs, reward, done, info = self.env.step(action)
            # clip reward
            reward = np.clip(reward, -1.0, 1.0)
            # store other info
            self.reply_buffer.store_memory_effect(cur_index, action, reward, done)

            if done:
                obs = self.env.reset()

            self.last_obs = obs
        print("---------------------------end-----------------------------")

    def train(self):
        """
        train DQN agent
        :return:
        """
        total_reward = 0
        total_step = 0
        total_ave100_reward = 0
        total_ave100_step = 0
        episode = 0
        episode_100 = 0

        train_ave_loss = 0
        log_step = 0

        self.last_obs = self.env.reset()
        print("-------------------train DQN agent------------------------")
        for step in tqdm(range(1, self.max_epoch)):
            cur_index = self.reply_buffer.store_memory_obs(self.last_obs)
            encoded_obs = self.reply_buffer.encoder_recent_observation()  # numpy: [m*c, h, w]

            # visualize last k frames
            image_num = int(encoded_obs.shape[0] / self.obs_shape[2])
            images_numpy = np.array([[encoded_obs[i]] for i in range(image_num)])
            self.viz.images(torch.from_numpy(images_numpy), win="observations")

            sample = np.random.random()
            # anneal epsilon linearly from initial_exploration to final_exploration
            epsilon = self.exploration[min(step, self.final_exploration_frame - 1)]
            if sample > epsilon:
                # numpy: [m*c, h, w] => tensor: [1, m*c, h, w], on the model's device
                encoded_obs = change_to_tensor(encoded_obs).unsqueeze(0).to(self.device)
                pred_action_values = self.eval_model(encoded_obs)  # [1, action_num]
                _, action = pred_action_values.max(dim=1)
                action = action.item()
            else:
                action = self.env.action_space.sample()

            obs, reward, done, info = self.env.step(action)

            total_reward += reward
            total_step += 1

            # reward = np.clip(reward, -1.0, 1.0)

            self.reply_buffer.store_memory_effect(cur_index, action, reward, done)

            if done:
                obs = self.env.reset()
                episode += 1
                total_ave100_reward += total_reward
                total_ave100_step += total_step
                total_reward = 0
                total_step = 0

            self.last_obs = obs

            # train the model
            if step % self.update_freq == 0:
                obs_batch, next_obs_batch, action_batch, reward_batch, done_batch = self.reply_buffer.sample_memories(
                    self.batch_size)
                # numpy to tensor, on the model's device
                obs_batch = change_to_tensor(obs_batch).to(self.device)
                next_obs_batch = change_to_tensor(next_obs_batch).to(self.device)
                action_batch = change_to_tensor(action_batch, torch.int64).to(self.device)
                reward_batch = change_to_tensor(reward_batch).to(self.device)

                # estimate Q values
                q_values = self.eval_model(obs_batch)  # [b, action_num]
                q_pred = q_values.gather(dim=1, index=action_batch)  # [b, 1]

                # target Q values (no gradients flow through the target network)
                q_next = self.target_model(next_obs_batch).detach()
                # Bellman target; terminal transitions get no bootstrapped next-state value
                done_mask = change_to_tensor(done_batch).float().view(self.batch_size, -1).to(self.device)
                q_target = reward_batch + self.discount_factor * (1.0 - done_mask) * q_next.max(dim=1)[0].view(self.batch_size, -1)

                loss = self.criterion(q_pred, q_target)
                train_ave_loss += loss.item()

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # sync the target network with the online network
            if step % self.target_update_freq == 0:
                self.target_model.load_state_dict(self.eval_model.state_dict())

            # save a checkpoint every load_model_freq updates
            if step % self.update_freq == 0 and (step // self.update_freq) % self.load_model_freq == 0:
                torch.save(self.eval_model.state_dict(),
                           self.model_path + '/step%d-trainLoss%.4f.pth' % (step, loss.item()))

            # log the average training loss every log_freq updates
            if step % self.update_freq == 0 and (step // self.update_freq) % self.log_freq == 0:
                log_step += 1
                train_ave_loss = train_ave_loss / self.log_freq
                self.viz.line([train_ave_loss], [log_step], win='train_average_loss', update='append', opts=dict(
                                title="train_average_loss",
                                xlabel="log_step",
                                ylabel="average_loss"
                            ))
                train_ave_loss = 0

            # log the average reward and episode length over the last 100 episodes
            if done and episode % 100 == 0:
                episode_100 += 1
                ave100_reward = total_ave100_reward / 100
                ave100_step = total_ave100_step / 100
                total_ave100_reward = 0
                total_ave100_step = 0
                self.viz.line([ave100_reward], [episode_100], win='average100_reward', update='append', opts=dict(
                    title="average100_reward",
                    xlabel="episode_100",
                    ylabel="average_reward"
                ))
                self.viz.line([ave100_step], [episode_100], win='average100_step', update='append', opts=dict(
                    title="average100_step",
                    xlabel="episode_100",
                    ylabel="average_step"
                ))

        print("---------------------------end-----------------------------")
Example #2
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # build the evaluation network: 4 stacked grayscale frames in, 4 actions out (input: [batch_size, m*c, h, w])
    eval_model = DQNet(1 * 4, 4).to(device)
    eval_model.load_state_dict(torch.load(model_path))

    starts = 10

    total_reward = 0.0
    total_steps = 0
    last_obs = env.reset()
    episode = 1

    while True:

        cur_index = reply_buffer.store_memory_obs(last_obs)
        encoded_obs = reply_buffer.encoder_recent_observation()  # numpy: [m*c, h, w]

        sample = np.random.random()
        if sample > 0.05:
            # numpy: [m*c, h, w] => tensor: [1, m*c, h, w], on the model's device
            encoded_obs = change_to_tensor(encoded_obs).unsqueeze(0).to(device)
            pred_action_values = eval_model(encoded_obs)  # [1, 4]
            _, action = pred_action_values.max(dim=1)
            action = action.item()
        else:
            action = env.action_space.sample()

        # action = env.action_space.sample()
        last_obs, reward, done, _ = env.step(action)
        reply_buffer.store_memory_effect(cur_index, action, reward, done)
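
Since this evaluation loop only needs greedy actions, the forward pass can run without gradient tracking. A small, assumed variant of the greedy branch above, reusing device, eval_model, and change_to_tensor from this snippet:

eval_model.eval()                  # put the network in inference mode
with torch.no_grad():              # gradients are not needed for action selection
    obs_tensor = change_to_tensor(encoded_obs).unsqueeze(0).to(device)
    greedy_action = eval_model(obs_tensor).argmax(dim=1).item()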