Example 1
    def learn(self,
              num_episodes=300,
              batch_size=None,
              print_progess_frequency=10,
              min_replay_samples=50,
              repeat_train=16,
              imshow=True):
        # Main method for agent learning
        if batch_size is not None:
            self.batch_size = batch_size

        # Reset steps_done at the start of learning so the random-action ratio decays gradually
        self.steps_done = 0
        train_cnt = 0
        success_cnt = 0
        keep_success_cnt = 0
        start_train_episode = 0
        start_train = False
        # Collect initial data
        while start_train == False:
            # Reset the environment
            self.env.reset()
            # Reset the accumulated reward
            total_rewards = 0
            state = self.get_observation()

            for t in count():
                # Select an action based on the current state
                action = self.select_action(state, model_only=False)
                # Get the reward for the action and whether the episode is done (the environment has now advanced to the next time step)
                reward, done = self.get_rewards(action)

                # Accumulate the reward
                total_rewards += reward

                # Force termination once the task is complete (based on 300 steps)
                conplete = (not done and t + 1 >= 300)

                if imshow:
                    # Refresh the rendered display
                    self.env.render()
                # Get the observation at the next time step
                next_state = None if done and not conplete else self.get_observation()

                # Store the (state, action, next_state, reward) tuple in memory
                # Uncomment if you want to reduce the storage ratio of "good" cases
                self.memory.push(state, action, next_state, reward)
                if len(self.memory) % 100 == 0:
                    print("Replay Samples:{0}".format(len(self.memory)))
                if len(self.memory) == min_replay_samples:
                    print('Start Train!!', flush=True)
                    # Start training only once the memory holds at least min_replay_samples transitions
                    start_train = (len(self.memory) >= min_replay_samples)
                    break

                # Move to the next state
                state = copy.deepcopy(next_state)

                if done or conplete:
                    break

        # Start training mode
        self.training_context['steps'] = 0
        self.steps_done = 0
        for i_episode in range(num_episodes):
            for i in range(repeat_train):
                # Experience replay supplies the training mini-batches
                self.output_fn = self.experience_replay

                # Train the model
                self.train_model(None,
                                 None,
                                 current_epoch=i_episode,
                                 current_batch=i,
                                 total_epoch=num_episodes,
                                 total_batch=repeat_train,
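                                 # note: t is carried over from the data-collection loop above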
                                 is_collect_data=True if t >= 0 else False,
                                 is_print_batch_progress=False,
                                 is_print_epoch_progress=False,
                                 log_gradients=False,
                                 log_weights=False,
                                 accumulate_grads=False)

            # Periodically update the target_net weights
            if i_episode % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict(),
                                                strict=True)
                self.save_model(save_path=self.training_context['save_path'])

            # Reset the environment
            self.env.reset()
            # Reset the accumulated reward
            total_rewards = 0
            state = self.get_observation()
            tmp_memory = []

            for t in count():
                # Take one optimization step via the optimizer

                # Select an action based on the current state
                action = self.select_action(state, model_only=True)
                # Get the reward for the action and whether the episode is done (the environment has now advanced to the next time step)
                reward, done = self.get_rewards(action)
                # Accumulate the reward
                total_rewards += reward

                # Force termination once the task is complete (based on 300 steps)
                conplete = (not done and t + 1 >= 300)

                if imshow:
                    # Refresh the rendered display
                    self.env.render()
                # Get the observation at the next time step
                next_state = None if done else self.get_observation()

                # Store the (state, action, next_state, reward) tuple in the temporary episode buffer
                tmp_memory.append((state, action, next_state, reward))

                # Move to the next state
                state = next_state

                if done or conplete:
                    if t >= 200:
                        success_cnt += 1
                    else:
                        success_cnt = 0

                    # If 300 points are reached in consecutive episodes, stop learning

                    if t + 1 >= 300:
                        keep_success_cnt += 1
                    else:
                        keep_success_cnt = 0
                    if keep_success_cnt >= 2:
                        self.training_context['stop_update'] = 1
                    else:
                        self.training_context['stop_update'] = 0

                    # Record the accumulated reward
                    self.epoch_metric_history.collect('total_rewards',
                                                      i_episode,
                                                      float(total_rewards))
                    self.epoch_metric_history.collect('original_rewards',
                                                      i_episode, float(t))
                    # Record the task-completion rate (based on 200 steps)
                    self.epoch_metric_history.collect(
                        'task_complete', i_episode,
                        1.0 if t + 1 >= 200 else 0.0)
                    # Periodically print learning progress
                    if i_episode > 0 and i_episode % print_progess_frequency == 0:
                        self.print_epoch_progress(print_progess_frequency)
                    # Periodically plot the loss and metric trends over time
                    if i_episode > 0 and i_episode % (
                            5 * print_progess_frequency) == 0:
                        print(
                            'negative_reward_ratio:',
                            less(
                                self.training_context['train_data']
                                ['reward_batch'], 0).mean().item())
                        print(
                            'predict_rewards:',
                            self.training_context['train_data']
                            ['predict_rewards'].copy()[:5, 0])
                        print(
                            'target_rewards:',
                            self.training_context['train_data']
                            ['target_rewards'].copy()[:5, 0])
                        print(
                            'reward_batch:',
                            self.training_context['train_data']
                            ['reward_batch'].copy()[:5])
                        loss_metric_curve(self.epoch_loss_history,
                                          self.epoch_metric_history,
                                          legend=['dqn'],
                                          calculate_base='epoch',
                                          imshow=imshow)

                    if success_cnt == 50:
                        self.save_model(
                            save_path=self.training_context['save_path'])
                        print('50 episodes success, training finish! ')
                        return True

                    break
            # print([item[3] for item in tmp_memory])
            sample_idx = []
            indexs = list(range(len(tmp_memory)))
            if len(tmp_memory) > 10:
                # Keep only the last 3 transitions before failure plus a random sample of about sqrt(len(tmp_memory)) transitions
                sample_idx.extend(indexs[-1 * min(3, len(tmp_memory)):])
                sample_idx.extend(
                    random_choice(indexs[:-3], int(sqrt(len(tmp_memory)))))

            sample_idx = list(set(sample_idx))
            for k in range(len(tmp_memory)):
                state, action, next_state, reward = tmp_memory[k]
                if k in sample_idx or (k + 3 < len(tmp_memory) and
                                       tmp_memory[k + 1][3] < 1) or reward < 1:
                    self.memory.push(state, action, next_state, reward)

        print('Complete')
        self.env.render()
        self.env.close()
        plt.ioff()
        plt.show()
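Both learn() examples rely on a replay buffer (self.memory) exposing push() and on experience_replay() to assemble training batches. Below is a minimal sketch of such a buffer, assuming a capacity-bounded deque with uniform random sampling; the class name, default capacity, and sample() method are illustrative assumptions rather than the library's actual implementation.

import random
from collections import deque, namedtuple

# Minimal replay-buffer sketch (assumed interface; not the library's own class).
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)  # oldest transitions are evicted first

    def push(self, state, action, next_state, reward):
        # Store one (s, a, s', r) transition, mirroring memory.push(...) above.
        self.buffer.append(Transition(state, action, next_state, reward))

    def sample(self, batch_size):
        # Uniform random mini-batch used for experience replay.
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)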
Example 2
    def learn(self,
              num_episodes=300,
              batch_size=None,
              print_progess_frequency=10,
              imshow=True):
        """The main method for the agent learn

        Returns:
            object:
        """
        if batch_size is not None:
            self.batch_size = batch_size

        self.steps_done = 0
        for i_episode in range(num_episodes):
            # reset environment
            self.env.reset()
            # clear rewards
            total_rewards = 0
            state = self.get_observation()

            # Start training only once the memory holds more samples than the batch size
            start_train = (len(self.memory) > self.batch_size)
            for t in count():
                # Select an action based on the current state
                action = self.select_action(state)
                # Get the reward for the action and whether the episode is done (the environment has now advanced to the next time step)
                reward, done = self.get_rewards(action)
                # Accumulate the reward
                total_rewards += reward

                # Force termination once the task is complete (based on 300 steps)
                conplete = (not done and t + 1 >= 300)

                if imshow:
                    # Refresh the rendered display
                    self.env.render()
                # get next state
                next_state = self.get_observation()

                # Store the transition in memory; reducing the storage ratio of "good" cases is recommended
                if reward < 1 or (reward == 1 and i_episode < 20) or (
                        reward == 1 and i_episode >= 20 and
                        ((t < 100 and random.random() < 0.1) or
                         (t >= 100 and random.random() < 0.2))):
                    self.memory.push(state, action, next_state, reward)

                # switch to the next state
                state = deepcopy(next_state)

                if start_train:
                    # get a batch of data via experience replay
                    trainData = self.experience_replay(self.batch_size)
                    # switch model to training mode
                    self.policy_net.train()
                    self.train_model(
                        trainData,
                        None,
                        current_epoch=i_episode,
                        current_batch=t,
                        total_epoch=num_episodes,
                        total_batch=t + 1 if done or conplete else t + 2,
                        is_collect_data=True if done or conplete else False,
                        is_print_batch_progress=False,
                        is_print_epoch_progress=False,
                        log_gradients=False,
                        log_weights=False,
                        accumulate_grads=False)

                if done or conplete:
                    if start_train:

                        # self.epoch_metric_history.collect('episode_durations',i_episode,float(t))
                        # Record the accumulated reward
                        self.epoch_metric_history.collect(
                            'total_rewards', i_episode, float(total_rewards))
                        # Record the task-completion rate (based on 200 steps)
                        self.epoch_metric_history.collect(
                            'task_complete', i_episode,
                            1.0 if t + 1 >= 200 else 0.0)
                        # Periodically print learning progress
                        if i_episode % print_progess_frequency == 0:
                            self.print_epoch_progress(print_progess_frequency)
                        # Periodically plot the loss and metric trends over time
                        if i_episode > 0 and (i_episode + 1) % (
                                5 * print_progess_frequency) == 0:
                            print('epsilon:', self.epsilon)
                            print(
                                'predict_rewards:',
                                self.training_context['train_data']
                                ['predict_rewards'][:5])
                            print(
                                'target_rewards:',
                                self.training_context['train_data']
                                ['target_rewards'][:5])
                            print(
                                'reward_batch:',
                                self.training_context['train_data']
                                ['reward_batch'][:5])
                            loss_metric_curve(self.epoch_loss_history,
                                              self.epoch_metric_history,
                                              legend=['dqn'],
                                              calculate_base='epoch',
                                              imshow=imshow)

                    break

            # Periodically update the target_net weights
            if start_train and i_episode % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict(),
                                                strict=True)
                self.save_model(save_path=self.training_context['save_path'])

        print('Complete')
        self.env.render()
        self.env.close()
        plt.ioff()
        plt.show()
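Example 2 calls self.select_action(state) and later prints self.epsilon, which suggests an epsilon-greedy policy whose exploration rate decays with steps_done. The sketch below illustrates that idea under those assumptions; the decay constants and function signature are hypothetical, not the library's API.

import math
import random
import torch

# Hypothetical epsilon-greedy selection; decay constants are illustrative.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200


def select_action(policy_net, state, steps_done, num_actions):
    # Exploration rate decays exponentially with the number of steps taken.
    epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    if random.random() > epsilon:
        with torch.no_grad():
            # Exploit: choose the action with the highest predicted Q-value.
            return policy_net(state).argmax(dim=-1).item()
    # Explore: choose a random action.
    return random.randrange(num_actions)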
Example 3
    def play(self,
             num_episodes,
             batch_size=1,
             min_replay_samples=1,
             print_progess_frequency=5,
             training=True,
             train_timing='on_episode_start',
             train_every_nstep=1,
             repeat_train=1,
             need_render=True):
        if train_timing not in [
                'on_episode_start', 'on_step_end', 'on_step_start'
        ]:
            raise ValueError(
                'Only on_episode_start, on_step_start and on_step_end are valid train_timing options'
            )

        if training:
            self._model.train()
        else:
            self._model.eval()
        if self.use_experience_replay:
            self.collect_samples(min_replay_samples=min_replay_samples)
        else:
            self.collect_samples(
                min_replay_samples=1,
                need_render=True if self.replay_unit == 'episode' else False)
            print('start train....')
        self.state_pool = []
        self.reward_pool = []
        self.action_pool = []

        self.total_reward = 0
        self.t = 0
        self.i_episode = 0
        if hasattr(self.env, 'recording_enabled'):
            self.env.recording_enabled = True
        for i_episode in range(num_episodes):
            self.i_episode = i_episode

            if training and train_timing == 'on_episode_start' and i_episode % train_every_nstep == 0:
                self.training_model(i_episode,
                                    0,
                                    num_episodes=num_episodes,
                                    repeat_train=repeat_train,
                                    train_timing=train_timing,
                                    batch_size=batch_size)
            self.env.reset()
            self.total_rewards = 0
            state = self.get_observation()
            for t in count():
                self.t = t
                # # Train on_step_start
                # if training and train_timing == 'on_step_start' and t % train_every_nstep == 0:
                #     self.training_model(i_episode, t,num_episodes=num_episodes, repeat_train=repeat_train, batch_size=batch_size)

                action = self.select_action(state, model_only=True)
                observation, reward, done, info = self.get_rewards(action)

                self.total_rewards += reward

                next_state = self.get_observation() if not done else None

                if need_render:
                    self.env.render()
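                # Store transitions per step or as whole episodes, depending on replay_unit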
                if self.replay_unit == 'step':
                    if self.push_into_memory_criteria(
                            state, action, next_state, reward) or done:
                        self.memory.push(state, action, next_state, reward)
                elif self.replay_unit == 'episode':
                    self.state_pool.append(state)
                    self.action_pool.append(action)
                    self.reward_pool.append(reward)
                    if done:
                        if self.push_into_memory_criteria(
                                self.state_pool, self.action_pool, None,
                                self.reward_pool):
                            self.memory.push(self.state_pool, self.action_pool,
                                             None, self.reward_pool)
                        self.state_pool = []
                        self.action_pool = []
                        self.reward_pool = []

                complete = self.episode_complete_criteria()
                # Train on_step_end
                if training and train_timing == 'on_step_end' and t % train_every_nstep == 0:
                    self.training_model(i_episode,
                                        t,
                                        num_episodes=num_episodes,
                                        done=done or complete,
                                        repeat_train=repeat_train,
                                        train_timing=train_timing,
                                        batch_size=batch_size,
                                        accumulate_grads=False)

                state = next_state
                if done or complete:
                    self.epoch_metric_history.collect(
                        'rewards', i_episode, float(self.total_rewards))
                    self.epoch_metric_history.collect('t', i_episode,
                                                      float(t + 1))
                    if self.use_experience_replay:
                        self.epoch_metric_history.collect(
                            'replay_buffer_utility', i_episode,
                            float(len(self.memory)) / self.memory.capacity)

                    if print_progess_frequency == 1 or (
                            i_episode > 0 and
                        (i_episode + 1) % print_progess_frequency == 0):
                        self.print_epoch_progress(print_progess_frequency)
                        # n1 = self.action_logs['model'][0]
                        # n2 = self.action_logs['model'][1]
                        # n3 = self.action_logs['random'][0]
                        # n4 = self.action_logs['random'][1]
                        # print('model: 0:{0} 1:{1}  random: 0:{2} 1:{3}  random: {4}'.format(float(n1) / (n1 + n2), float(n2) / (n1 + n2), float(n3) / builtins.max(n3 + n4,1),
                        #                                                                                       float(n4) / builtins.max(n3 + n4,1), float(n3 + n4) / builtins.max(n1 + n2 + n3 + n4,1)))
                        #
                        # self.action_logs = OrderedDict()
                        # self.action_logs['model'] = OrderedDict()
                        # self.action_logs['random'] = OrderedDict()
                        # self.action_logs['model'][0] = 0
                        # self.action_logs['model'][1] = 0
                        # self.action_logs['random'][0] = 0
                        # self.action_logs['random'][1] = 0
                    # Periodically plot the loss and metric trends over time
                    if i_episode > 0 and (i_episode + 1) % (
                            5 * print_progess_frequency) == 0:
                        loss_metric_curve(
                            self.epoch_loss_history,
                            self.epoch_metric_history,
                            metrics_names=list(
                                self.epoch_metric_history.keys()),
                            calculate_base='epoch',
                            imshow=True)

                    if self.task_complete_criteria():
                        self.save_model(
                            save_path=self.training_context['save_path'])
                        print(
                            'episode {0} meet task complete criteria, training finish! '
                            .format(i_episode))
                        return True

                    break

        print('Complete')
        self.env.render()
        self.env.close()
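All three examples periodically copy the policy_net weights into target_net and compare predict_rewards with target_rewards, which matches the standard DQN bootstrapped target. The sketch below shows that computation for one sampled batch, assuming PyTorch tensors and a discount factor gamma; it illustrates the general technique rather than the library's exact train_model() step.

import torch
import torch.nn.functional as F


def dqn_loss(policy_net, target_net, batch, gamma=0.99):
    # batch is assumed to unpack into tensors: states [B, obs], actions [B] (int64),
    # rewards [B], next_states [B, obs], dones (bool mask) [B].
    states, actions, rewards, next_states, dones = batch

    # Q(s, a) predicted by the online network for the actions actually taken.
    predict_rewards = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states.
        next_q = target_net(next_states).max(dim=1).values
        target_rewards = rewards + gamma * next_q * (~dones).float()

    # Huber loss between prediction and target, as is common for DQN.
    return F.smooth_l1_loss(predict_rewards, target_rewards)

A hard target update, as in the examples, is then target_net.load_state_dict(policy_net.state_dict()) every target_update episodes.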