Example #1
import datetime

from gym import wrappers

# The helper functions used below (pickle_save, load_training_data,
# print_training_progress, init_q, eps_greedy_q, max_action_q, decrement_eps)
# are assumed to be defined elsewhere in the project and importable here.

def save_training_data(agent, learn_episode_index, scores_history):
    save_start_time = datetime.datetime.now()
    pickle_save(learn_episode_index, 'learn_episode_index', agent.chkpt_dir)
    pickle_save(scores_history, 'scores_history_train', agent.chkpt_dir)
    agent.save_models()
    print('Save time: %s' %
          str(datetime.datetime.now() - save_start_time).split('.')[0])


def train_agent(custom_env,
                agent,
                n_episodes,
                enable_models_saving,
                load_checkpoint,
                visualize=False,
                record=False):
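    """Run (or resume) the agent's training loop and return the per-episode score history."""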

    scores_history, learn_episode_index, max_avg = load_training_data(
        agent, load_checkpoint)

    env = custom_env.env

    if record:
        # record video only for the first and last episodes
        env = wrappers.Monitor(
            env, 'recordings/DDPG/', force=True,
            video_callable=lambda episode_id:
                episode_id == 0 or episode_id == (n_episodes - 1))

    print('\n', 'Training Started', '\n')
    train_start_time = datetime.datetime.now()

    starting_ep = learn_episode_index + 1
    for i in range(starting_ep, n_episodes):
        ep_start_time = datetime.datetime.now()

        done = False
        ep_score = 0

        agent.noise.reset()  # reset the exploration noise process at the start of each episode

        observation = env.reset()
        s = custom_env.get_state(observation, None)

        if visualize and i == n_episodes - 1:
            env.render()

        while not done:
            # choose an action, step the environment, and derive the shaped reward and next state
            a = agent.choose_action(s, training_mode=True)
            observation_, r, done, info = env.step(a)
            r = custom_env.update_reward(r, done, info)
            s_ = custom_env.get_state(observation_, s)
            ep_score += r
            # store the transition and let the agent perform a learning step
            agent.store_transition(s, a, r, s_, done)
            agent.learn_wrapper()
            observation, s = observation_, s_

            if visualize and i == n_episodes - 1:
                env.render()

        scores_history.append(ep_score)
        pickle_save(scores_history, 'scores_history_train_total',
                    agent.chkpt_dir)

        current_avg = print_training_progress(i,
                                              ep_score,
                                              scores_history,
                                              ep_start_time=ep_start_time)

        if enable_models_saving and current_avg is not None and \
                (max_avg is None or current_avg >= max_avg):
            max_avg = current_avg
            pickle_save(max_avg, 'max_avg', agent.chkpt_dir)
            save_training_data(agent, i, scores_history)

        if visualize and i == n_episodes - 1:
            env.close()

    print(
        '\n', 'Training Ended ~~~ Episodes: %d ~~~ Runtime: %s' %
        (n_episodes - starting_ep,
         str(datetime.datetime.now() - train_start_time).split('.')[0]), '\n')

    return scores_history
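
# Note: the method below belongs to a Q-learning agent class whose definition is
# omitted here; self.env, self.episodes, self.EPS, self.ALPHA, self.GAMMA,
# self.eps_min, self.eps_dec, self.eps_dec_type and the self.total* arrays are
# assumed to be initialized in that class's constructor.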
    def perform_q_learning(self, visualize=False, record=False, pickle=False):
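        """Run tabular Q-learning and return the Q-table, per-episode scores, and accumulated scores."""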
        if record:
            # record video only for the first and last episodes
            self.env = wrappers.Monitor(
                self.env, 'recordings/Q-L/', force=True,
                video_callable=lambda episode_id:
                    episode_id == 0 or episode_id == (self.episodes - 1))

        Q = init_q(self.states, self.action_space_size,
                   self.custom_env.file_name, pickle)

        accumulated_scores = 0

        print('\n', 'Game Started', '\n')

        for i in range(self.episodes):
            done = False
            ep_steps = 0
            ep_score = 0

            observation = self.env.reset()

            s = self.custom_env.get_state(observation)

            if visualize and i == self.episodes - 1:
                self.env.render()

            while not done:
                # epsilon-greedy action selection
                a = eps_greedy_q(Q, s, self.action_space_size, self.EPS,
                                 self.env)

                observation_, reward, done, info = self.env.step(a)
                ep_steps += 1
                ep_score += reward
                accumulated_scores += reward

                s_ = self.custom_env.get_state(observation_)
                a_ = max_action_q(Q, s_, self.action_space_size)
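                # Q-learning (off-policy TD) update:
                # Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]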
                Q[s, a] += self.ALPHA * (reward + self.GAMMA * Q[s_, a_] -
                                         Q[s, a])
                # Q[s, a] += self.ALPHA * (reward + self.GAMMA * np.max(Q[s_, :]) - Q[s, a])  # if Q is a numpy.ndarray

                observation, s = observation_, s_

                if visualize and i == self.episodes - 1:
                    self.env.render()

            if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
                print('episode %d - eps: %.2f, score: %d, steps: %d' %
                      (i + 1, self.EPS, ep_score, ep_steps))

            self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec,
                                     self.eps_dec_type)

            self.totalSteps[i] = ep_steps
            self.totalScores[i] = ep_score
            self.totalAccumulatedScores[i] = accumulated_scores

            if visualize and i == self.episodes - 1:
                self.env.close()

        print('\n', 'Game Ended', '\n')

        if pickle:
            pickle_save(Q, self.custom_env.file_name + '-q-table')

        return Q, self.totalScores, self.totalAccumulatedScores
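
# For reference, a minimal sketch of what the tabular helpers used above might look
# like, assuming Q is a dict keyed by (state, action) tuples; these are illustrative
# assumptions only, not the project's actual implementations.
import numpy as np


def max_action_q(Q, s, action_space_size):
    # greedy action: index of the highest-valued action in state s
    return int(np.argmax([Q[s, a] for a in range(action_space_size)]))


def eps_greedy_q(Q, s, action_space_size, eps, env):
    # with probability eps explore via a random action from the environment,
    # otherwise exploit the current Q estimates
    if np.random.random() < eps:
        return env.action_space.sample()
    return max_action_q(Q, s, action_space_size)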