def save_training_data(agent, learn_episode_index, scores_history):
    save_start_time = datetime.datetime.now()

    pickle_save(learn_episode_index, 'learn_episode_index', agent.chkpt_dir)
    pickle_save(scores_history, 'scores_history_train', agent.chkpt_dir)

    agent.save_models()

    print('Save time: %s' % str(datetime.datetime.now() - save_start_time).split('.')[0])
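
# Illustrative sketch only - the real pickle_save() and load_training_data() helpers live
# elsewhere in the repo. The '.pkl' file-naming scheme, the checkpoint file names, and the
# FileNotFoundError fallback below are assumptions, chosen to be consistent with how the
# helpers are called in save_training_data() and train_agent().
import os
import pickle


def pickle_save_sketch(obj, name, directory=''):
    # Serialize a single Python object to <directory>/<name>.pkl.
    with open(os.path.join(directory, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)


def load_training_data_sketch(agent, load_checkpoint):
    # Mirror of save_training_data(): restore the last learned episode index, the score
    # history, and the best running average; fall back to a fresh start when no checkpoint
    # exists or loading is disabled.
    scores_history, learn_episode_index, max_avg = [], -1, None
    if load_checkpoint:
        try:
            with open(os.path.join(agent.chkpt_dir, 'learn_episode_index.pkl'), 'rb') as f:
                learn_episode_index = pickle.load(f)
            with open(os.path.join(agent.chkpt_dir, 'scores_history_train.pkl'), 'rb') as f:
                scores_history = pickle.load(f)
            with open(os.path.join(agent.chkpt_dir, 'max_avg.pkl'), 'rb') as f:
                max_avg = pickle.load(f)
            agent.load_models()
        except FileNotFoundError:
            print('No checkpoint found - starting training from scratch')
    return scores_history, learn_episode_index, max_avg
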
def train_agent(custom_env, agent, n_episodes, enable_models_saving, load_checkpoint,
                visualize=False, record=False):

    scores_history, learn_episode_index, max_avg = load_training_data(agent, load_checkpoint)

    env = custom_env.env
    if record:
        env = wrappers.Monitor(
            env, 'recordings/DDPG/', force=True,
            video_callable=lambda episode_id: episode_id == 0 or episode_id == (n_episodes - 1)
        )

    print('\n', 'Training Started', '\n')
    train_start_time = datetime.datetime.now()

    starting_ep = learn_episode_index + 1
    for i in range(starting_ep, n_episodes):
        ep_start_time = datetime.datetime.now()

        done = False
        ep_score = 0

        agent.noise.reset()

        observation = env.reset()
        s = custom_env.get_state(observation, None)

        if visualize and i == n_episodes - 1:
            env.render()

        while not done:
            a = agent.choose_action(s, training_mode=True)
            observation_, r, done, info = env.step(a)
            r = custom_env.update_reward(r, done, info)
            s_ = custom_env.get_state(observation_, s)
            ep_score += r

            agent.store_transition(s, a, r, s_, done)
            agent.learn_wrapper()

            observation, s = observation_, s_

            if visualize and i == n_episodes - 1:
                env.render()

        scores_history.append(ep_score)
        pickle_save(scores_history, 'scores_history_train_total', agent.chkpt_dir)

        current_avg = print_training_progress(i, ep_score, scores_history, ep_start_time=ep_start_time)

        if enable_models_saving and current_avg is not None and \
                (max_avg is None or current_avg >= max_avg):
            max_avg = current_avg
            pickle_save(max_avg, 'max_avg', agent.chkpt_dir)
            save_training_data(agent, i, scores_history)

        if visualize and i == n_episodes - 1:
            env.close()

    print('\n', 'Training Ended ~~~ Episodes: %d ~~~ Runtime: %s' %
          (n_episodes - starting_ep, str(datetime.datetime.now() - train_start_time).split('.')[0]), '\n')

    return scores_history
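
# Illustrative sketch of print_training_progress() - the real helper is defined elsewhere in
# the repo. The 100-episode averaging window, the log format, and always returning a numeric
# average are assumptions, chosen to be consistent with how the return value is compared
# against max_avg in train_agent() above.
import numpy as np


def print_training_progress_sketch(i, ep_score, scores_history, ep_start_time=None, window=100):
    # Running average over the last `window` episodes (scores_history already contains the
    # current episode's score when this is called).
    current_avg = float(np.mean(scores_history[-window:]))
    runtime_str = ''
    if ep_start_time is not None:
        runtime_str = ', runtime: %s' % str(datetime.datetime.now() - ep_start_time).split('.')[0]
    print('episode %d - score: %.2f, average (last %d): %.2f%s'
          % (i + 1, ep_score, window, current_avg, runtime_str))
    return current_avg
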
def perform_q_learning(self, visualize=False, record=False, pickle=False):
    if record:
        self.env = wrappers.Monitor(
            self.env, 'recordings/Q-L/', force=True,
            video_callable=lambda episode_id: episode_id == 0 or episode_id == (self.episodes - 1)
        )

    Q = init_q(self.states, self.action_space_size, self.custom_env.file_name, pickle)

    accumulated_scores = 0

    print('\n', 'Game Started', '\n')

    for i in range(self.episodes):
        done = False
        ep_steps = 0
        ep_score = 0

        observation = self.env.reset()
        s = self.custom_env.get_state(observation)

        if visualize and i == self.episodes - 1:
            self.env.render()

        while not done:
            a = eps_greedy_q(Q, s, self.action_space_size, self.EPS, self.env)

            observation_, reward, done, info = self.env.step(a)
            ep_steps += 1
            ep_score += reward
            accumulated_scores += reward

            s_ = self.custom_env.get_state(observation_)
            a_ = max_action_q(Q, s_, self.action_space_size)
            Q[s, a] += self.ALPHA * (reward + self.GAMMA * Q[s_, a_] - Q[s, a])
            # Q[s, a] += self.ALPHA * (reward + self.GAMMA * np.max(Q[s_, :]) - Q[s, a])  # if Q is a numpy.ndarray

            observation, s = observation_, s_

            if visualize and i == self.episodes - 1:
                self.env.render()

        if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
            print('episode %d - eps: %.2f, score: %d, steps: %d' % (i + 1, self.EPS, ep_score, ep_steps))

        self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec, self.eps_dec_type)

        self.totalSteps[i] = ep_steps
        self.totalScores[i] = ep_score
        self.totalAccumulatedScores[i] = accumulated_scores

        if visualize and i == self.episodes - 1:
            self.env.close()

    print('\n', 'Game Ended', '\n')

    if pickle:
        pickle_save(Q, self.custom_env.file_name + '-q-table')

    return Q, self.totalScores, self.totalAccumulatedScores
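
# Illustrative sketch of the tabular helpers used above - the actual eps_greedy_q(),
# max_action_q() and decrement_eps() live elsewhere in the repo. The tuple-keyed Q[(s, a)]
# table layout and the linear/exponential decay switch are assumptions, consistent with how
# the helpers are called in perform_q_learning().
import numpy as np


def max_action_q_sketch(Q, s, action_space_size):
    # Greedy action: argmax over Q[s, a] across the discrete action space.
    values = np.array([Q[s, a] for a in range(action_space_size)])
    return int(np.argmax(values))


def eps_greedy_q_sketch(Q, s, action_space_size, eps, env):
    # Epsilon-greedy: explore with probability eps (sample from the env's action space),
    # otherwise exploit the current Q-table.
    if np.random.random() < eps:
        return env.action_space.sample()
    return max_action_q_sketch(Q, s, action_space_size)


def decrement_eps_sketch(eps, eps_min, eps_dec, eps_dec_type='linear'):
    # Anneal epsilon towards eps_min, either multiplicatively or linearly.
    if eps_dec_type == 'exponential':
        return max(eps * eps_dec, eps_min)
    return max(eps - eps_dec, eps_min)
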