class Training:
    """Train a DQN agent on the dragging environment and plot/persist progress.

    Reads hyper-parameters from the ROS parameter server, builds the
    environment, network, and replay memory, keeps two live matplotlib
    figures up to date (rewards/epsilon and error distance), and saves
    checkpoints (model, replay memory, CSV log, figures) under the
    package's data directory.
    """

    def __init__(self):
        # Rolling histories consumed by plot_result()
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []
        self.logging_data = []  # one CSV row string per finished episode

        # Parameters (ROS parameter server)
        self.n_episodes = rospy.get_param("/n_episodes")
        self.n_step = rospy.get_param("/n_steps")
        self.mode_action = rospy.get_param('/mode_action')
        self.mem_size = rospy.get_param('/mem_size')
        self.batch_size = rospy.get_param('/batch_size')
        self.mode_optimize = rospy.get_param('/mode_optimize')
        self.avg_err_fre = rospy.get_param('/avg_err_fre')
        self.save_fre = rospy.get_param("/save_fre")
        self.load_checkpoint = rospy.get_param("/load_checkpoint")

        # create environment
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # create Deep Q-Network and its experience replay buffer
        self.dqn = DQN(self.n_states, self.n_actions)
        self.memory = ExperienceReplay(self.mem_size)

        # plot colors; a random matplotlib style is picked per run and
        # later recorded into the params yaml by init_file()
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'

        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # Figure 1 - Rewards (bars) with epsilon decay (line, twin y-axis)
        self.fig1 = plt.figure(1)
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()
        title_1 = 'Rewards - (Mode: Training)'
        self.ax1.set_title(title_1)
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # Figure 2 - Error distance per episode
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)
        title_2 = 'Error Distance - (Mode: Training)'
        self.ax3.set_title(title_2)
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')

        self.init_file()

    def moving_average(self, x, w):
        """Sliding-window mean of 1-D array ``x`` over window length ``w``.

        Uses 'valid' convolution, so the result has ``len(x) - w + 1`` points.
        """
        return np.convolve(x, np.ones(w), 'valid') / w

    def init_file(self):
        """Create/locate the per-run data folder and set all output file paths.

        Run folders are numbered "<user>-<n>"; when resuming from a
        checkpoint the most recent existing folder is reused instead of
        creating a new one.
        """
        import shutil  # local import: only needed for the one-time config copy

        rospack = rospkg.RosPack()
        data_path = rospack.get_path("pioneer_dragging") + "/data"
        username = getpass.getuser()

        # count existing "<user>*" folders to pick the next run number
        n_folder = glob("{}/{}*".format(data_path, username))
        n_folder = len(n_folder) + 1
        if self.load_checkpoint:
            n_folder -= 1  # resume: reuse the latest folder

        self.data_path = "{}/{}-{}".format(data_path, username, n_folder)
        if not os.path.exists(self.data_path):
            os.mkdir(self.data_path)

        # config file: snapshot the run parameters and record the plot style
        if not self.load_checkpoint:
            config_path = rospack.get_path(
                "pioneer_dragging") + "/config/dragging_params.yaml"
            config_log = '{}/{}-params.yaml'.format(self.data_path, n_folder)
            # shutil.copy instead of os.system('cp ...'): no shell involved,
            # portable, and safe for paths containing spaces
            shutil.copy(config_path, config_log)

            plot_style = {'plot_style': self.style_plot}
            with open(config_log, 'r') as yamlfile:
                cur_yaml = yaml.safe_load(yamlfile)  # Note the safe_load
                cur_yaml.update(plot_style)
            if cur_yaml:
                with open(config_log, 'w') as yamlfile:
                    yaml.safe_dump(cur_yaml, yamlfile)  # Also note the safe_dump

        # history file
        self.history_log = '{}/{}-log.txt'.format(self.data_path, n_folder)
        # model file
        self.dqn.file_models = '{}/{}-pytorch-RL.tar'.format(
            self.data_path, n_folder)
        # memory file
        self.memory.file_mem = '{}/{}-memory.data'.format(
            self.data_path, n_folder)
        # figures file
        self.figure1 = '{}/{}-Rewards(Training).png'.format(
            self.data_path, n_folder)
        self.figure2 = '{}/{}-Error(Training).png'.format(
            self.data_path, n_folder)

    def plot_result(self,
                    i_episode,
                    cumulated_reward,
                    epsilon,
                    error_dist,
                    loaded=False):
        """Update both live figures with one episode (or a restored history).

        With ``loaded=True`` the arguments are full history sequences
        (pandas Series, as read back by run()) and they replace the rolling
        lists; otherwise they are scalars appended for a single episode.
        """
        ### Figure 1
        # plot bar (cumulated reward)
        self.ax1.bar(i_episode, cumulated_reward, color=self.color1)

        # plot line (epsilon decay)
        if loaded:
            self.ax2.plot(i_episode, epsilon, color=self.color2)
            self.n_episode = i_episode.tolist()
            self.n_epsilon = epsilon.tolist()
            self.n_dist = error_dist.tolist()
        else:
            self.n_episode.append(i_episode)
            self.n_epsilon.append(epsilon)
            self.ax2.plot(self.n_episode, self.n_epsilon, color=self.color2)
            self.n_dist.append(error_dist)

        ### Figure 2
        # plot bar (error distance)
        self.ax3.bar(i_episode, error_dist, color=self.color3)

        # plot line (average error distance) every avg_err_fre episodes
        if len(self.n_dist) % self.avg_err_fre == 0:
            avg_err = self.moving_average(np.array(self.n_dist),
                                          self.avg_err_fre)
            self.ax3.plot(avg_err, color=self.color4)

        plt.draw()
        plt.pause(0.1)

    def run(self):
        """Main training loop: optionally resume a checkpoint, then iterate episodes."""
        start_time = time.time()

        if self.load_checkpoint:
            self.memory.load()
            self.dqn.load_model()

            # history log loaded — use a context manager so the file handle
            # is closed (the original open() in a comprehension leaked it)
            with open(self.history_log) as log_file:
                self.logging_data = [line.rstrip('\n') for line in log_file]

            hist_data = pd.read_csv(self.history_log, sep=",")
            i_episode = hist_data['i_episode']
            cumulated_reward = hist_data['cumulated_reward']
            epsilon = hist_data['epsilon']
            error_dist = hist_data['error_dist']
            self.plot_result(i_episode,
                             cumulated_reward,
                             epsilon,
                             error_dist,
                             loaded=True)

            # resume counters from the last logged episode
            i_episode = hist_data['i_episode'].iloc[-1] + 1
            self.dqn.epsilon = hist_data['epsilon'].iloc[-1]
            rospy.loginfo('[RL] Loaded checkpoint')
        else:
            i_episode = 0

        #########################################
        ###### Reinforcement Training loop ######
        for i_episode in range(i_episode, self.n_episodes):
            state = self.env.reset(i_episode)
            cumulated_reward = 0
            steps = 0
            step_time = time.time()

            while not rospy.is_shutdown():
                steps += 1
                action, epsilon = self.dqn.select_action(state, i_episode)
                rospy.loginfo('[RL] action: {}'.format(action))

                next_state, reward, done, info = self.env.step(action)
                self.memory.push(state, action, next_state, reward, done)
                cumulated_reward += reward

                ################################
                ######### optimize #############
                if self.mode_optimize == 'normal_dqn':
                    # without experience replay memory
                    self.dqn.optimize(state, action, next_state, reward, done)

                elif self.mode_optimize == 'dqn_replay_memory':
                    # with experience replay memory
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_replay_memory(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                # NOTE(review): 'taget' looks like a typo but must match the
                # /mode_optimize config value — confirm before renaming
                elif self.mode_optimize == 'dqn_taget_net':
                    # with experience target net
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_DQN(state_mem, action_mem,
                                                   next_state_mem, reward_mem,
                                                   done_mem)

                elif self.mode_optimize == 'dueling_dqn':
                    # with double DQN
                    if len(self.memory) > self.batch_size:
                        state_mem, action_mem, next_state_mem, reward_mem, done_mem = self.memory.sample(
                            self.batch_size)
                        self.dqn.optimize_with_dueling_DQN(
                            state_mem, action_mem, next_state_mem, reward_mem,
                            done_mem)

                if not done:
                    state = next_state
                else:
                    break

            # DQN update param
            self.dqn.update_param(i_episode)

            # Plotting
            error_dist = self.env.calc_dist()
            self.plot_result(i_episode, cumulated_reward, epsilon, error_dist)

            # Save Checkpoint
            temp_data = "{},{},{},{}".format(i_episode, cumulated_reward,
                                             epsilon, error_dist)
            self.logging_data.append(temp_data)

            if i_episode % self.save_fre == 0:
                rospy.loginfo('[RL] Save checkpoint: {}'.format(i_episode))
                self.dqn.save_model()  # save models
                self.memory.save()  # save replay memory

                # logging file (rewritten in full each checkpoint; on resume
                # the header is already the first element of logging_data)
                with open(self.history_log, 'w') as f:
                    if not self.load_checkpoint:
                        f.write(
                            "i_episode,cumulated_reward,epsilon,error_dist\n")
                    for item in self.logging_data:
                        f.write("%s\n" % item)

                # save figures
                self.fig1.savefig(self.figure1, dpi=self.fig1.dpi)
                self.fig2.savefig(self.figure2, dpi=self.fig2.dpi)
                rospy.loginfo('[RL] Save figure1: {}'.format(self.figure1))
                rospy.loginfo('[RL] Save figure2: {}'.format(self.figure2))

            # Timing
            elapsed_time = time.time() - step_time
            total_time = time.time() - start_time
            print('\n********')
            print("Elapsed time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
            print("Total time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(total_time))))

        # Finish Training
        self.env.close()
        print()
        rospy.loginfo('[RL] Exit ...')
        total_time = time.time() - start_time
        print('\n*********************')
        print("Total time: ",
              time.strftime("%H:%M:%S", time.gmtime(total_time)))
        rospy.loginfo('[RL] Style plot: {}'.format(self.style_plot))
        plt.show(block=True)
pylab.savefig("./save/pendulum_a2c.png") print() print( "*** episode:{0} is Done!!! || score:{1} || step:{2} || info:{3} ***" .format(episode, score, step, info)) steps.append(step) print() #env.episode_reset() if episode % 50 == 0: print("average of recent 50 steps :", str(np.mean(steps[-50:]))) print() #if -10 <= score: #agent.actor.save_weights("./save/pendulum_actor.h5") #agent.critic.save_weights("./save/pendulum_critic.h5") #max_score = score # 이전 10개 에피소드의 점수 평균이 -30보다 크면 학습 중단 #if np.mean(scores[-min(10, len(scores)):]) > -30: # agent.actor.save_weights("./save/pendulum_actor.h5") # agent.critic.save_weights("./save/pendulum_critic.h5") # env.close() # sys.exit() env.close() sys.exit() except KeyboardInterrupt as e: env.close() sys.exit()
class Testing:
    """Replay a trained DQN policy on the dragging environment.

    Restores network weights from a previously recorded training run
    (selected via the /username and /n_folder ROS parameters) and plots
    the per-episode cumulated reward and error distance while the greedy
    policy runs.
    """

    def __init__(self):
        # per-episode bookkeeping used by the plots
        self.n_episode = []
        self.n_epsilon = []
        self.n_dist = []
        self.avg_err = []

        # parameters from the ROS parameter server
        self.n_episodes = rospy.get_param("/n_episodes")
        self.avg_err_fre = rospy.get_param('/avg_err_fre')

        # environment plus its state/action dimensions
        self.env = Env()
        self.n_states = self.env.observation_space
        self.n_actions = self.env.action_space.n

        # network whose weights are restored right below
        self.dqn = DQN(self.n_states, self.n_actions)

        # resolve the checkpoint path of the chosen training run
        ros_pack = rospkg.RosPack()
        base_dir = ros_pack.get_path("pioneer_dragging") + "/data"
        user = rospy.get_param("/username")
        folder_id = rospy.get_param("/n_folder")
        self.dqn.file_models = "{0}/{1}-{2}/{2}-pytorch-RL.tar".format(
            base_dir, user, folder_id)
        self.dqn.load_model()

        # plotting setup: fixed colors, randomly chosen matplotlib style
        self.color1 = 'tab:green'
        self.color2 = 'tab:blue'
        self.color3 = 'tab:orange'
        self.color4 = 'tab:red'
        self.style_plot = random.choice(plt.style.available)
        plt.style.use(self.style_plot)
        plt.ion()

        ###########
        # figure 1: reward bars, with an epsilon axis on the right
        self.fig1 = plt.figure(1)
        self.ax1 = self.fig1.add_subplot(1, 1, 1)
        self.ax2 = self.ax1.twinx()
        self.ax1.set_title('Rewards - (Mode: Testing)')
        self.ax1.set_xlabel('Episode')
        self.ax1.set_ylabel('Reward', color=self.color1)
        self.ax2.set_ylabel('Epsilon', color=self.color2)
        self.ax1.tick_params(axis='y', labelcolor=self.color1)
        self.ax2.tick_params(axis='y', labelcolor=self.color2)

        ###########
        # figure 2: error-distance bars
        self.fig2 = plt.figure(2)
        self.ax3 = self.fig2.add_subplot(1, 1, 1)
        self.ax3.set_title('Error Distance - (Mode: Testing)')
        self.ax3.set_xlabel('Episode')
        self.ax3.set_ylabel('Meter')

    def moving_average(self, x, w):
        """Sliding-window mean of ``x`` over window length ``w`` ('valid' mode)."""
        return np.convolve(x, np.ones(w), 'valid') / w

    def plot_result(self, i_episode, cumulated_reward, error_dist):
        """Append one episode's reward and error distance to the live figures."""
        # figure 1: reward bar for this episode
        self.ax1.bar(i_episode, cumulated_reward, color=self.color1)

        # figure 2: error-distance bar, plus a moving-average line
        # refreshed every avg_err_fre episodes
        self.ax3.bar(i_episode, error_dist, color=self.color3)
        self.n_dist.append(error_dist)
        if len(self.n_dist) % self.avg_err_fre == 0:
            smoothed = self.moving_average(np.array(self.n_dist),
                                           self.avg_err_fre)
            self.ax3.plot(smoothed, color=self.color4)

        plt.draw()
        plt.pause(0.1)

    def run(self):
        """Run the greedy policy for n_episodes, plotting and timing each one."""
        start_time = time.time()

        for i_episode in range(self.n_episodes):
            state = self.env.reset(i_episode)
            cumulated_reward = 0
            episode_start = time.time()

            # step the greedy policy until the episode ends or ROS shuts down
            while not rospy.is_shutdown():
                action = self.dqn.test_action(state)
                rospy.loginfo('[RL] action: {}'.format(action))

                next_state, reward, done, _ = self.env.step(action)
                cumulated_reward += reward
                if done:
                    break
                state = next_state

            ################
            # plotting
            error_dist = self.env.calc_dist()
            self.plot_result(i_episode, cumulated_reward, error_dist)

            ################
            # timing
            elapsed_time = time.time() - episode_start
            total_time = time.time() - start_time
            print('\n********')
            print("Elapsed time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
            print("Total time: {}".format(
                time.strftime("%H:%M:%S", time.gmtime(total_time))))

        ################
        # finish testing
        self.env.close()
        print()
        rospy.loginfo('[RL] Exit ...')
        total_time = time.time() - start_time
        print('\n*********************')
        print("Total time: ",
              time.strftime("%H:%M:%S", time.gmtime(total_time)))
        rospy.loginfo('[RL] Style plot: {}'.format(self.style_plot))
        plt.show(block=True)