def plot_results(log_folder, title='Learning Curve', smoothing=True):
    """
    Plot the results.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot

    Adapted from the stable-baselines examples.
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    if smoothing:
        y = movingAverage(y, window=50)
        title += ' Smoothed'
    else:
        title = 'Learning Curve (no smoothing)'
    # Truncate x to match the smoothed y
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title)
    plt.show()
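# Several plot_results variants in this collection call a movingAverage /
# moving_average helper that is not defined in these snippets. A minimal
# sketch of what they assume, following the stable-baselines plotting
# example (a rolling mean that shortens the array by window - 1):
import numpy as np

def movingAverage(values, window):
    """Smooth values via a rolling mean over `window` entries.

    The result is len(values) - window + 1 long, which is why callers
    truncate x with x[len(x) - len(y):] afterwards.
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')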
def _get_random_source_policies():
    opt_source_policies = []
    subopt_source_policies = []
    for source_path in os.listdir(sources_dir):
        if env_id in source_path and source_path[-1] == '1':
            path = sources_dir + source_path
            _, y = ts2xy(load_results(path), 'episodes')
            if np.mean(y[-100:]) > _OPT_THRESH[env_id]:
                opt_source_policies.append('{}/{}.pkl'.format(path, env_id))
            if np.mean(y[-100:]) < _SUBOPT_THRESH[env_id]:
                subopt_source_policies.append('{}/{}.pkl'.format(path, env_id))
    if len(opt_source_policies) < num_opt_sources:
        raise ValueError(
            'Number of optimal source policies ({}) is less than the requested number {}'
            .format(len(opt_source_policies), num_opt_sources))
    if len(subopt_source_policies) < num_subopt_sources:
        raise ValueError(
            'Number of suboptimal source policies ({}) is less than the requested number {}'
            .format(len(subopt_source_policies), num_subopt_sources))
    source_policies = np.random.choice(opt_source_policies, num_opt_sources,
                                       replace=False).tolist()
    source_policies += np.random.choice(subopt_source_policies, num_subopt_sources,
                                        replace=False).tolist()
    return source_policies
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every 10 calls
    if (n_steps + 1) % 10 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model_prof.pkl')
    n_steps += 1
    # Returning False will stop training early
    return True
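# For context, a module-level callback like the one above is wired up by
# wrapping the environment in a Monitor (so load_results() finds a
# monitor.csv) and passing the function to learn(). A minimal sketch
# against the stable-baselines 2 API; the environment id, log_dir, and
# timestep budget are placeholder assumptions:
import os
import gym
import numpy as np
from stable_baselines import DQN
from stable_baselines.bench import Monitor

log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

# Monitor writes monitor.csv into log_dir, which load_results() reads
env = Monitor(gym.make('CartPole-v1'), log_dir, allow_early_resets=True)

n_steps, best_mean_reward = 0, -np.inf  # globals consumed by callback()
model = DQN('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=100000, callback=callback)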
def callback(_locals, _globals):
    """
    Callback called after n steps

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global best_mean_reward, n_episodes, saving_interval
    n_episodes += 1
    if n_episodes % saving_interval == 0:
        x, y = ts2xy(load_results(log_dir), 'episodes')
        if len(x) > 0:
            mean_reward = np.mean(y[-int(saving_interval):])
            logger.info("{}: Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}\n"
                        .format(x[-1], best_mean_reward, mean_reward))
            with open("mean_reward.txt", "a") as text_file:
                print("{}: Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(x[-1], best_mean_reward, mean_reward), file=text_file)
            _locals['self'].save(pickle_dir + 'ppo2_recent_model.pkl')
            if mean_reward >= best_mean_reward:
                best_mean_reward = mean_reward
                logger.debug("Saving new best model")
                _locals['self'].save(pickle_dir + 'ppo2_best_model.pkl')
    return True
def callback(_locals, _globals):
    def shift(arr, num, fill_value=np.nan):
        result = np.empty_like(arr)
        if num > 0:
            result[:num] = fill_value
            result[num:] = arr[:-num]
        elif num < 0:
            result[num:] = fill_value
            result[:num] = arr[-num:]
        else:
            result = arr
        return result

    global best_mean_reward
    # Evaluate policy training performance
    copyfile(
        "/tmp/monitor/{0}/{1}/monitor.csv".format(dir_dict['_hyper_weights_index'], 0),
        "{0}monitor.csv".format(dir_dict['log']))
    x, y = ts2xy(load_results(dir_dict['log']), 'timesteps')
    if len(x) > 0:
        mean_reward = np.mean(y[-100:] - shift(y[-100:], 1, fill_value=0.0))
        print(x[-1], 'timesteps')
        print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
              .format(best_mean_reward, mean_reward))
        # New best model, you could save the agent here
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            # Example for saving best model
            print("Saving new best model")
            _locals['self'].save(dir_dict['model'] + 'best_model.pkl')
    return True
def auto_save_callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    # get callback variables, with default values if uninitialized
    callback_vars = get_callback_vars(_locals["self"], n_steps=0,
                                      best_mean_reward=-np.inf)

    # evaluate every 20 steps
    if callback_vars["n_steps"] % 20 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            # New best model, you could save the agent here
            if mean_reward > callback_vars["best_mean_reward"]:
                callback_vars["best_mean_reward"] = mean_reward
                # Example for saving best model
                print("Saving new best model at {} timesteps".format(x[-1]))
                _locals['self'].save(log_dir + 'best_model')
    callback_vars["n_steps"] += 1
    return True
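# get_callback_vars() is not defined in these snippets; it comes from the
# stable-baselines callback tutorial and stashes per-callback state on the
# model instead of using module globals. A sketch of the assumed helper:
def get_callback_vars(model, **kwargs):
    """Store callback variables on the model, initializing them on first use.

    :param model: (BaseRLModel) the model the callback is attached to
    :param **kwargs: default values for the callback variables
    :return: (dict) mutable dict of callback state
    """
    if not hasattr(model, "_callback_vars"):
        model._callback_vars = dict(**kwargs)
    else:
        # initialize any variables not seen before
        for name, val in kwargs.items():
            if name not in model._callback_vars:
                model._callback_vars[name] = val
    return model._callback_vars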
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward, save_path, log_dir
    # Print stats every args.save_interval calls
    if (n_steps + 1) % args.save_interval == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), "timesteps")
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                model.save(save_path)
    n_steps += 1
    return True
def multi_callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward, log_dir
    # Print stats every 1000 calls
    if (n_steps + 1) % 1000 == 0:
        seed = _locals['seed']
        experiment_dir = log_dir + 'seed_{}/'.format(seed)
        # Evaluate policy performance
        x, y = ts2xy(load_results(experiment_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                print(x[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(experiment_dir + 'best_model_{}_steps.pkl'.format(n_steps))
            print("Saving checkpoint model")
            _locals['self'].save(experiment_dir + 'model_{}_steps.pkl'.format(n_steps))
    n_steps += 1
    return True
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global best_mean_reward, n_steps
    mean_reward = 0
    # Periodic checkpoint every 100000 steps
    if (n_steps + 1) % 100000 == 0:
        print("Saving checkpoint model")
        _locals['self'].save(model_directory + 'sac-model_' + str(n_steps + 1) + '.pkl')
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_directory), 'timesteps')
        if len(x) > 0:
            mean_reward = numpy.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                print("Saving new best model")
                _locals['self'].save(model_directory + 'sac-model_' + str(n_steps + 1) + '.pkl')
    n_steps += 1
    return True
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    nonlocal n_steps, best_mean_reward, hist_rew
    # Print stats every 5 calls
    if (n_steps + 1) % 5 == 0:
        # Evaluate policy performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            hist_rew = y.copy()
            mean_reward = np.mean(y[-100:])
            if (n_steps + 1) % 100 == 0:
                print(x[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + "/deep_{0:.0E}.pkl".format(lr))
    n_steps += 1
    # Note: returning False stops training after this call
    return False
def log_callback(_locals, _globals):
    global n_steps, best_mean_reward, log_dir
    # Print stats every 3000 calls
    if n_steps % 3000 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new model")
                _locals['self'].save(
                    os.path.join(output_dir,
                                 str(n_steps) + '_model_r_' + str(best_mean_reward) + '.pkl'))
    n_steps += 1
    return True
def tsplot_result(log_dirs_dict, num_timesteps, title='Learning Curve'):
    import seaborn as sns
    import pandas as pd

    datas = []
    for key in log_dirs_dict:
        log_dirs = log_dirs_dict[key]
        for index, run_dir in enumerate(log_dirs):
            init_data = load_results(run_dir)
            init_data = init_data[init_data.l.cumsum() <= num_timesteps]
            x, y = ts2xy(init_data, 'timesteps')
            y = movingAverage(y, window=100)
            # Truncate x to match the smoothed y
            x = x[len(x) - len(y):]
            x, y = subsample(t=x, vt=y,
                             bins=np.linspace(0, num_timesteps, int(1000) + 1))
            x = np.append(x, np.array([0]))
            y = np.append(y, np.array([0]))
            data = pd.DataFrame({'Timesteps': x,
                                 'Reward': y,
                                 'subject': np.repeat(index, len(x)),
                                 'Algorithm': np.repeat(key, len(x))})
            datas.append(data)
    data_df = pd.concat(datas, ignore_index=True)
    sns.tsplot(data=data_df, time='Timesteps', value='Reward',
               unit='subject', condition='Algorithm')
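# sns.tsplot was deprecated and later removed from seaborn, so on current
# releases the equivalent of the call above is lineplot, which aggregates
# the per-seed 'subject' runs into a mean line with a confidence band:
import seaborn as sns

sns.lineplot(data=data_df, x='Timesteps', y='Reward', hue='Algorithm')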
def plot_results(log_folder, title='Learning Curve'):
    """
    Plot the results.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    # Alternative x-axes:
    # x, y = ts2xy(load_results(log_folder), 'episodes')
    # x, y = ts2xy(load_results(log_folder), 'walltime_hrs')
    y = moving_average(y, window=10)
    # Truncate x to match the smoothed y
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.show()
def plot_results(log_folder, plot_dir, title='Learning Curve'):
    """
    Plot the results.

    :param log_folder: (str) the save location of the results to plot
    :param plot_dir: (str) the directory to save the plot into
    :param title: (str) the title of the task to plot
    """
    os.makedirs(plot_dir, exist_ok=True)
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = movingAverage(y, window=50)
    # Truncate x to match the smoothed y
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    if not plot_dir.endswith("/"):
        plot_dir += "/"
    plt.savefig(str(plot_dir) + "results_dqn_" + str(env_name) +
                "_trained_timesteps_" + str(train_timesteps))
def plot_results(log_folder, title="Learning Curve"): """ Parameters ---------- log_folder : str the save location of the results to plot title : str the title of the task to plot Returns ------- """ x, y = ts2xy(load_results(log_folder), "timesteps") y = moving_average(y, window=50) # Truncate x x = x[len(x) - len(y):] fig = plt.figure(title) plt.plot(x, y) plt.xlabel("Number of Timesteps") plt.ylabel("Rewards") plt.title(title + " Smoothed") plt.savefig(title + ".png") plt.close()
def callback(_locals, _globals):
    global nupdates
    global best_mean_reward

    nupdates += 1
    if nupdates % period_check == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(y) > 0:
            mean_reward = np.mean(y[-period_check:])
            max_reward = max(y[-period_check:])
            min_reward = min(y[-period_check:])
            update_model = mean_reward > best_mean_reward
            if update_model:
                best_mean_reward = mean_reward
                _locals['self'].save('model')
            print('time: {}, nupdates: {}, max_reward: {:.2f}, min_reward: {:.2f}, '
                  'mean: {:.2f}, best_mean: {:.2f}, model_update: {}'
                  .format(datetime.datetime.now(), nupdates - 1, max_reward,
                          min_reward, mean_reward, best_mean_reward, update_model))
    return True
def plotting_callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    # get callback variables, with default values if uninitialized
    callback_vars = get_callback_vars(_locals["self"], plot=None)

    # get the monitor's data
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if callback_vars["plot"] is None:  # make the plot
        plt.ion()
        fig = plt.figure(figsize=(6, 3))
        ax = fig.add_subplot(111)
        line, = ax.plot(x, y)
        callback_vars["plot"] = (line, ax, fig)
        plt.show()
    else:  # update and rescale the plot
        line, ax, fig = callback_vars["plot"]
        line.set_data(x, y)
        ax.relim()
        ax.set_xlim([_locals["total_timesteps"] * -0.02,
                     _locals["total_timesteps"] * 1.02])
        ax.autoscale_view(True, True, True)
        fig.canvas.draw()
def train_callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_callback, best_mean_reward, agent_name, path_to_models
    # Print stats every 10 calls
    if (n_callback + 1) % 10 == 0:
        # Evaluate policy performance
        x, y = ts2xy(load_results('%s/%s/' % (path_to_models, agent_name)),
                     'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(path_to_models + '/%s/%s.pkl' % (agent_name, agent_name))
    n_callback += 1
    return True
def plotting_callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    # get callback variables, with default values if uninitialized
    callback_vars = get_callback_vars(_locals["self"], plot=None)

    # get the monitor's data
    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if callback_vars["plot"] is None:  # make the plot
        plt.ion()
        fig = plt.figure(figsize=(6, 3))
        ax = fig.add_subplot(111)
        line, = ax.plot(x, y)
        callback_vars["plot"] = (line, ax, fig)
        plt.show()
    else:  # update and rescale the plot
        line, ax, fig = callback_vars["plot"]
        line.set_data(x, y)
        ax.relim()
        ax.set_xlim([_locals["total_timesteps"] * -0.02,
                     _locals["total_timesteps"] * 1.02])
        ax.autoscale_view(True, True, True)
        fig.canvas.draw()
def _on_step(self) -> bool:
    rospy.logdebug("on_step callback")
    if self.n_calls % self.check_freq == 0:
        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) > 0:
            # Mean training reward over the last 100 episodes
            mean_reward = np.mean(y[-100:])
            if self.verbose > 0:
                print("Num timesteps: {}".format(self.num_timesteps))
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(self.best_mean_reward, mean_reward))
                print(f"self.reward_bound is {self.reward_bound} and mean_reward is {mean_reward}")

            # New best model, you could save the agent here
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                # Example for saving best model
                if self.verbose > 0:
                    print("Saving new best model to {}".format(self.save_path))
                self.model.save(self.save_path)

            # Early-stop training once the reward bound is reached
            if self.reward_bound is not None and mean_reward > self.reward_bound:
                print("early stop!")
                return False
    return True
def _on_step(self) -> bool:
    if self.n_calls % self.check_freq == 0:
        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) > 0:
            # Reward of the most recent episode
            mean_reward = np.mean(y[-1:])
            if self.verbose > 0:
                print("Num timesteps: {}".format(self.num_timesteps))
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(self.best_mean_reward, mean_reward))
            # Always keep the most recent model
            self.model.save(self.model_path)
            # New best model, you could save the agent here
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                # Example for saving best model
                if self.verbose > 0:
                    print("Saving new best model to {}".format(self.save_path))
                self.model.save(self.save_path)
    return True
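# The _on_step methods in this collection are fragments of stable-baselines3
# BaseCallback subclasses. A minimal enclosing class in the spirit of the
# SB3 docs example, sketching the constructor state these fragments rely on
# (check_freq, log_dir, save_path, best_mean_reward):
import os
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Save the model whenever the 100-episode mean training reward improves."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True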
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every 500 calls
    if (n_steps + 1) % 500 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(os.path.join(output_dir, 'log')), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
            # Save a checkpoint every 5000 calls
            if (n_steps + 1) % 5000 == 0:
                print("Saving model at iter {}".format(x[-1]))
                _locals['self'].save(os.path.join(output_dir, str(x[-1]) + 'model.pkl'))
    n_steps += 1
    # Returning False will stop training early
    return True
def plot_results(log_folder, model_name, plt_dir, title='Learning Curve'):
    """
    Plot the results.

    :param log_folder: (str) the save location of the results to plot
    :param model_name: (str) name used for the copied monitor file and the saved plot
    :param plt_dir: (str) the directory to save the plot into
    :param title: (str) the title of the task to plot
    """
    m_name_csv = model_name + ".csv"
    old_file_name = os.path.join(log_folder, "monitor.csv")
    new_file_name = os.path.join(log_folder, m_name_csv)
    save_name = os.path.join(plt_dir, model_name)

    x, y = ts2xy(load_results(log_folder), 'timesteps')
    shutil.copy(old_file_name, new_file_name)
    y = moving_average(y, window=50)
    # Truncate x to match the smoothed y
    x = x[len(x) - len(y):]

    fig = plt.figure(title)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    plt.title(title + " Smoothed")
    plt.savefig(save_name + ".png")
    plt.savefig(save_name + ".eps")
    print("plots saved...")
    plt.show()
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_updates
    global best_mean_reward

    # Print stats every 1000 calls
    if (n_updates + 1) % 1000 == 0:
        # Evaluate policy training performance
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            # Compute the mean reward over the last 100 episodes and save the
            # agent whenever it beats the best mean reward so far.
            mean_reward = np.mean(y[-100:])
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_updates += 1
    return True
def plot_results(log_folder, out_folder=None, title='Learning_Curve', save_plot=True):
    """
    Plot the results.

    :param log_folder: (str) the save location of the results to plot
    :param out_folder: (str) the location where the plot is saved (default: log_folder)
    :param title: (str) the title of the task to plot
    :param save_plot: (bool) save the plot as pdf?
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')
    y = moving_average(y, window=50)
    # Truncate x to match the smoothed y
    x = x[len(x) - len(y):]

    matplotlib.rc('font', size=14)
    matplotlib.rc('text', usetex=True)
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Cumulative reward')
    plt.yscale('symlog')
    plt.grid()
    if save_plot:
        if out_folder is None:
            plt.savefig(log_folder + title + ".pdf", dpi=300)
        else:
            plt.savefig(out_folder + title + ".pdf", dpi=300)
def callback(_locals, _globals):
    global nupdates
    global best_mean_reward

    # Every 10 updates
    if (nupdates + 1) % 10 == 0:
        # Retrieve the episode rewards
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(y) > 0:
            # Mean reward over the last 10 episodes
            mean_reward = np.mean(y[-10:])
            # Save the model when the mean reward beats the best so far
            update_model = mean_reward > best_mean_reward
            if update_model:
                best_mean_reward = mean_reward
                _locals['self'].save('airstriker_model')
            # Log
            print('time: {}, nupdates: {}, mean: {:.2f}, best_mean: {:.2f}, model_update: {}'
                  .format(datetime.datetime.now(pytz.timezone('Asia/Tokyo')),
                          nupdates, mean_reward, best_mean_reward, update_model))
    nupdates += 1
    return True
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward, console_log_dir, model_file_name, log_name
    # This is invoked on every update
    if (n_steps + 1) % 1 == 0:
        # Evaluate policy performance
        x, y = ts2xy(load_results(console_log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps', file=sys.stderr)
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward), file=sys.stderr)
            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
    n_steps += 1
    return True
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)

    :param _locals: (dict)
    :param _globals: (dict)
    """
    global n_steps, best_mean_reward
    # Print stats every 10 calls
    if (n_steps + 1) % 10 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 100:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            # New best model, we save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                print("Saving new best model")
                _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
            else:
                print("Saving latest model")
                _locals['self'].save(os.path.join(log_dir, 'latest_model.pkl'))
        else:
            print('{} monitor entries'.format(len(x)))
    n_steps += 1
    # Returning False will stop training early
    return True
def _on_step(self) -> bool:
    if self.n_calls % self.check_freq == 0:
        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) > 0:
            # Mean training reward over the last 100 episodes
            mean_reward = np.mean(y[-100:])
            if self.verbose > 0:
                print("Num timesteps: {}".format(self.num_timesteps))
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(self.best_mean_reward, mean_reward))
            # New best model, you could save the agent here
            if mean_reward > self.best_mean_reward:
                self.best_mean_reward = mean_reward
                self.model_index += 1
                # Example for saving best model
                if self.verbose > 0:
                    print("Saving new best model at {} timesteps".format(x[-1]))
                    print(Fore.YELLOW +
                          "Saving new best model to {}".format(self.save_path + str(self.model_index)) +
                          Style.RESET_ALL)
                self.model.save(self.save_path + str(self.model_index) + ".zip")
    return True
def callback(_locals, _globals):
    """
    Callback called at each step

    Params:
        _locals: (dict)
        _globals: (dict)
    """
    global n_steps, best_mean_reward
    if (n_steps + 1) % 10 == 0:
        print('Saving the model... (every 10 updates)')
        _locals['self'].save('ppo_pong')
    n_steps += 1
    print('n_steps = ', n_steps)

    x, y = ts2xy(load_results(log_dir), 'timesteps')
    if len(x) > 0:
        # Mean reward over the last 10 episodes
        if len(x) % 10 == 0:
            mean_reward = np.mean(y[-10:])
            print("Best mean reward over last 10 episodes: {:.2f} - "
                  "Mean reward over last 10 episodes: {:.2f}"
                  .format(best_mean_reward, mean_reward))
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
            # Timesteps passed
            print(x[-1], 'timesteps')
    # Returning False will stop training early
    return True