def save_model(exp_path, model, model_type, env, basicdate):
    logger.info("Saving model")
    helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)
    video_path = os.path.join(exp_path, 'training.mp4')
    helper.make_video(model, env, video_path)
def evaluate_algorithms(exp_name, base_path, tb_enabled, algorithms,
                        victory_threshold, victory_trials, max_seconds,
                        testing_interval, use_non_image):
    basicdate = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    exp_name = "{}_{}".format(exp_name, basicdate)
    exp_path = os.path.join(base_path, exp_name)
    logger.info("Storing results in {}".format(exp_path))

    writer = None
    if tb_enabled:
        writer = SummaryWriter(exp_path)

    for algo in algorithms:
        tb_log_name = "{}_non_image".format(algo) if use_non_image else algo
        logger.info("Evaluating algorithm: {}; non-image: {}".format(algo, use_non_image))
        if use_non_image:
            env = plark_env_non_image_state.PlarkEnvNonImageState(
                driving_agent='pelican',
                config_file_path='/Components/plark-game/plark_game/game_config/10x10/panther_easy.json')
            policy = "MlpPolicy"  # CnnPolicy doesn't work with a MultiDiscrete observation space
        else:
            env = plark_env.PlarkEnv(
                driving_agent='pelican',
                config_file_path='/Components/plark-game/plark_game/game_config/10x10/panther_easy.json')
            policy = "CnnPolicy"
        model = helper.make_new_model(algo, policy, env)
        helper.train_until(model, env, victory_threshold, victory_trials,
                           max_seconds, testing_interval,
                           tb_writer=writer, tb_log_name=tb_log_name)
        helper.save_model_with_env_settings(exp_path, model, algo, env, basicdate)

    if writer is not None:
        writer.close()
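# evaluate_algorithms above relies on helper.train_until, which is defined
# elsewhere in the repo. The sketch below is a hypothetical reconstruction of
# the loop its parameters imply, not the repo's implementation: train in
# testing_interval chunks, check for victory_threshold wins out of
# victory_trials, and stop on success or when the max_seconds budget runs out.
import time

def train_until_sketch(model, env, victory_threshold, victory_trials,
                       max_seconds, testing_interval,
                       tb_writer=None, tb_log_name=None):
    steps = 0
    deadline = time.time() + max_seconds
    while time.time() < deadline:
        model.learn(testing_interval)
        steps += testing_interval
        victory_count, avg_reward = helper.check_victory(model, env, trials=victory_trials)
        if tb_writer is not None and tb_log_name is not None:
            tb_writer.add_scalar('{}_avg_reward'.format(tb_log_name), avg_reward, steps)
            tb_writer.add_scalar('{}_victory_count'.format(tb_log_name), victory_count, steps)
        if victory_count >= victory_threshold:
            break  # Agent is winning often enough; stop early
    return steps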
def train_agent(exp_path, model, env, testing_interval, max_steps, model_type,
                basicdate, tb_writer, tb_log_name, early_stopping=True,
                previous_steps=0):
    steps = 0
    logger.info("Beginning training for {} steps".format(max_steps))
    model.set_env(env)

    while steps < max_steps:
        logger.info("Training for {} steps".format(testing_interval))
        model.learn(testing_interval)
        steps = steps + testing_interval
        agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)

        if early_stopping:
            victory_count, avg_reward = helper.check_victory(model, env, trials=10)
            if tb_writer is not None and tb_log_name is not None:
                tb_steps = steps + previous_steps
                logger.info("Writing to tensorboard for {} after {} steps".format(tb_log_name, tb_steps))
                tb_writer.add_scalar('{}_avg_reward'.format(tb_log_name), avg_reward, tb_steps)
                tb_writer.add_scalar('{}_victory_count'.format(tb_log_name), victory_count, tb_steps)
            if victory_count > 7:
                logger.info("Stopping training early")
                break  # Stopping training as winning

    # Save agent
    logger.info('steps = ' + str(steps))
    agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)
    agent_filepath = os.path.dirname(agent_filepath)
    return agent_filepath, steps
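# train_agent above calls helper.check_victory, whose implementation is not
# shown in this section. Below is a minimal sketch of a trials-based evaluation
# in that spirit, under two assumptions: the env follows the standard gym step
# API, and it reports the game outcome through the info dict. The key name
# 'result' and value 'WIN' are hypothetical placeholders.
def check_victory_sketch(model, env, trials):
    victory_count = 0
    total_reward = 0.0
    for _ in range(trials):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
        if info.get('result') == 'WIN':  # hypothetical key; see note above
            victory_count += 1
    return victory_count, total_reward / trials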
def compare_envs(exp_name, base_path, tb_enabled, victory_threshold,
                 victory_trials, max_seconds, testing_interval,
                 num_parallel_envs, non_image):
    basicdate = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    exp_name = "{}_{}".format(exp_name, basicdate)
    exp_path = os.path.join(base_path, exp_name)
    logger.info("Storing results in {}".format(exp_path))

    writer = None
    if tb_enabled:
        writer = SummaryWriter(exp_path)

    for parallel in [False, True]:
        algo = "PPO2"
        policy = "MlpPolicy" if non_image else "CnnPolicy"
        tb_log_name = "{}_parallel".format(algo) if parallel else algo
        logger.info("Evaluating {}; parallel: {}".format(algo, parallel))
        if parallel:
            logger.info("Evaluating using {} parallel environments".format(num_parallel_envs))
            env_fn = createNonImageEnv if non_image else createImageEnv
            env = SubprocVecEnv([env_fn for _ in range(num_parallel_envs)])
        else:
            env = createNonImageEnv() if non_image else createImageEnv()
        model = helper.make_new_model(algo, policy, env)
        helper.train_until(model, env, victory_threshold, victory_trials,
                           max_seconds, testing_interval,
                           tb_writer=writer, tb_log_name=tb_log_name)
        helper.save_model_with_env_settings(exp_path, model, algo, env, basicdate)

    if writer is not None:
        writer.close()
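# compare_envs above references createNonImageEnv and createImageEnv, which are
# defined elsewhere. A minimal sketch of what such factories could look like,
# mirroring the constructor calls used in evaluate_algorithms (same driving
# agent and config path as there; the actual factories may differ):
def createNonImageEnv():
    return plark_env_non_image_state.PlarkEnvNonImageState(
        driving_agent='pelican',
        config_file_path='/Components/plark-game/plark_game/game_config/10x10/panther_easy.json')

def createImageEnv():
    return plark_env.PlarkEnv(
        driving_agent='pelican',
        config_file_path='/Components/plark-game/plark_game/game_config/10x10/panther_easy.json')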
def train_agent(self,
                exp_path,  # Path for saving the agent
                model,
                env):      # Can be either a single env or a vec env
    logger.info("Beginning individual training for {} steps".format(self.training_steps))
    model.set_env(env)
    model.learn(self.training_steps)
    logger.info('Finished train agent')
    savepath = self.basicdate + '_pnm_iteration_' + str(self.pnm_iteration)
    agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, self.model_type, env, savepath)
    agent_filepath = os.path.dirname(agent_filepath)
    return agent_filepath
def save():
    logger.info(str(retrain_iter))
    logger.info(str(retrain_values))
    plt.figure(figsize=(9, 3))
    plt.subplot(131)
    plt.bar(retrain_iter, retrain_values)
    plt.subplot(132)
    plt.scatter(retrain_iter, retrain_values)
    plt.subplot(133)
    plt.plot(retrain_iter, retrain_values)
    plt.suptitle('Retraining Progress')
    model_path, model_dir, modellabel = helper.save_model_with_env_settings(basepath, model, modeltype, env, basicdate)
    fig_path = os.path.join(model_dir, 'Training_Progress.png')
    plt.savefig(fig_path)
    print('Model saved to ', model_path)
def run_pnm(self):
    panther_agent_filepath, pelican_agent_filepath = self.initialAgents()

    # Initialize old NE values for the stopping criterion
    value_to_pelican = 0.
    mixture_pelicans = np.array([1.])
    mixture_panthers = np.array([1.])

    # Create DataFrames for plotting purposes
    df_cols = ["NE_Payoff", "Pelican_BR_Payoff", "Panther_BR_Payoff",
               "Pelican_supp_size", "Panther_supp_size"]
    df = pd.DataFrame(columns=df_cols)

    # Second df for periodic rigorous exploitability checks
    exploit_df_cols = ["iter", "NE_Payoff", "Pelican_BR_Payoffs", "Panther_BR_Payoffs"]
    exploit_df = pd.DataFrame(columns=exploit_df_cols)

    # Train best responses until a Nash equilibrium is found or max_pnm_iterations is reached
    logger.info('Parallel Nash Memory (PNM)')
    for self.pnm_iteration in range(self.max_pnm_iterations):
        start = time.time()
        logger.info("*********************************************************")
        logger.info('PNM iteration ' + str(self.pnm_iteration + 1) + ' of ' + str(self.max_pnm_iterations))
        logger.info("*********************************************************")

        self.pelicans.append(pelican_agent_filepath)
        self.panthers.append(panther_agent_filepath)

        if self.pnm_iteration == 0:
            self.compute_initial_payoffs()

        # Compute the payoff matrix and solve the corresponding LPs.
        # Only compute payoffs for the pelican; the panther's matrix is the
        # negative transpose (the game is zero-sum).
        logger.info('Computing payoffs and mixtures')
        self.compute_payoff_matrix(self.pelicans, self.panthers)
        logger.info("=================================================")
        logger.info("New matrix game:")
        logger.info("As numpy array:")
        logger.info('\n' + str(self.payoffs))
        logger.info("As dataframe:")
        tmp_df = pd.DataFrame(self.payoffs).rename_axis('Pelican', axis=0).rename_axis('Panther', axis=1)
        logger.info('\n' + str(tmp_df))

        # Save payoff matrix
        np.save('%s/payoffs_%d.npy' % (self.pnm_logs_exp_path, self.pnm_iteration), self.payoffs)

        def get_support_size(mixture):
            # Return the size of the support of the mixed strategy `mixture`
            return sum([1 if m > 0 else 0 for m in mixture])

        # Check if we have found a stable NE; in that case we are done
        # (we also log the values to the plotting DataFrame here)
        if self.pnm_iteration > 0:
            # Both BR payoffs (against last round's NE), in terms of pelican payoff
            br_value_pelican = np.dot(mixture_pelicans, self.payoffs[-1, :-1])
            br_value_panther = np.dot(mixture_panthers, self.payoffs[:-1, -1])

            ssize_pelican = get_support_size(mixture_pelicans)
            ssize_panther = get_support_size(mixture_panthers)

            logger.info("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
            logger.info("\nPelican BR payoff: %.3f,"
                        "\nValue of Game: %.3f,"
                        "\nPanther BR payoff: %.3f,"
                        "\nPelican Supp Size: %d,"
                        "\nPanther Supp Size: %d\n" % (br_value_pelican,
                                                       value_to_pelican,
                                                       br_value_panther,
                                                       ssize_pelican,
                                                       ssize_panther))
            logger.info("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

            values = dict(zip(df_cols, [value_to_pelican,
                                        br_value_pelican,
                                        br_value_panther,
                                        ssize_pelican,
                                        ssize_panther]))
            df = df.append(values, ignore_index=True)

            # Write to csv file
            df_path = os.path.join(self.exp_path, 'values_iter_%02d.csv' % self.pnm_iteration)
            df.to_csv(df_path, index=False)
            helper.get_fig(df)
            fig_path = os.path.join(self.exp_path, 'values_iter_%02d.pdf' % self.pnm_iteration)
            plt.savefig(fig_path)
            print("==========================================")
            print("WRITTEN VALUES DF TO CSV: %s" % df_path)
            print("==========================================")

            # Here value_to_pelican is from the last time the subgame was solved
            if abs(br_value_pelican - value_to_pelican) < self.stopping_eps and \
               abs(br_value_panther - value_to_pelican) < self.stopping_eps:
                print('Stable Nash Equilibrium found')
                break

        logger.info("SOLVING NEW GAME:")
        # Solve the game for the pelican
        (mixture_pelicans, value_to_pelican) = lp_solve.solve_zero_sum_game(self.payoffs)
        logger.info(mixture_pelicans)
        mixture_pelicans /= np.sum(mixture_pelicans)
        logger.info("After normalisation:")
        logger.info(mixture_pelicans)
        np.save('%s/mixture_pelicans_%d.npy' % (self.pnm_logs_exp_path, self.pnm_iteration), mixture_pelicans)

        # Solve the game for the panther
        (mixture_panthers, value_panthers) = lp_solve.solve_zero_sum_game(-self.payoffs.transpose())
        logger.info(mixture_panthers)
        mixture_panthers /= np.sum(mixture_panthers)
        logger.info("After normalisation:")
        logger.info(mixture_panthers)
        np.save('%s/mixture_panthers_%d.npy' % (self.pnm_logs_exp_path, self.pnm_iteration), mixture_panthers)

        # End of logging the matrix game and its solution
        logger.info("=================================================")

        # Train from scratch or retrain an existing model for the pelican
        logger.info('Training pelican')
        self.pelican_model = self.bootstrap(self.pelicans, self.pelican_env, mixture_pelicans)
        pelican_agent_filepath = self.train_agent_against_mixture('pelican',
                                                                  self.pelicans_tmp_exp_path,
                                                                  self.pelican_model,
                                                                  self.pelican_env,
                                                                  self.panthers,
                                                                  mixture_panthers,
                                                                  self.training_steps)

        # Train from scratch or retrain an existing model for the panther
        logger.info('Training panther')
        self.panther_model = self.bootstrap(self.panthers, self.panther_env, mixture_panthers)
        panther_agent_filepath = self.train_agent_against_mixture('panther',
                                                                  self.panthers_tmp_exp_path,
                                                                  self.panther_model,
                                                                  self.panther_env,
                                                                  self.pelicans,
                                                                  mixture_pelicans,
                                                                  self.training_steps)

        logger.info("PNM iteration lasted: %d seconds" % (time.time() - start))

        if self.pnm_iteration > 0 and self.pnm_iteration % self.testing_interval == 0:
            # Find the best pelican (protagonist) against the panther (opponent) mixture
            candidate_pelican_rbbr_fpaths, candidate_pelican_rbbr_win_percentages = self.iter_train_against_mixture(
                self.exploit_n_rbbrs,       # Number of resource-bounded best responses
                self.pelicans_tmp_exp_path,
                self.pelican_model,         # Agent that we train
                self.pelican_env,           # Can either be a single env or a SubprocVecEnv
                self.pelicans,              # Filepaths to existing models
                mixture_pelicans,           # Mixture for bootstrapping
                self.panthers,              # Policies of the driving agent's opponent
                mixture_panthers)           # Opponent mixture

            logger.info("################################################")
            logger.info('candidate_pelican_rbbr_win_percentages: %s' % np.round(candidate_pelican_rbbr_win_percentages, 2))
            logger.info("################################################")
            br_values_pelican = np.round(candidate_pelican_rbbr_win_percentages, 2).tolist()

            candidate_panther_rbbr_fpaths, candidate_panther_rbbr_win_percentages = self.iter_train_against_mixture(
                self.exploit_n_rbbrs,       # Number of resource-bounded best responses
                self.panthers_tmp_exp_path,
                self.panther_model,         # Agent that we train
                self.panther_env,           # Can either be a single env or a SubprocVecEnv
                self.panthers,              # Filepaths to existing models
                mixture_panthers,           # Mixture for bootstrapping
                self.pelicans,              # Policies of the driving agent's opponent
                mixture_pelicans)           # Opponent mixture

            logger.info("################################################")
            logger.info('candidate_panther_rbbr_win_percentages: %s' % np.round(candidate_panther_rbbr_win_percentages, 2))
            logger.info("################################################")
            br_values_panther = [1 - p for p in np.round(candidate_panther_rbbr_win_percentages, 2)]

            values = dict(zip(exploit_df_cols, [self.pnm_iteration,
                                                value_to_pelican,
                                                br_values_pelican,
                                                br_values_panther]))
            exploit_df = exploit_df.append(values, ignore_index=True)

            # Add medians
            exploit_df['pelican_median'] = exploit_df['Pelican_BR_Payoffs'].apply(np.median)
            exploit_df['panther_median'] = exploit_df['Panther_BR_Payoffs'].apply(np.median)

            # Write to csv file
            df_path = os.path.join(self.exp_path, 'exploit_iter_%02d.csv' % self.pnm_iteration)
            tmp_df = exploit_df.set_index('iter')
            tmp_df.to_csv(df_path, index=True)
            helper.get_fig_with_exploit(df, tmp_df)
            fig_path = os.path.join(self.exp_path, 'values_with_exploit_iter_%02d.pdf' % self.pnm_iteration)
            plt.savefig(fig_path)
            print("==========================================")
            print("WRITTEN EXPLOIT DF TO CSV: %s" % df_path)
            print("==========================================")

        if self.video_flag:
            # Occasionally output useful things along the way: make videos
            verbose = False
            video_path = os.path.join(self.exp_path, 'pelican_pnm_iter_%02d.mp4' % self.pnm_iteration)
            basewidth, hsize = helper.make_video_VEC_ENV(self.pelican_model,
                                                         self.pelican_env,
                                                         video_path,
                                                         fps=self.fps,
                                                         basewidth=self.basewidth,
                                                         n_steps=self.video_steps,
                                                         verbose=verbose)
            video_path = os.path.join(self.exp_path, 'panther_pnm_iter_%02d.mp4' % self.pnm_iteration)
            basewidth, hsize = helper.make_video_VEC_ENV(self.panther_model,
                                                         self.panther_env,
                                                         video_path,
                                                         fps=self.fps,
                                                         basewidth=self.basewidth,
                                                         n_steps=self.video_steps,
                                                         verbose=verbose)

    # Save the final mixture and the corresponding agents
    logger.info("################################################")
    logger.info("Saving final pelican mixtures and agents:")
    support_pelicans = np.nonzero(mixture_pelicans)[0]
    mixture_pelicans = mixture_pelicans[support_pelicans]
    np.save(self.exp_path + '/final_mixture_pelicans.npy', mixture_pelicans)
    logger.info("Final pelican mixture saved to: %s" % (self.exp_path + '/final_mixture_pelicans.npy'))
    # Only save the agents in the support of the final mixture
    for i, idx in enumerate(support_pelicans):
        self.pelican_model = helper.loadAgent(glob.glob(self.pelicans[idx] + "/*.zip")[0], self.model_type)
        agent_filepath, _, _ = helper.save_model_with_env_settings(self.pelicans_tmp_exp_path,
                                                                   self.pelican_model,
                                                                   self.model_type,
                                                                   self.pelican_env,
                                                                   self.basicdate + "_ps_" + str(i))
        logger.info("Saving pelican %d to %s" % (i, agent_filepath))

    support_panthers = np.nonzero(mixture_panthers)[0]
    mixture_panthers = mixture_panthers[support_panthers]
    np.save(self.exp_path + '/final_mixture_panthers.npy', mixture_panthers)
    logger.info("Final panther mixture saved to: %s" % (self.exp_path + '/final_mixture_panthers.npy'))
    for i, idx in enumerate(support_panthers):
        self.panther_model = helper.loadAgent(glob.glob(self.panthers[idx] + "/*.zip")[0], self.model_type)
        agent_filepath, _, _ = helper.save_model_with_env_settings(self.panthers_tmp_exp_path,
                                                                   self.panther_model,
                                                                   self.model_type,
                                                                   self.panther_env,
                                                                   self.basicdate + "_ps_" + str(i))
        logger.info("Saving panther %d to %s" % (i, agent_filepath))
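# run_pnm above solves each matrix game with lp_solve.solve_zero_sum_game,
# whose implementation is not shown in this section. Below is a minimal sketch
# of an equivalent solver built on scipy.optimize.linprog (an assumption: the
# repo's lp_solve module may use a different LP backend). The row player picks
# a mixture x that maximises the game value v subject to x earning at least v
# against every opponent column.
import numpy as np
from scipy.optimize import linprog

def solve_zero_sum_game_sketch(payoffs):
    m, n = payoffs.shape
    # Variables: m mixture weights followed by the game value v
    c = np.zeros(m + 1)
    c[-1] = -1.0  # linprog minimises, so minimise -v to maximise v
    # One constraint per opponent column j: v - sum_i x_i * payoffs[i, j] <= 0
    A_ub = np.hstack([-payoffs.T, np.ones((n, 1))])
    b_ub = np.zeros(n)
    # Mixture weights must sum to one
    A_eq = np.zeros((1, m + 1))
    A_eq[0, :m] = 1.0
    b_eq = np.array([1.0])
    bounds = [(0, None)] * m + [(None, None)]  # x_i >= 0, v unbounded
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return res.x[:m], res.x[-1]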
def train_agent_against_mixture(self,
                                driving_agent,           # Agent that we train
                                exp_path,
                                model,
                                env,                     # Can either be a single env or a SubprocVecEnv
                                opponent_policy_fpaths,  # Policies of the driving agent's opponent
                                opponent_mixture,        # Mixture of the driving agent's opponent
                                training_steps,
                                filepath_addon=''):
    ################################################################
    # Heuristic to compute the number of opponents to sample from the mixture
    ################################################################
    # Smallest positive probability in the mixture
    min_prob = min([pr for pr in opponent_mixture if pr > 0])
    target_n_opponents = self.num_parallel_envs * int(1.0 / min_prob)
    n_opponents = min(target_n_opponents, self.max_n_opponents_to_sample)

    if self.parallel:
        # Ensure that n_opponents is a multiple of num_parallel_envs
        n_opponents = self.num_parallel_envs * round(n_opponents / self.num_parallel_envs)

    logger.info("=============================================")
    logger.info("Sampling %d opponents" % n_opponents)
    logger.info("=============================================")

    # Sample n_opponents opponents from the mixture
    opponents = np.random.choice(opponent_policy_fpaths, size=n_opponents, p=opponent_mixture)
    logger.info("=============================================")
    logger.info("Opponents has %d elements" % len(opponents))
    logger.info("=============================================")

    # If we use parallel envs, we run all the training against different sampled opponents in parallel
    if self.parallel:
        # Method to load new opponents via filepath
        setter = 'set_panther_using_path' if driving_agent == 'pelican' else 'set_pelican_using_path'
        for i, opponent in enumerate(opponents):
            # Stick this in the right slot, looping back after self.num_parallel_envs
            env.env_method(setter, opponent, indices=[i % self.num_parallel_envs])
            # Once all self.num_parallel_envs slots are filled, train
            if i > 0 and (i + 1) % self.num_parallel_envs == 0:
                logger.info("Beginning parallel training for {} steps".format(training_steps))
                model.set_env(env)
                model.learn(training_steps)
    # Otherwise we sample different opponents and train against each of them sequentially
    else:
        for opponent in opponents:
            if driving_agent == 'pelican':
                env.set_panther_using_path(opponent)
            else:
                env.set_pelican_using_path(opponent)
            logger.info("Beginning sequential training for {} steps".format(training_steps))
            model.set_env(env)
            model.learn(training_steps)

    # Save agent
    logger.info('Finished train agent')
    savepath = self.basicdate + '_pnm_iteration_' + str(self.pnm_iteration) + filepath_addon
    agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, self.model_type, env, savepath)
    agent_filepath = os.path.dirname(agent_filepath)
    return agent_filepath
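# Worked example of the opponent-sampling heuristic above (hypothetical numbers):
#   opponent_mixture = [0.5, 0.25, 0.25], num_parallel_envs = 4
#   min_prob = 0.25  ->  target_n_opponents = 4 * int(1 / 0.25) = 16
# Sampling 16 opponents yields expected counts of 8, 4 and 4, so even the
# opponent with the smallest positive weight is expected to fill roughly
# num_parallel_envs slots; the cap max_n_opponents_to_sample bounds the cost
# when the smallest weight is tiny.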
model = DQN('CnnPolicy', env)
model.learn(50)

logger.info('STARTING STAGE 1 INITIAL EVALUATION')
stg1_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1,
                                            deterministic=False, render=False,
                                            callback=None, reward_threshold=None,
                                            return_episode_rewards=False)
logger.info('FINISHED STAGE 1 INITIAL EVALUATION')

stage1result = retrain(stg1_mean_reward, stage_one_threshold, 0, env, model)
if stage1result == True:
    logger.info("Stage One Threshold Met")
    logger.info("Stage 2 Training Started")
    env = plark_env_guided_reward.PlarkEnvGuidedReward(config_file_path=easy_config)
    model.set_env(env)
    model.learn(50)

    logger.info('STARTING STAGE 2 INITIAL EVALUATION')
    stg2_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1,
                                                deterministic=False, render=False,
                                                callback=None, reward_threshold=None,
                                                return_episode_rewards=False)
    logger.info('FINISHED STAGE 2 INITIAL EVALUATION')

    stage2result = retrain(stg2_mean_reward, stage_two_threshold, 0, env, model)
    if stage2result == True:
        logger.info("Stage Two Threshold Met")
        logger.info("Stage 3 Training Started")
        env = plark_env_guided_reward.PlarkEnvGuidedReward(config_file_path=medium_config)
        model.set_env(env)
        model.learn(50)

        logger.info('STARTING STAGE 3 EVALUATION')
        stg3_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1,
                                                    deterministic=False, render=False,
                                                    callback=None, reward_threshold=None,
                                                    return_episode_rewards=False)
        logger.info('FINISHED STAGE 3 EVALUATION')

        stage3result = retrain(stg3_mean_reward, stage_three_threshold, 0, env, model)
        if stage3result == True:
            logger.info("Stage Three Threshold Met")
            logger.info("Multi-Stage-Training-Complete")
            model_path, model_dir, modellabel = helper.save_model_with_env_settings(basepath, model, modeltype, env, basicdate)
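# The staged script above calls retrain(mean_reward, threshold, attempt, env, model),
# which is defined elsewhere in the repo. Below is a minimal sketch of the
# behaviour the call sites imply (a hypothetical reconstruction, not the repo's
# implementation): keep training in short bursts until the mean reward clears
# the stage threshold or a retry budget runs out. The max_attempts parameter is
# an assumption; the 50-step burst mirrors the script's own learn(50) calls,
# and evaluate_policy is the same stable-baselines helper used above.
def retrain_sketch(mean_reward, threshold, attempt, env, model, max_attempts=10):
    if mean_reward >= threshold:
        return True   # Threshold met; move on to the next stage
    if attempt >= max_attempts:
        return False  # Give up once the retry budget is exhausted
    model.learn(50)
    new_mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=1,
                                         deterministic=False)
    return retrain_sketch(new_mean_reward, threshold, attempt + 1, env, model, max_attempts)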