# Evaluate over a fixed number of test episodes
test_scores = []
for _ in range(episodes_to_watch):
    game.new_episode()
    while not game.is_episode_finished():
        model.step(training=False)

    # Sleep between episodes
    score = game.get_total_reward()
    test_scores.append(score)

test_scores = np.array(test_scores)
print("%d test episodes played." % episodes_to_watch)
print("Results: mean score: %.1f +/- %.1f," % (test_scores.mean(), test_scores.std()),
      "min: %.1f," % test_scores.min(),
      "max: %.1f," % test_scores.max())

# Save the model whenever the mean test score improves
if test_scores.mean() > best_mean:
    best_mean = test_scores.mean()
    model.save()

# Saving results
with open(log_name + "_loss.txt", "a+") as file:
    file.write(str(losses.mean()) + '\n')
with open(log_name + "_train_score.txt", "a+") as file:
    file.write(str(train_scores.mean()) + '\n')
with open(log_name + "_test_score.txt", "a+") as file:
    file.write(str(test_scores.mean()) + '\n')

game.close()
print("======================================")
print("Training finished.")
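# Illustrative only: the helper below is not part of the original script. It sketches how the
# means appended to the "<log_name>_*.txt" files above could be read back and plotted after a
# run, assuming matplotlib is installed.
def plot_training_logs(log_name):
    import matplotlib.pyplot as plt

    for suffix in ("_loss", "_train_score", "_test_score"):
        with open(log_name + suffix + ".txt") as file:
            values = [float(line) for line in file if line.strip()]
        plt.plot(values, label=suffix.lstrip("_"))
    plt.xlabel("epoch")
    plt.legend()
    plt.show()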
def run_full_experiment(config):
    # Archive old experience and clear the experience table
    db.archive_exp(db.get_all_exp())
    db.delete_all_exp()

    util.setup_file_logger(name=config.run_id, filename=config.run_id)
    logger = logging.getLogger(config.run_id)
    start_time = time.time()

    # Define players
    model_1 = DQN(run_id=config.run_id, **config.DQN_params)
    model_2 = model_1.copy()
    epsilon = Epsilon(epsilon_func=config.epsilon_func,
                      max_epsilon=config.max_epsilon,
                      min_epsilon=config.min_epsilon,
                      eval_epsilon=config.eval_epsilon,
                      num_cycles=config.num_cycles,
                      decrement=config.epsilon_decrement)
    player_list = [
        Agent(name=config.bot_1_name, model=model_1, epsilon=epsilon),
        Agent(name=config.bot_2_name, model=model_2, epsilon=epsilon)
    ]
    winner_list = []
    previous_experience_id = 0
    util.save_config(config=config, path=config.run_id)

    logger.info('Beginning run titled: ' + config.run_id)
    logger.info(cs.DIVIDER)

    # For each cycle
    for i in range(1, config.num_cycles + 1):
        # For each episode, play through the episode and insert each state/action pair into the database
        logger.info('Beginning cycle: ' + str(i) + ' / ' + str(config.num_cycles) +
                    '\tCumulative Time Elapsed: ' + util.get_pretty_time(time.time() - start_time))
        logger.info(f'Current Epsilon: {epsilon.get_epsilon(current_cycle=i):.3f}')
        cycle_start_time = time.time()

        # Async parallelization (disabled): cpu_count() - 1 workers would keep the machine
        # responsive, but the per-worker memory copying appears to outweigh the speedup.
        # with mp.Pool(mp.cpu_count() - 1) as pool:
        #     game_output = pool.starmap_async(
        #         parallel.play_game,
        #         [(config.game, player_list, config.run_id, i)
        #          for j in range(config.episodes_per_cycle)]).get()

        # Serial data collection
        winner_list += pu.play_games(num_games=config.episodes_per_cycle, name=config.game,
                                     players=player_list, run_id=config.run_id,
                                     current_cycle=i, config=config)
        logger.info('Data collection complete.\tTotal Episode Time: ' +
                    util.get_pretty_time(time.time() - cycle_start_time))
        logger.info('Loading experience and training model...')
        training_start_time = time.time()

        # Import data from the database based on the experience replay buffer and train the model
        pu.train_model(model=model_1, config=config)
        logger.info('Model training complete.\tTotal Training Time: ' +
                    util.get_pretty_time(time.time() - training_start_time))

        # Update model_2
        if i % config.player_2_update_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Storing history and setting model 2 equal to model 1...')
            player_list[0].model.policy_net.store_history()
            player_list[1].set_model(model=model_1.copy())

        # Benchmark
        if i % config.benchmark_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Benchmarking...')

            # Player 1's win rate against player 2 since the last benchmark
            benchmark_cycle_win_rate = 1 - sum(winner_list) / len(winner_list)
            winner_list = []  # Reset winner list

            # Play against a random bot and measure the win rate
            random_win_rate = benchmark.benchmark_test(
                primary_model=model_1, benchmark_model=RandomBot(),
                benchmark_bot_name=config.random_bot_name,
                num_games=config.random_bot_cycles,
                run_id=config.run_id if config.log_random_benchmark else None)
            logger.info(f'Winrate vs. Random Bot: {random_win_rate * 100:.1f}%')

            # Play against the expert policy bot and measure the win rate
            # expert_policy_win_rate = benchmark.benchmark_test(
            #     primary_model=model_1, benchmark_model=ExpertPolicy(),
            #     benchmark_bot_name=config.expert_policy_bot_name,
            #     num_games=config.random_bot_cycles,
            #     run_id=config.run_id if config.log_expert_policy_benchmark else None)
            # logger.info(f'Winrate vs. Expert Policy: {expert_policy_win_rate * 100:.1f}%')

            # Collect the average reward from the database
            average_reward = benchmark.get_average_reward(
                run_id=config.run_id,
                previous_experience_id=previous_experience_id,
                agent_id=config.bot_1_name,
                opponent_id=config.bot_2_name)
            db.insert_metrics(run_id=config.run_id,
                              win_rate=benchmark_cycle_win_rate,
                              win_rate_random=random_win_rate,
                              win_rate_expert_policy=0.0,
                              average_reward=average_reward)
            previous_experience_id = db.get_max_id(config.run_id)

        # Checkpoint
        if config.checkpoint_freq is not None and i % config.checkpoint_freq == 0:
            logger.info(cs.DIVIDER)
            logger.info('Model checkpoint reached. Saving checkpoint...')
            model_1.save(folder=os.path.join(config.checkpoint_folder, config.run_id),
                         title=util.get_checkpoint_model_name(cycle=i))

        logger.info('Cycle complete.\tTotal Cycle Time: ' +
                    util.get_pretty_time(time.time() - cycle_start_time))
        logger.info(cs.DIVIDER)

    logger.info('Training complete.\tTotal Run Time: ' +
                util.get_pretty_time(time.time() - start_time) +
                '\tSaving model and exiting...')
    model_1.save(title=config.run_id)
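# Illustrative usage only: this entry point is not part of the original module. It assumes a
# config object exposing the fields read above (run_id, DQN_params, num_cycles, etc.) is built
# elsewhere; `load_config` is a hypothetical helper standing in for however that happens.
if __name__ == '__main__':
    config = load_config('example_run')  # hypothetical: build or load the experiment config
    run_full_experiment(config=config)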