# NOTE: this excerpt relies on module-level imports that are not shown here:
# numpy as np, torch, os, wandb, and the project's plotting/debug helpers
# (plot_scores, plot_scores_testing, show_q_values).


def eval(self, env, agent, n_games, name, is_eval=False, using_wandb=False, do_render=False):
    """Run the agent greedily for `n_games` games, either as periodic evaluation
    (is_eval=True) or as a standalone test (is_eval=False)."""
    if not is_eval:  # if it's not eval, it's testing: load the saved checkpoint
        agent.load_models()
    with torch.no_grad():
        step_count = 0
        is_win = False
        agent.is_training(False)
        eval_wins = 0
        eval_scores = []
        for test_game_idx in range(n_games):
            done = False
            eval_score = 0
            state = env.reset()
            while not done:
                step_count += 1
                if do_render:
                    env.render()
                # print(agent.epsilon)
                # if test_game_idx % 10 == 0:
                #     env.print_debug()
                shape_n, source, canvas, pointer = state
                # source, canvas, pointer = state
                state = np.append(source.reshape(-1), canvas.reshape(-1))
                # state = source.reshape(-1)
                state = np.append(state, pointer)
                # prevent automatic casting to float64
                state = np.array(state, dtype=np.float32)
                action, pen_state = agent.choose_action(state)
                # action = random.randint(0, 4)
                state_next, reward, done, is_win = env.step_simultaneous(action, pen_state)
                shape_n_next, source_next, canvas_next, pointer_next = state_next
                state = state_next
                eval_score += reward
            eval_score = round(eval_score, 2)
            eval_scores.append(eval_score)
            if is_win:
                eval_wins += 1
            # test_win_pct = (eval_wins / n_eval_games) * 100
        if np.mean(eval_scores) >= self.eval_best_score and agent.epsilon == 0:
            self.eval_best_score = np.mean(eval_scores)
            if is_eval:
                agent.save_models()
        # if eval_score >= self.eval_best_score and agent.epsilon == 0:
        #     self.eval_best_score = eval_score
        # if eval_wins >= self.eval_best_win_n and agent.epsilon == 0:
        #     self.eval_best_win_n = eval_wins
        #     # TODO: What do we prefer? An agent that achieves higher reward but does not
        #     # draw 100% correctly, or an agent that draws well but takes more time?
        #     # Reward functions, however, could change.
        #     if is_eval:
        #         agent.save_models()
        eval_or_test_name = 'eval' if is_eval else 'test'
        print('############################\n' + eval_or_test_name + '\n',
              n_games, 'games avg SCORE:', np.mean(eval_scores),
              'win pct (%)', (eval_wins / n_games) * 100,
              '\n##################\n')
        if using_wandb:
            wandb.log({
                str(n_games) + " " + str(eval_or_test_name) + " games, win pct (%)":
                    (eval_wins / n_games) * 100
            })
            wandb.log({
                str(n_games) + " " + str(eval_or_test_name) + " games, avg rewards":
                    np.mean(eval_scores)
            })
        plot_scores_testing(eval_scores, n_games,
                            os.path.join(self.plots_path, name) + '_eval.png')
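
# Usage sketch (illustrative only): the call below shows how `eval` might be invoked
# from the owning trainer/runner object. The receiver name `runner` and the argument
# values are assumptions for illustration, not part of the original code.
#
#   runner.eval(runner.env, runner.agent, n_games=100, name='dueling_ddqn_drawing',
#               is_eval=False, using_wandb=False, do_render=False)
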
def test_working(self, agent, name, n_test_games, n_test_games_to_avg):
    """Test a trained agent for `n_test_games` games, rendering every step and
    tracking starts/wins/losses per starting state."""
    # n_test_games = config.n_test_games
    # n_test_games_to_avg = config.n_test_games_to_avg
    n_states = self.env.num_states
    # keep track of starts, wins and losses per starting state
    starts_per_states = {i: 0 for i in range(n_states)}
    wins_per_states = {i: 0 for i in range(n_states)}
    losses_per_states = {i: 0 for i in range(n_states)}

    # ########### TESTING ###########
    # TODO: a separate test def, different from the one below, is already done
    # TODO: create a choose_action variant for testing
    print('########### TESTING ###########')
    # agent.eval_Q.eval()
    test_wins = 0
    test_scores = []
    agent.load_models()
    with torch.no_grad():
        agent.is_training(training=False)
        for test_game_idx in range(n_test_games):
            done = False
            test_score = 0
            game_result = 'lost'
            state = self.env.reset()
            starting_state = self.env.starting_pos
            starts_per_states[starting_state] += 1
            while not done:
                self.env.render()
                # print(agent.epsilon)
                # if test_game_idx % 50 == 0:
                #     self.env.print_debug()
                shape_n, source, canvas, pointer = state
                # source, canvas, pointer = state
                state = np.append(source.reshape(-1), canvas.reshape(-1))
                state = np.append(state, pointer)
                # prevent automatic casting to float64
                state = np.array(state, dtype=np.float32)
                # action = agent.choose_action(state)
                action, act_scores = agent.choose_action_debug(state)
                # take the advantages when working with DuelingDDQN
                act_scores = act_scores[1]
                show_q_values(source.shape[0], pointer[0], pointer[1],
                              act_scores.detach().cpu().numpy()[0])
                # action = random.randint(0, 4)
                state_next, reward, done, is_win = self.env.step(action)
                print(action, act_scores, reward)
                shape_n_next, source_next, canvas_next, pointer_next = state_next
                # source_next, canvas_next, pointer_next = state_next
                state = state_next
                test_score += reward
            test_score = round(test_score, 2)
            test_scores.append(test_score)
            if np.array_equal(source_next, canvas_next):
                test_wins += 1
                game_result = 'won'
                wins_per_states[starting_state] += 1
            else:
                losses_per_states[starting_state] += 1
            print('############################\n game', test_game_idx,
                  '\nscore:', test_scores[-1], '- game', game_result)
        # test_win_pct = (test_wins / n_test_games) * 100
        print('############################\n', test_game_idx + 1, 'games tested.\n',
              n_test_games, 'games avg SCORE:', np.mean(test_scores),
              '\n win pct (%):', (test_wins / (test_game_idx + 1)) * 100)
        wandb.log({str(n_test_games) + " test games, avg score":
                   np.mean(test_scores[n_test_games_to_avg - 1:])})
        wandb.log({str(n_test_games) + " test games, win pct":
                   test_wins / n_test_games * 100})
        plot_scores_testing(test_scores, n_test_games_to_avg,
                            os.path.join(self.plots_path, name) + '_test.png')
    print('Starts per states')
    print(starts_per_states)
    print('Wins per states')
    print(wins_per_states)
    print('#############')
    print('Losses per states')
    print(losses_per_states)
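
# Usage sketch (illustrative only): `test_working` renders every step and prints
# per-state statistics, so it is intended for a small number of games. The receiver
# name `runner` and the argument values below are hypothetical.
#
#   runner.test_working(runner.agent, name='dueling_ddqn_drawing',
#                       n_test_games=20, n_test_games_to_avg=5)
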
def train(name, env, agent, plots_path, max_steps, n_train_games_to_avg,
          eval_games_freq, n_eval_games, using_wandb=False):
    """Train the agent for up to `max_steps` environment steps, logging training
    averages every `n_train_games_to_avg` games and evaluating every
    `eval_games_freq` games."""
    scores = []
    epsilon_history = []
    best_score = -1000
    best_win_pct = 0
    eval_best_win_n = 0
    test_win_pct = 0
    best_train_avg_score = -1000
    wins = 0
    n_steps = 0
    game_n = 0
    while True:
        agent.is_training()
        if n_steps >= max_steps:
            break
        game_n += 1
        done = False
        score = 0
        state = env.reset()
        is_win = False
        while not done:
            n_steps += 1
            # TODO: shape_n not used, for now
            shape_n, source, canvas, pointer = state
            # source, canvas, pointer = state
            state = np.append(source.reshape(-1), canvas.reshape(-1))
            state = np.append(state, pointer)
            # prevent automatic casting to float64
            state = np.array(state, dtype=np.float32)
            action, pen_state = agent.choose_action(state)
            # action = random.randint(0, 4)
            state_next, reward, done, is_win = env.step_simultaneous(action, pen_state)
            shape_n_next, source_next, canvas_next, pointer_next = state_next
            # source_next, canvas_next, pointer_next = state_next
            # if done:
            #     if np.array_equal(source_next, canvas_next):
            #     if reward == 100:
            #         print('win')
            #         wins += 1
            flat_state_next = np.append(source_next.reshape(-1), canvas_next.reshape(-1))
            flat_state_next = np.append(flat_state_next, pointer_next)
            # TODO: Try not casting done to int
            agent.store_transition(state, action, pen_state, reward,
                                   flat_state_next, int(done))
            agent.learn()
            state = state_next
            score += reward

        # Code below runs after each game
        score = round(score, 2)
        if is_win:
            wins += 1
        if game_n % 200 == 0:
            print(score)
        scores.append(score)
        epsilon_history.append(agent.epsilon)
        # if np.mean(scores[-n_train_games_to_avg:]) >= best_train_avg_score:
        #     best_train_avg_score = np.mean(scores[-n_train_games_to_avg:])
        #     agent.save_models()
        if game_n % n_train_games_to_avg == 0:
            print('############################\ntraining recap after', n_steps,
                  'steps and', game_n, 'games.\n',
                  n_train_games_to_avg, 'games avg SCORE:',
                  np.mean(scores[-n_train_games_to_avg:]),
                  'eps:', agent.epsilon,
                  n_train_games_to_avg, 'games win pct (%):',
                  (wins / n_train_games_to_avg) * 100,
                  '\n##################\n')
            plot_scores(scores, epsilon_history, n_train_games_to_avg,
                        os.path.join(plots_path, name) + '.png')
            if using_wandb:
                wandb.log({"50 games avg reward": np.mean(scores[-n_train_games_to_avg:])})
                wandb.log({"50 games n wins": wins / n_train_games_to_avg * 100})
                wandb.log({"epsilon": agent.epsilon})
            wins = 0

        # ########### EVALUATION ###########
        # TODO: a separate test def, different from the one below, is already done
        # TODO: create a choose_action variant for testing
        if game_n % eval_games_freq == 0:
            with torch.no_grad():
                is_win = False
                agent.is_training(False)
                best_eval_score = -100
                # agent.eval_Q.eval()
                eval_wins = 0
                eval_scores = []
                for test_game_idx in range(n_eval_games):
                    done = False
                    eval_score = 0
                    state = env.reset()
                    while not done:
                        # print(agent.epsilon)
                        # if test_game_idx % 10 == 0:
                        #     env.print_debug()
                        shape_n, source, canvas, pointer = state
                        # source, canvas, pointer = state
                        state = np.append(source.reshape(-1), canvas.reshape(-1))
                        state = np.append(state, pointer)
                        # prevent automatic casting to float64
                        state = np.array(state, dtype=np.float32)
                        action, pen_state = agent.choose_action(state)
                        # action = random.randint(0, 4)
                        state_next, reward, done, is_win = env.step_simultaneous(action, pen_state)
                        shape_n_next, source_next, canvas_next, pointer_next = state_next
                        state = state_next
                        eval_score += reward
                    eval_score = round(eval_score, 2)
                    eval_scores.append(eval_score)
                    if is_win:
                        eval_wins += 1
                    # test_win_pct = (eval_wins / n_eval_games) * 100
                # if np.mean(eval_scores) >= best_eval_score:
                #     best_eval_score = np.mean(eval_scores)
                #     agent.save_models()
                if eval_wins >= eval_best_win_n and agent.epsilon == 0:
                    eval_best_win_n = eval_wins
                    # TODO: What do we prefer? An agent that achieves higher reward but does
                    # not draw 100% correctly, or an agent that draws well but takes more
                    # time? Reward functions, however, could change.
                    agent.save_models()
                print('############################\nevaluation after', n_steps,
                      'iterations.\n', n_eval_games, 'games avg SCORE:',
                      np.mean(eval_scores), 'win pct (%)',
                      (eval_wins / n_eval_games) * 100,
                      '\n##################\n')
                if using_wandb:
                    wandb.log({str(n_eval_games) + " eval games, win pct (%)":
                               (eval_wins / n_eval_games) * 100})
                    wandb.log({str(n_eval_games) + " eval games, avg rewards":
                               np.mean(eval_scores)})
                plot_scores_testing(eval_scores, n_eval_games,
                                    os.path.join(plots_path, name) + '_eval.png')
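
# Refactoring sketch (assumption, not part of the original code): the state
# flattening repeated in eval/test/train could be factored into one helper, e.g.:
#
#   def flatten_state(source, canvas, pointer):
#       """Concatenate source grid, canvas grid and pointer into a float32 vector."""
#       flat = np.append(source.reshape(-1), canvas.reshape(-1))
#       flat = np.append(flat, pointer)
#       return np.array(flat, dtype=np.float32)  # avoid implicit float64 upcasting
#
# Usage sketch for `train` (argument values are hypothetical):
#
#   train(name='dueling_ddqn_drawing', env=env, agent=agent, plots_path='plots',
#         max_steps=500_000, n_train_games_to_avg=50, eval_games_freq=1000,
#         n_eval_games=100, using_wandb=False)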