Example #1
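The snippet below assumes the following imports; `plot_scores`, `plot_scores_testing` and `show_q_values` are project-local plotting helpers, and the module path shown for them is a placeholder.

import os

import numpy as np
import torch
import wandb

# from <project_utils> import plot_scores, plot_scores_testing, show_q_values  # placeholder path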
    def eval(self,
             env,
             agent,
             n_games,
             name,
             is_eval=False,
             using_wandb=False,
             do_render=False):
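        """Play `n_games` games without gradient tracking and report the results.

        Used both for periodic evaluation during training (is_eval=True) and for
        standalone testing (is_eval=False, which first loads the saved models);
        scores and win percentage are printed, optionally logged to wandb, and plotted.
        """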
        if not is_eval:  # testing mode (not mid-training evaluation): load the saved models
            agent.load_models()
        with torch.no_grad():
            step_count = 0
            is_win = False
            agent.is_training(False)
            eval_wins = 0
            eval_scores = []
            for test_game_idx in range(n_games):
                done = False
                eval_score = 0
                state = env.reset()
                while not done:
                    step_count += 1
                    if do_render:
                        env.render()
                    # print(agent.epsilon)
                    # if test_game_idx % 10 == 0:
                    #    env.print_debug()
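                    # The observation is a (shape_n, source, canvas, pointer) tuple;
                    # flatten source + canvas and append the pointer to build the
                    # 1-D network input (shape_n is unused for now).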
                    shape_n, source, canvas, pointer = state
                    # source, canvas, pointer = state
                    state = np.append(source.reshape(-1), canvas.reshape(-1))
                    # state = source.reshape(-1)
                    state = np.append(state, pointer)
                    state = np.array(
                        state, dtype=np.float32
                    )  # cast explicitly to avoid silent upcasting to float64

                    action, pen_state = agent.choose_action(state)
                    # action = random.randint(0,4)
                    state_next, reward, done, is_win = env.step_simultaneous(
                        action, pen_state)
                    shape_n_next, source_next, canvas_next, pointer_next = state_next

                    state = state_next

                    eval_score += reward
                    eval_score = round(eval_score, 2)

                eval_scores.append(eval_score)

                if is_win:
                    eval_wins += 1

            # test_win_pct = (eval_wins/n_eval_games) * 100
            if np.mean(eval_scores) >= self.eval_best_score and agent.epsilon == 0:
                self.eval_best_score = np.mean(eval_scores)
                if is_eval:
                    agent.save_models()

            # if eval_score >= self.eval_best_score and agent.epsilon == 0:
            #     self.eval_best_score = eval_score
            # if eval_wins >= self.eval_best_win_n and agent.epsilon == 0:
            #    self.eval_best_win_n = eval_wins
            #    # TODO: What do we prefer? An agent that achieves higher reward but does not draw 100% correct, or an agent that draws well but takes more time? Reward functions, however, could change.
            # if is_eval:
            #     agent.save_models()

            eval_or_test_name = 'eval' if is_eval else 'test'
            print('############################\n' + eval_or_test_name + '\n',
                  n_games, 'games avg SCORE:', np.mean(eval_scores),
                  'win pct (%)', (eval_wins / n_games) * 100,
                  '\n##################\n')
            if using_wandb:
                wandb.log({
                    f"{n_games} {eval_or_test_name} games, win pct (%)":
                    (eval_wins / n_games) * 100
                })
                wandb.log({
                    f"{n_games} {eval_or_test_name} games, avg rewards":
                    np.mean(eval_scores)
                })
            plot_scores_testing(eval_scores, n_games,
                                os.path.join(self.plots_path, name) + '_eval.png')

    def test_working(self, agent, name, n_test_games, n_test_games_to_avg):
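        """Load the saved models and test the agent for `n_test_games` games,
        rendering each step, visualising the per-action Q-values/advantages, and
        reporting wins and losses broken down by starting state."""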
        # n_test_games = config.n_test_games
        # n_test_games_to_avg = config.n_test_games_to_avg

        n_states = self.env.num_states

        # keep track of starts, wins and losses per starting state
        starts_per_states = {i: 0 for i in range(n_states)}
        wins_per_states = {i: 0 for i in range(n_states)}
        losses_per_states = {i: 0 for i in range(n_states)}
        '''########### TESTING ###########'''
        # TODO: a test def different from the one below (already done)
        # TODO: create a choose_action for testing
        print('########### TESTING ###########')
        # agent.eval_Q.eval()
        test_wins = 0
        test_scores = []
        agent.load_models()
        with torch.no_grad():
            agent.is_training(training=False)
            for test_game_idx in range(n_test_games):
                print('test game', test_game_idx + 1, 'of', n_test_games)
                done = False
                test_score = 0
                game_result = 'lost'
                state = self.env.reset()
                starting_state = self.env.starting_pos
                starts_per_states[starting_state] += 1
                while not done:
                    self.env.render()
                    # print(agent.epsilon)
                    #if test_game_idx % 50 == 0:
                    #    self.env.print_debug()
                    shape_n, source, canvas, pointer = state
                    # source, canvas, pointer = state
                    state = np.append(source.reshape(-1), canvas.reshape(-1))
                    state = np.append(state, pointer)
                    state = np.array(state, dtype=np.float32)  # cast explicitly to avoid silent upcasting to float64
                    # action = agent.choose_action(state)
                    action, act_scores = agent.choose_action_debug(state)
                    act_scores = act_scores[1]  # take the advantages when working with DuelingDDQN
                    show_q_values(source.shape[0], pointer[0], pointer[1],
                                  act_scores.detach().cpu().numpy()[0])
                    # action = random.randint(0,4)
                    state_next, reward, done, is_win = self.env.step(action)
                    print(action, act_scores, reward)
                    shape_n_next, source_next, canvas_next, pointer_next = state_next
                    # source_next, canvas_next, pointer_next = state_next
                    state = state_next

                    test_score += reward
                    test_score = round(test_score, 2)
                test_scores.append(test_score)
                if np.array_equal(source_next, canvas_next):
                    test_wins += 1
                    game_result = 'won'
                    wins_per_states[starting_state] += 1
                else:
                    losses_per_states[starting_state] += 1
                print('############################\n game', test_game_idx, '\nscore:', test_scores[-1], '- game',
                      game_result)

                # test_win_pct = (test_wins / n_test_games) * 100

                print('############################\n', test_game_idx + 1, 'games tested.\n',
                      'avg SCORE so far:', np.mean(test_scores),
                      '\n win pct (%):', (test_wins / (test_game_idx + 1)) * 100)

            wandb.log({f"{n_test_games} test games, avg score": np.mean(test_scores[n_test_games_to_avg - 1:])})
            wandb.log({f"{n_test_games} test games, win pct": test_wins / n_test_games * 100})

            plot_scores_testing(test_scores, n_test_games_to_avg,
                                os.path.join(self.plots_path, name) + '_test.png')

            print('Starts per state')
            print(starts_per_states)
            print('Wins per state')
            print(wins_per_states)
            print('#############')
            print('Losses per state')
            print(losses_per_states)


def train(name,
          env,
          agent,
          plots_path,
          max_steps,
          n_train_games_to_avg,
          eval_games_freq,
          n_eval_games,
          using_wandb=False):
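    """Train `agent` on `env` until `max_steps` environment steps have been taken,
    printing/plotting a recap every `n_train_games_to_avg` games and running
    `n_eval_games` evaluation games (agent in non-training mode) every
    `eval_games_freq` games."""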
    scores = []
    epsilon_history = []
    best_score = -1000
    best_win_pct = 0
    eval_best_win_n = 0
    test_win_pct = 0
    best_train_avg_score = -1000
    wins = 0
    n_steps = 0
    game_n = 0
    while True:
        agent.is_training()
        if n_steps >= max_steps:
            break
        game_n += 1
        done = False
        score = 0
        state = env.reset()
        is_win = False
        while not done:
            n_steps += 1
            # TODO: shape_n not used, for now
            shape_n, source, canvas, pointer = state
            # source, canvas, pointer = state
            state = np.append(source.reshape(-1), canvas.reshape(-1))
            state = np.append(state, pointer)
            state = np.array(
                state, dtype=np.float32
            )  # cast explicitly to avoid silent upcasting to float64
            action, pen_state = agent.choose_action(state)
            # action = random.randint(0,4)
            state_next, reward, done, is_win = env.step_simultaneous(
                action, pen_state)
            shape_n_next, source_next, canvas_next, pointer_next = state_next
            # source_next, canvas_next, pointer_next = state_next
            # if done:
            # if np.array_equal(source_next, canvas_next):
            # if reward == 100:
            #    print('win')
            #    wins += 1

            flat_state_next = np.append(source_next.reshape(-1),
                                        canvas_next.reshape(-1))
            flat_state_next = np.append(flat_state_next, pointer_next)

            # TODO: Try not casting done to int
            agent.store_transition(state, action, pen_state, reward,
                                   flat_state_next, int(done))
            agent.learn()

            state = state_next

            score += reward
            score = round(score, 2)

        if is_win:
            wins += 1
        # Code below runs after each game
        if game_n % 200 == 0:
            print(score)
        scores.append(score)
        epsilon_history.append(agent.epsilon)
        # if np.mean(scores[-n_train_games_to_avg:]) >= best_train_avg_score:
        #     best_train_avg_score = np.mean(scores[-n_train_games_to_avg:])
        #     agent.save_models()
        if game_n % n_train_games_to_avg == 0:
            print('############################\ntraining recap after',
                  n_steps, 'steps and', game_n, 'games.\n',
                  n_train_games_to_avg, 'games avg SCORE:',
                  np.mean(scores[-n_train_games_to_avg:]), 'eps:',
                  agent.epsilon, n_train_games_to_avg, 'games win pct (%):',
                  wins / n_train_games_to_avg * 100,
                  '\n##################\n')
            plot_scores(scores, epsilon_history, n_train_games_to_avg,
                        os.path.join(plots_path, name) + '.png')
            if using_wandb:
                wandb.log({
                    f"{n_train_games_to_avg} games avg reward":
                    np.mean(scores[-n_train_games_to_avg:])
                })
                wandb.log({
                    f"{n_train_games_to_avg} games win pct (%)":
                    wins / n_train_games_to_avg * 100
                })
                wandb.log({"epsilon": agent.epsilon})
            wins = 0
        '''########### EVALUATION ###########'''
        # TODO: a test def different from the one below (already done)
        # TODO: create a choose_action for testing
        if game_n % eval_games_freq == 0:
            with torch.no_grad():
                is_win = False
                agent.is_training(False)
                best_eval_score = -100
                # agent.eval_Q.eval()
                eval_wins = 0
                eval_scores = []
                for test_game_idx in range(n_eval_games):
                    done = False
                    eval_score = 0
                    state = env.reset()
                    while not done:
                        # print(agent.epsilon)
                        # if test_game_idx % 10 == 0:
                        #    env.print_debug()
                        shape_n, source, canvas, pointer = state
                        # source, canvas, pointer = state
                        state = np.append(source.reshape(-1),
                                          canvas.reshape(-1))
                        state = np.append(state, pointer)
                        state = np.array(
                            state, dtype=np.float32
                        )  # cast explicitly to avoid silent upcasting to float64

                        action, pen_state = agent.choose_action(state)
                        # action = random.randint(0,4)
                        state_next, reward, done, is_win = env.step_simultaneous(
                            action, pen_state)
                        shape_n_next, source_next, canvas_next, pointer_next = state_next

                        state = state_next

                        eval_score += reward
                        eval_score = round(eval_score, 2)

                    eval_scores.append(eval_score)

                    if is_win:
                        eval_wins += 1
                # test_win_pct = (eval_wins/n_eval_games) * 100
                # if np.mean(eval_scores) >= best_eval_score:
                #    best_eval_score = np.mean(eval_scores)
                #    agent.save_models()
                if eval_wins >= eval_best_win_n and agent.epsilon == 0:
                    eval_best_win_n = eval_wins
                    # TODO: What do we prefer? An agent that achieves higher reward but does not draw 100% correct, or an agent that draws well but takes more time? Reward functions, however, could change.
                    agent.save_models()

                print('############################\nevaluation after',
                      n_steps, 'iterations.\n', n_eval_games,
                      'games avg SCORE:', np.mean(eval_scores), 'win pct (%)',
                      (eval_wins / n_eval_games) * 100,
                      '\n##################\n')
                if using_wandb:
                    wandb.log({
                        str(n_eval_games) + " eval games, win pct (%)":
                        (eval_wins / n_eval_games) * 100
                    })
                    wandb.log({
                        str(n_eval_games) + " eval games, avg rewards":
                        np.mean(eval_scores)
                    })
                plot_scores_testing(
                    eval_scores, n_eval_games,
                    os.path.join(plots_path, name) + '_eval.png')
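
# A minimal usage sketch for `train`, assuming hypothetical environment/agent
# classes and made-up hyperparameters; substitute the project's real
# constructors and config values:
#
# env = DrawingEnv()             # hypothetical env exposing reset()/step_simultaneous()
# agent = DuelingDDQNAgent(...)  # hypothetical agent exposing choose_action()/store_transition()/learn()
# train(name='run_01', env=env, agent=agent, plots_path='plots',
#       max_steps=200_000, n_train_games_to_avg=50,
#       eval_games_freq=500, n_eval_games=20, using_wandb=False)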