def q3_learn_on(gamma, alpha, ep_num, ep_len, eps, pond: FishPond, Q_asterisk):
    """
    creates episodes for Q - learning with action choosing using the learned Q
    :param ep_num: the number of episodes to create
    :param ep_len: the length of each episode to create
    :param eps: epsilon rate for the greedy policy
    :param Q_asterisk: optimal Q
    """
    err, Q = q3_learn_off(gamma, alpha, 1, ep_len, eps, pond, Q_asterisk)
    for e in range(ep_num - 1):
        pond.reset()
        print(e)
        for j in range(ep_len):
            x, y = pond.current_state
            n_a = np.argmax(Q[x, y, :])
            action = actions[n_a]
            reached_end = pond.perform_action(action)
            if reached_end:
                r_s_a = 0
            else:
                r_s_a = -1
            x_tag, y_tag = pond.current_state
            # tabular Q-learning update: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            Q[x, y, n_a] = Q[x, y, n_a] + alpha * (
                r_s_a + gamma * np.max(Q[x_tag, y_tag, :]) - Q[x, y, n_a])
            if reached_end:
                break
        err = np.append(err, get_MSE(Q, Q_asterisk))
    return err, Q
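# Both q3_learn_on and q3_learn_off apply the standard tabular Q-learning update,
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
# where r is 0 on the transition that reaches the end state and -1 otherwise.
# A minimal sketch of that update as a standalone helper (illustrative only; the
# functions in this file inline the update rather than calling a helper like this):
def _q_update_sketch(Q, s, n_a, r, s_next, alpha, gamma):
    """Illustrative tabular Q-learning update. Q is an (H, W, 4) array, s and s_next are (x, y) tuples."""
    x, y = s
    x_tag, y_tag = s_next
    td_target = r + gamma * np.max(Q[x_tag, y_tag, :])
    Q[x, y, n_a] += alpha * (td_target - Q[x, y, n_a])
    return Q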
def q0_sample_policy(pond: FishPond):
    """Sample code for running a policy and plotting the trajectory."""
    for i in range(30):
        action = the_right_policy()
        reached_end = pond.perform_action(action)
        pond.plot()
        if reached_end:
            break
    print('Done')
    plt.savefig('Q0_' + pond_name + '.png')
    plt.show()
def q3_play_a_game(pond: FishPond, pi, section):
    """given a fishpond and a learned policy pi (one action index per state), runs a single game and saves the trajectory plot"""
    # Manhattan distance between start and end; 3 * path_len caps the episode length
    path_len = (np.abs(pond.start_state[0] - pond.end_state[0]) +
                np.abs(pond.start_state[1] - pond.end_state[1]))
    pond.reset()
    for i in range(3 * path_len):
        action = pi[pond.current_state[0], pond.current_state[1]]
        action = actions[action]
        reached_end = pond.perform_action(action)
        pond.plot()
        if reached_end:
            break
    print('Done')
    plt.savefig('Q3_' + pond_name + '_' + section + '_game.png')
    plt.show()
def q1_greedy_policy(pond: FishPond):
    """given a fishpond, runs a game with respect to the greedy policy as implemented in the function
    the_greedy_policy"""
    path_len = (np.abs(pond.start_state[0] - pond.end_state[0]) +
                np.abs(pond.start_state[1] - pond.end_state[1]))
    # outer loop over episodes (a single episode is enough here)
    for e in range(1):
        pond.reset()
        # inner loop for an episode
        for i in range(3 * path_len):
            action = the_greedy_policy(pond.current_state, pond.end_state)
            reached_end = pond.perform_action(action)
            pond.plot()
            if reached_end:
                break
    print('Done')
    plt.savefig('Q1_' + pond_name + '.png')
    plt.show()
def q2_greedy_policy(pond: FishPond, gamma):
    """given a fishpond, runs a single game with the policy computed with the policy iteration procedure implemented
    in the function q2_learn_q_phi"""
    path_len = (np.abs(pond.start_state[0] - pond.end_state[0]) +
                np.abs(pond.start_state[1] - pond.end_state[1]))
    pi, _ = q2_learn_q_phi(pond, gamma)
    pi = pi.astype(np.int64)
    pond.reset()
    for i in range(3 * path_len):
        action = pi[pond.current_state[0], pond.current_state[1]]
        action = actions[action]
        reached_end = pond.perform_action(action)
        pond.plot()
        if reached_end:
            break
    print('Done')
    plt.savefig('Q2_' + pond_name + '.png')
    plt.show()
def Q3(pond: FishPond):
    """wrapper function for Q3. runs Q-learning for each of the settings, then plots the errors"""
    gamma = 0.45
    alpha = 0.1
    epsilon = 0.5
    pond.reset()
    path_len = (np.abs(pond.start_state[0] - pond.end_state[0]) +
                np.abs(pond.start_state[1] - pond.end_state[1]))
    pi, Q_asterisk = q2_learn_q_phi(pond, gamma)
    err_off, Q_off = q3_learn_off(gamma, alpha, 30000, 3 * path_len, epsilon,
                                  pond, Q_asterisk)
    err_on, Q_on = q3_learn_on(gamma, alpha, 30000, 3 * path_len, epsilon,
                               pond, Q_asterisk)
    plot_errors(err_off, err_on)

    # play a game
    pi_off = np.argmax(Q_off, axis=2)
    pi_on = np.argmax(Q_on, axis=2)
    q3_play_a_game(pond, pi_off, 'a')
    q3_play_a_game(pond, pi_on, 'b')
def q3_learn_off(gamma, alpha, ep_num, ep_len, eps, pond: FishPond,
                 Q_asterisk):
    """
    creates episodes for Q - learning with action choosing with greedy policy
    :param ep_num: the number of episodes to create
    :param ep_len: the length of each episode to create
    :param eps: epsilon rate for the greedy policy
    """
    Q = np.zeros((pond.pond_size[0], pond.pond_size[1], 4))
    # alternative initialization, kept for reference:
    # Q = np.ones((pond.pond_size[0], pond.pond_size[1], 4)) * (-1)
    # Q[pond.end_state[0], pond.end_state[1]] = 0
    err = np.empty(0)
    for e in range(ep_num):
        pond.reset()
        print(e)
        for j in range(ep_len):
            x, y = pond.current_state
            # epsilon-greedy behavior policy: with probability 1 - eps take the
            # hand-crafted greedy action, otherwise a uniformly random action
            a1 = the_greedy_policy(pond.current_state, pond.end_state)
            a2 = np.random.choice(actions)
            action = np.random.choice([a1, a2], 1, p=[1 - eps, eps])[0]
            n_a = numed_actions[action]
            reached_end = pond.perform_action(action)
            if reached_end:
                r_s_a = 0
            else:
                r_s_a = -1
            x_tag, y_tag = pond.current_state
            Q[x, y, n_a] = Q[x, y, n_a] + alpha * (
                r_s_a + gamma * np.max(Q[x_tag, y_tag, :]) - Q[x, y, n_a])
            if reached_end:
                break
        err = np.append(err, get_MSE(Q, Q_asterisk))
    return err, Q
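# get_MSE (defined elsewhere in the assignment code) is used above to track how far the
# learned Q is from Q_asterisk after every episode. A minimal sketch of such a metric,
# assuming it is simply the mean squared error over all state-action entries (an
# assumption about its behavior, not the assignment's actual implementation):
def _mse_sketch(Q, Q_asterisk):
    """Illustrative mean squared error between two (H, W, 4) Q tables."""
    return np.mean((Q - Q_asterisk) ** 2)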
def q2_compute_Q(prev_Q, pond: FishPond, gamma):
    """ computes the Q values for each state and action at a given iteration"""
    r_s_a = -1
    Q = np.zeros(prev_Q.shape)
    Q_max = np.max(prev_Q, axis=2)
    for x in range(pond.pond_size[0]):
        for y in range(pond.pond_size[1]):
            at_end = x == pond.end_state[0] and y == pond.end_state[1]
            for i, a in enumerate(actions):
                # the -1 step cost is skipped at the end state or when the action can reach it
                add_reward = 0 if at_end else 1
                p_s_tag = pond.get_action_outcomes((x, y), a)
                p_tag_summed = 0
                for option in p_s_tag:
                    pos = option[1]
                    if pos[0] == pond.end_state[0] and pos[1] == pond.end_state[1]:
                        add_reward = 0
                    p_tag_summed += Q_max[pos[0], pos[1]] * option[0]
                p_tag_summed *= gamma
                Q[x, y, i] = p_tag_summed
                if add_reward:
                    Q[x, y, i] += r_s_a
    return Q
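# q2_compute_Q performs a single backup of the optimal Q function. q2_learn_q_phi
# (defined elsewhere) is expected to iterate such backups to convergence and return the
# greedy policy together with Q. A minimal value-iteration sketch under that assumption
# (illustrative only, not the assignment's q2_learn_q_phi; tol and max_iter are
# hypothetical parameters):
def _value_iteration_sketch(pond: FishPond, gamma, tol=1e-6, max_iter=10000):
    """Iterates q2_compute_Q to a fixed point and extracts the greedy policy."""
    Q = np.zeros((pond.pond_size[0], pond.pond_size[1], 4))
    for _ in range(max_iter):
        new_Q = q2_compute_Q(Q, pond, gamma)
        if np.max(np.abs(new_Q - Q)) < tol:
            Q = new_Q
            break
        Q = new_Q
    pi = np.argmax(Q, axis=2)
    return pi, Q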


if __name__ == "__main__":
    names = ['pond1', 'pond2', 'pond3', 'pond4', 'pond43', 'pond5']
    for pond_name in names:
        my_pond = FishPond(pond_name + '.txt')
        # Run some sample policy for reference
        q0_sample_policy(my_pond)
        q1_greedy_policy(my_pond)
        q2_greedy_policy(my_pond, 1)
        Q3(my_pond)