Example 1
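The examples below appear to be excerpted from a larger gridworld/CartPole RL project; they rely on numpy plus project-specific classes and helpers. A minimal sketch of the assumed imports (module names are guesses, not confirmed by the source):

import time
import numpy as np

# Project modules, names assumed for illustration only:
# from environment import Grid, GridEpisode, CartPole, CartPoleEpisode
# from policy import estimation            # also referenced below as `pe`
# `grid_q` (Example 4) is expected to be a module-level result queue.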
def grid_sampling(theta, cm, K, Ke, N, epsilon):
    # One iteration of cross-entropy-style search: sample K candidate
    # parameter vectors from N(theta, cm), score each over N episodes,
    # then refit the mean and covariance to the Ke best candidates.
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            grid = Grid()
            grid.pi_params = theta_list[x].reshape(23, 4)
            grid.softmax()
            epi = GridEpisode(grid)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))

    # print(sorted(result_list, key=lambda n: n[-1], reverse=True))
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    # print(elite_list)
    theta_final = np.zeros(92)  # 92 parameters = 23 states x 4 actions
    cm_final = epsilon * np.identity(92)
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    # print(cm_final)
    J_final /= Ke
    return theta_final, cm_final, J_final
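grid_sampling above performs a single iteration of cross-entropy-style policy search; a minimal sketch of an outer loop that would drive it repeatedly (the function name and hyperparameter values here are illustrative assumptions):

def run_grid_cem(iterations=20, K=50, Ke=10, N=5, epsilon=1e-3):
    # Start from a flat parameter vector and a broad diagonal covariance,
    # then repeatedly resample around the refitted elite mean/covariance.
    theta = np.ones(92) * 0.25
    cm = np.identity(92)
    for it in range(iterations):
        theta, cm, J = grid_sampling(theta, cm, K, Ke, N, epsilon)
        print('iteration:', it, 'elite mean reward:', J)
    return theta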
Example 2
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular q-values to zero
    q = np.zeros((23, 4))

    for x in range(epoch):
        s = grid.d_zero()

        while s != [5, 5]:
            # choose a from s using an ε-soft policy derived from q
            pi_temp = pe.softmax(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # print(q)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)]) -
                q[grid.get_index(s), actions.index(a)])
            s = new_s
        # using q function to estimate the reward and add it to estimated_reward
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ',
              eps(x))
        # decay *= decay_rate

    return estimated_rewards
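Note that qlearning_grid takes the exploration rate eps as a callable evaluated once per episode, so a decay schedule can be passed in directly; a small usage sketch (the schedule itself is an assumption):

eps_schedule = lambda x: max(0.05, 1.0 / (x + 1))   # decays towards a 0.05 floor
q_rewards = qlearning_grid(lr=0.1, eps=eps_schedule, epoch=100, searchbound=400)
print('mean reward over the last 10 episodes:', q_rewards[-10:].mean())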
Example 3
def grid_evaluate(table, N):
    avg_reward = 0
    for i in range(N):
        g = Grid()
        g.pi_params = table
        g.softmax()
        epi = GridEpisode(g)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
Example 4
def multi_grid_episode(table, l):
    # total_reward = 0
    for i in l:
        grid = Grid()
        # print(i)
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        grid_q.put(epi.run_all_steps())
    return 0
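multi_grid_episode pushes each episode's return onto a module-level queue named grid_q, which suggests it is meant to be run by several workers at once. A thread-based driver sketch, assuming grid_q is a standard queue shared with the function above (the original likely used multiprocessing):

import queue
import threading

grid_q = queue.Queue()

def parallel_grid_evaluate(table, N, workers=4):
    # Split the N episode indices round-robin across workers; each worker
    # pushes one return per index onto grid_q via multi_grid_episode.
    chunks = [list(range(N))[w::workers] for w in range(workers)]
    threads = [threading.Thread(target=multi_grid_episode, args=(table, c))
               for c in chunks if c]
    for t in threads:
        t.start()
    rewards = [grid_q.get() for _ in range(N)]
    for t in threads:
        t.join()
    return sum(rewards) / N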
Example 5
def grid_evaluate(t, N):
    reward_l = []
    table = t.reshape(23, 4)

    for i in range(N):
        # concurrent_eval(theta_list, x, result_list, N)
        grid = Grid()
        # print(i)
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        reward_l.append(epi.run_all_steps())
    return sum(reward_l) / N
Example 6
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-q arbitrarily
    q = np.zeros((23, 4))

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()

        # e ← 0
        e = np.zeros((23, 4))

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]

        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # choose new_a from new_s using policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # e ← γλe + ∂qw(s,a)/∂qw;
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # δ ← r + γqw(s′,a′) − qw(s,a);
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] - q[grid.get_index(s), actions.index(a)]
            # w ← w + αδe;
            q += lr * delta * e

            s = new_s
            a = new_a
        # using q function to estimate the reward and add it to estimated_reward
        # print('episode: ', x, ', q function: ', q)
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
        # decay *= decay_rate

    return estimated_rewards
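A quick way to see the effect of the trace-decay parameter l is to sweep a few values and compare the tail of each learning curve; a small sketch (the schedule and values are illustrative):

eps_schedule = lambda x: max(0.05, 0.9 ** x)
for lam in (0.0, 0.5, 0.9):
    curve = sarsa_lambda_grid(lr=0.1, l=lam, eps=eps_schedule, epoch=100)
    print('lambda:', lam, 'mean reward, last 10 episodes:', curve[-10:].mean())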
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)

    # theta is a representation of policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # print(epoch)

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1

        # delta_j = 0
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]),
                  actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma

        grid.pi_params = estimation.softmax(theta, eps(x))
        # grid.softmax()
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
        # decay *= decay_rate

    return estimated_rewards
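The double loop above recomputes the discounted return G_t from scratch at every time step, which costs O(T^2) per episode; the same updates can be produced in a single backward pass. A sketch of an equivalent helper (the function name is hypothetical):

def reinforce_updates(theta, grid, actions, hist_s, hist_a, hist_r, lr):
    # Walk the episode backwards so that g always equals the discounted
    # return G_t from step i onward; gamma**i reproduces the `decay` factor.
    g = 0.0
    for i in reversed(range(len(hist_s))):
        g = hist_r[i] + grid.gamma * g
        theta[grid.get_index(hist_s[i]),
              actions.index(hist_a[i])] += lr * grid.gamma ** i * g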
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)

    # Initialize tabular-v arbitrarily
    v = np.zeros(23)
    # theta is a representation of policy
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action

    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5] and count < 1000:
            # a ∼ π(s, ·);
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)

            # Critic update (one-step TD):
            # δ ← r + γ v(s′) − v(s);
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[
                grid.get_index(s)]
            # v(s) ← v(s) + αδ;
            v[grid.get_index(s)] += lr * delta

            # Actor update: θ(s,a) ← θ(s,a) + αδ, with δ as the advantage estimate
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            # print(theta)

            s = new_s
            count += 1
        # evaluate the current softmax policy and record the episode reward
        # print('episode: ', x, ', theta: ', theta)
        grid.pi_params = estimation.softmax(theta, eps(x))
        # grid.softmax()
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        # print('episode: ', x, ', pi: ', grid.pi_params)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
        # decay *= decay_rate

    return estimated_rewards
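The critic in actor_critic_grid uses a one-step TD update; a version that keeps an eligibility trace on the value function (the TD(λ) variant hinted at by the original comments) might look like the sketch below. The function name and the lam parameter are assumptions, not part of the source.

def actor_critic_lambda_grid(lr, lam, eps, epoch=100, searchbound=400):
    # Sketch only: same structure as actor_critic_grid above, but the critic
    # maintains an accumulating eligibility trace e_v, i.e. a TD(lambda) update.
    estimated_rewards = np.zeros(epoch)
    v = np.zeros(23)
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action

    for x in range(epoch):
        s = grid.d_zero()
        e_v = np.zeros(23)
        count = 0
        while s != [5, 5] and count < 1000:
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            new_s, r = grid.P_and_R(s, a)
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[grid.get_index(s)]
            e_v *= grid.gamma * lam              # decay all state traces
            e_v[grid.get_index(s)] += 1          # accumulate trace for s
            v += lr * delta * e_v                # TD(lambda) critic update
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            s = new_s
            count += 1
        grid.pi_params = estimation.softmax(theta, eps(x))
        estimated_rewards[x] = GridEpisode(grid, step_bound=searchbound).run_all_steps()
    return estimated_rewards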
def cartpole_evaluate(table, N):
    # NOTE: the original snippet starts mid-function; the name, signature, and
    # avg_reward initialisation are assumed by analogy with grid_evaluate above.
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N


tic = time.time()

theta = np.ones(92) * 0.25
theta_f = grid_param_sampling(theta, 0.5, 200)
grid = Grid()
grid.pi_params = theta_f.reshape(23, 4)
grid.softmax()
episode = GridEpisode(grid)

print('optimized reward: ', episode.run_all_steps())
print('optimized theta: ', theta_f.reshape(23, 4))

# theta = np.ones(8) * 0.25
# theta_f = cartpole_sampling(theta, 0.5, 500)
# cartpole = CartPole()
# cartpole.pi_params = theta_f.reshape(4, 2)
# episode = CartPoleEpisode(cartpole)

# print('optimized reward: ', episode.run_all_steps())
# print('optimized theta: ', theta_f.reshape(4, 2))

toc = time.time()
print('running time: ', (toc - tic) / 60, ' mins')