Example 1
import numpy as np

# These examples also assume the project's CartPole environment and its helpers
# (vw, dvwdw, the pe policy-evaluation module, CartPoleEpisode) are importable.


def td_cp_single(f_order, alpha):
    d = 4  # dimensionality of the CartPole state vector

    cartpole = CartPole()

    print('cartpole ', f_order, ' td')

    weight = np.zeros((1, (f_order + 1) ** d))
    # Learn the weights over 100 episodes of semi-gradient TD(0).
    print('alpha = ', alpha)
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        # Run each episode until the cart leaves the track, the pole falls past the
        # failure angle, or 1010 steps elapse.
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            # Semi-gradient TD(0) update for the linear Fourier-basis value function.
            delta = r + vw(weight, new_s, f_order) - vw(weight, s, f_order)
            weight += alpha * delta * dvwdw(weight, s, f_order).T
            s = new_s
            print(weight)
            count += 1
    # Estimate the mean squared TD error over another 100 evaluation episodes.
    td_list = []
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
            s = new_s
            count += 1
        td_list.append(0)  # treat the terminal transition as zero TD error
    print('square td = ', np.mean(np.array(td_list)))
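The two helpers this snippet relies on, vw (the linear value estimate) and dvwdw (its gradient with respect to the weights), are not shown. Below is a minimal sketch of what they might look like for a kth-order Fourier basis over a d-dimensional state; the feature construction and the assumption that the state is already scaled to [0, 1] are mine, not the original code.

import numpy as np
from itertools import product

def fourier_features(s, order):
    # One coefficient vector c for every point of {0, ..., order}^len(s);
    # feature phi_i(s) = cos(pi * c_i . s), assuming s is normalized to [0, 1].
    coeffs = np.array(list(product(range(order + 1), repeat=len(s))))
    return np.cos(np.pi * coeffs @ np.asarray(s)).reshape(-1, 1)  # ((order+1)^d, 1)

def vw(weight, s, order):
    # Linear value estimate v(s) = w . phi(s); weight has shape (1, (order+1)^d).
    return (weight @ fourier_features(s, order)).item()

def dvwdw(weight, s, order):
    # For a linear approximator the gradient w.r.t. w is phi(s) itself, returned
    # as a column so that .T matches the (1, n) weight row used in the update.
    return fourier_features(s, order)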
Example 2
def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros(
            (1, len(actions) * num_tilings * (tiles_per_tiling**len(s))))

    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order**len(s)))

    for x in range(epoch):
        s = cartpole.d_zero()

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        first_q = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                    actions, eps)
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        # Run each episode until the cart leaves the track, the pole falls, or 1010 steps elapse.
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(
                pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            # SARSA update: w += lr * (r + q(s', a') - q(s, a)) * dq(s, a)/dw
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards
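pe.epsilon_greedy is used above as if it returned a full probability vector over the actions (its output is passed straight to np.random.choice as p). A plausible sketch of such a helper, assumed here rather than taken from the pe module:

import numpy as np

def epsilon_greedy(q_values, actions, eps):
    # Spread eps uniformly over all actions and give the greedy action the
    # remaining 1 - eps, so the entries form a valid probability distribution.
    probs = np.full(len(actions), eps / len(actions))
    probs[int(np.argmax(q_values))] += 1.0 - eps
    return probs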
Example 3
def td_cp(lrs, f_order):
    d = 4  # dimensionality of the CartPole state vector

    alpha_result = []
    cartpole = CartPole()

    print('cartpole ', f_order, ' td')

    # The kth-order Fourier basis over a d-dimensional normalized state has one feature
    # phi_i(s) = cos(pi * c_i . s) for every c_i in {0, ..., k}^d, hence (k + 1)**d weights.
    for alpha in lrs:
        weight = np.zeros((1, (f_order + 1) ** d))
        # Learn the weights over 100 episodes of semi-gradient TD(0).
        print('alpha = ', alpha)
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                # Semi-gradient TD(0) update for the linear Fourier-basis value function.
                delta = r + vw(weight, new_s, f_order) - vw(weight, s, f_order)
                weight += alpha * delta * dvwdw(weight, s, f_order).T
                s = new_s
                count += 1
        # print(weight)

        # Estimate the mean squared TD error over another 100 evaluation episodes.
        td_list = []
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
                s = new_s
                count += 1
            td_list.append(0)

        msv = np.mean(np.array(td_list))
        print('square td = ', msv)
        # Diverged runs yield NaN; penalize them so they never win the learning-rate sweep.
        if np.isnan(msv):
            alpha_result.append(1e100)
        else:
            alpha_result.append(msv)

    print('##########################')
    return alpha_result
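A possible way to drive this sweep; the step sizes and the basis order below are illustrative values, not the ones used in the original experiments:

lrs = [1e-4, 1e-3, 1e-2]        # candidate step sizes (assumed)
errors = td_cp(lrs, f_order=3)  # mean squared TD error for each step size
best_alpha = lrs[int(np.argmin(errors))]
print('best alpha:', best_alpha)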
Example 4
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = cartpole.d_zero()

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a from s using a policy derived from q (ε-greedy with a decaying ε);
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                        actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            # Q-learning update: the target bootstraps from max_a' q(s', a').
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards
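An illustrative call with a decaying exploration schedule; the schedule and hyperparameter values are assumptions made for this example, not taken from the source:

rewards = qlearning_cartpole(
    lr=1e-3,
    baseparams={'order': 3},
    decaylambda=lambda episode: max(0.01, 0.5 * 0.95 ** episode),  # assumed epsilon schedule
    epoch=100,
    base='fourier',
)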