Esempio n. 1
0
def sarsa_lambda_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a linear Sarsa(lambda) agent on MountainCar, scoring it each epoch.

    Every epoch runs one training episode (capped at 1000 steps) that updates
    the weight vector online with an accumulating eligibility trace, then
    evaluates the current weights with ``MountainCarEpisode.run_with_w``.
    The discount factor is implicitly 1.

    Args:
        lr: Step size applied to each weight update.
        l: Trace-decay factor (lambda) of the eligibility trace.
        baseparams: Basis configuration. ``'order'`` is read here when
            ``base == 'fourier'`` and ``'num_tilings'`` when
            ``base == 'tile'``; the dict is also forwarded unchanged to the
            ``fa`` helpers and to the evaluation episode, which may need
            further keys.
        eps: Callable taking the epoch index and returning the exploration
            value handed to ``estimation.epsilon_greedy``.
        epoch: Number of training episodes.
        base: Basis name, ``'fourier'`` or ``'tile'``.

    Returns:
        NumPy array of length ``epoch`` whose entry ``x`` is the value
        returned by ``run_with_w`` after training episode ``x``.

    Raises:
        ValueError: If ``base`` is neither ``'fourier'`` nor ``'tile'``.
    """
    # Fail fast: an unknown basis used to leave w as None and crash later
    # with an unrelated AttributeError.
    if base not in ('fourier', 'tile'):
        raise ValueError(
            "unsupported base %r; expected 'fourier' or 'tile'" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    if base == 'fourier':
        order = baseparams['order']
        state_dim = len(mc.d_zero())
        w = np.zeros((1, len(actions) * (order + 1) ** state_dim))
    else:
        num_tilings = baseparams['num_tilings']
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = mc.d_zero()

        # e <- 0: the eligibility trace is reset at the start of each episode.
        e = np.zeros(w.shape)

        # Choose a from s using the epsilon-greedy policy derived from q.
        first_q = estimation.epsilon_greedy(
            fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Choose a' from s' using the policy derived from q.
            pi_temp = estimation.epsilon_greedy(
                fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # A terminal successor contributes no future value. The test
            # mirrors the loop condition instead of comparing against a
            # hard-coded state, which also breaks for array-like states.
            if new_s[0] >= mc.right_bound:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)

            # e <- gamma * lambda * e + dq_w(s, a)/dw   (gamma = 1)
            e = l * e + dqdw
            # delta <- r + gamma * q_w(s', a') - q_w(s, a)   (gamma = 1)
            delta = r + new_q - q
            # w <- w + alpha * delta * e
            w += lr * delta * e

            s = new_s
            a = new_a
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
def sarsa_mountaincar(lr, baseparams, eps, epoch=100, base='fourier',
                      max_steps=None):
    """Train a linear one-step Sarsa agent on MountainCar, scoring each epoch.

    Every epoch runs one training episode that updates the weight vector
    online, then evaluates the current weights with
    ``MountainCarEpisode.run_with_w``. The discount factor is implicitly 1.

    Args:
        lr: Step size applied to each weight update.
        baseparams: Basis configuration. ``'order'`` is read here when
            ``base == 'fourier'`` and ``'num_tilings'`` when
            ``base == 'tile'``; the dict is also forwarded unchanged to the
            ``fa`` helpers and to the evaluation episode, which may need
            further keys.
        eps: Callable taking the epoch index and returning the exploration
            value handed to ``estimation.epsilon_greedy``.
        epoch: Number of training episodes.
        base: Basis name, ``'fourier'`` or ``'tile'``.
        max_steps: Optional cap on the number of steps in a training
            episode. ``None`` (the default) keeps the episode running until
            the car reaches ``mc.right_bound``.

    Returns:
        NumPy array of length ``epoch`` whose entry ``x`` is the value
        returned by ``run_with_w`` after training episode ``x``.

    Raises:
        ValueError: If ``base`` is neither ``'fourier'`` nor ``'tile'``.
    """
    # Fail fast: an unknown basis used to leave w as None and only fail
    # once it reached the function-approximation helpers.
    if base not in ('fourier', 'tile'):
        raise ValueError(
            "unsupported base %r; expected 'fourier' or 'tile'" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    if base == 'fourier':
        order = baseparams['order']
        state_dim = len(mc.d_zero())
        w = np.zeros((1, len(actions) * (order + 1) ** state_dim))
    else:
        num_tilings = baseparams['num_tilings']
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = mc.d_zero()

        # Choose a from s using the epsilon-greedy policy derived from q.
        first_q = estimation.epsilon_greedy(
            fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while s[0] < mc.right_bound and (max_steps is None
                                         or count < max_steps):
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Choose a' from s' using the policy derived from q.
            pi_temp = estimation.epsilon_greedy(
                fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # A terminal successor contributes no future value; the test
            # mirrors the loop's own termination condition.
            if new_s[0] >= mc.right_bound:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)

            # w <- w + alpha * (r + q_w(s', a') - q_w(s, a)) * dq_w(s, a)/dw
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
def reinforce_mc(alpha, beta, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a REINFORCE-with-baseline agent on MountainCar, scoring each epoch.

    Every epoch samples one episode (capped at 1000 steps) from the softmax
    policy parameterised by ``theta``. It then replays the episode to
    accumulate the policy-gradient estimate (undiscounted return minus the
    state-value baseline) while updating the baseline weights ``w`` with a
    trace-based TD update. ``theta`` is updated once per episode, and the
    policy is then evaluated with ``MountainCarEpisode.run_with_w_softmax``.

    Args:
        alpha: Step size for the baseline (value-function) weights.
        beta: Step size for the policy parameters.
        l: Trace-decay factor (lambda) for the baseline's eligibility trace.
        baseparams: Basis configuration; ``'order'`` is read here and the
            dict is forwarded unchanged to the ``fa`` helpers and to the
            evaluation episode.
        eps: Callable taking the epoch index; its result is passed as the
            second argument of ``estimation.softmax`` (presumably a
            temperature-like parameter -- confirm against that helper).
        epoch: Number of training episodes.
        base: Basis name. Only ``'fourier'`` is supported.

    Returns:
        NumPy array of length ``epoch`` whose entry ``x`` is the value
        returned by ``run_with_w_softmax`` after training episode ``x``.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    # Fail fast: any other basis used to leave w unbound and theta as None.
    if base != 'fourier':
        raise ValueError(
            "unsupported base %r; only 'fourier' is implemented" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    order = baseparams['order']
    state_dim = len(mc.d_zero())
    theta = np.zeros((1, len(actions) * (order + 1) ** state_dim))
    w = np.zeros((1, (order + 1) ** state_dim))

    for x in range(epoch):
        s = mc.d_zero()
        e = np.zeros(w.shape)

        hist_s = []
        hist_a = []
        hist_r = []
        hist_pi = []

        count = 0
        dj = np.zeros(theta.shape)

        # Sample one episode, until the terminal state or the step cap.
        while s[0] < mc.right_bound and count < 1000:
            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            new_s, r = mc.P_and_R(s, a)

            hist_a.append(a)
            hist_s.append(s)
            hist_r.append(r)
            hist_pi.append(pi_temp)

            s = new_s
            count += 1

        # Undiscounted return-to-go for every step, built in one backward
        # pass instead of re-summing the reward tail for each step.
        # ``g = g + r`` (not ``+=``) so stored values are never aliased.
        returns = []
        g = 0
        for r in reversed(hist_r):
            g = g + r
            returns.append(g)
        returns.reverse()

        last = len(hist_s) - 1
        for i, (s_i, a_i, pi_i) in enumerate(zip(hist_s, hist_a, hist_pi)):
            v, dv = fa.vw(w, s_i, base, baseparams)
            dj += (returns[i] - v) * dsoftmax(s_i, a_i, order, actions, pi_i)
            e = l * e + dv

            # The last recorded step is treated as leading to a terminal
            # state, so nothing is bootstrapped from beyond it.
            if i == last:
                next_v = 0
            else:
                next_v = fa.vw(w, hist_s[i + 1], base, baseparams)[0]
            # w has not changed since v was computed, so v is reused here.
            delta = hist_r[i] + next_v - v

            w += alpha * delta * e
        theta += beta * dj

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
def actor_critic_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a one-step actor-critic agent on MountainCar, scoring each epoch.

    Every epoch runs one training episode (capped at 1000 steps). At each
    step the critic weights ``w`` receive a TD(lambda) update and the softmax
    policy parameters ``theta`` are moved along the TD error times the
    gradient of the log-policy. The policy is then evaluated with
    ``MountainCarEpisode.run_with_w_softmax``.

    Args:
        lr: Step size shared by the critic and the actor updates.
        l: Trace-decay factor (lambda) for the critic's eligibility trace.
        baseparams: Basis configuration; ``'order'`` is read here and the
            dict is forwarded unchanged to the ``fa`` helpers and to the
            evaluation episode.
        eps: Callable taking the epoch index; its result is passed as the
            second argument of ``estimation.softmax`` (presumably a
            temperature-like parameter -- confirm against that helper).
        epoch: Number of training episodes.
        base: Basis name. Only ``'fourier'`` is supported.

    Returns:
        NumPy array of length ``epoch`` whose entry ``x`` is the value
        returned by ``run_with_w_softmax`` after training episode ``x``.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    # Fail fast: any other basis used to leave w and theta as None.
    if base != 'fourier':
        raise ValueError(
            "unsupported base %r; only 'fourier' is implemented" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    order = baseparams['order']
    state_dim = len(mc.d_zero())
    w = np.zeros((1, (order + 1) ** state_dim))
    theta = np.zeros((1, len(actions) * (order + 1) ** state_dim))

    for x in range(epoch):
        s = mc.d_zero()
        # e_v <- 0: the critic's eligibility trace is reset each episode.
        e = np.zeros(w.shape)

        count = 0

        # For each time step, until s is terminal or the step cap is hit.
        while s[0] < mc.right_bound and count < 1000:
            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Gradient of ln pi(s, a, theta) for a linear softmax: the
            # feature block of action b gets (1[b == a] - pi(b | s)) * phi(s).
            # phi depends only on s, so it is computed once per step.
            phi = fa.fourier_phi_mc(s, order).T
            width = phi.shape[1]
            dtheta = np.zeros((1, len(actions) * (order + 1) ** len(s)))
            for idx, action in enumerate(actions):
                indicator = 1 if action == a else 0
                dtheta[:, idx * width:(idx + 1) * width] = (
                    (indicator - pi_temp[idx]) * phi)

            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            # Critic update using TD(lambda).
            v, dv = fa.vw(w, s, base, baseparams)
            # A terminal successor has value 0. ``>=`` mirrors the loop
            # condition, so a state exactly on the bound counts as terminal.
            if new_s[0] >= mc.right_bound:
                new_v = 0
            else:
                new_v = fa.vw(w, new_s, base, baseparams)[0]

            # e_v <- gamma * lambda * e_v + dv_w(s)/dw
            e = l * mc.gamma * e
            e += dv
            # delta <- r + gamma * v_w(s') - v_w(s)
            delta = r + mc.gamma * new_v - v
            # w <- w + alpha * delta * e_v
            w += lr * delta * e

            # Actor update: theta <- theta + alpha * delta * d ln pi / dtheta
            # (no gamma^t factor is applied).
            theta += lr * delta * dtheta

            s = new_s
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards