Exemple #1
0
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    """Run tabular Q-learning on ``Grid`` and score the policy after each episode.

    Args:
        lr: Step size applied to the temporal-difference error.
        eps: Callable taking the episode index; its result is forwarded to
            ``pe.softmax`` as the third argument.
        epoch: Number of training episodes.
        searchbound: ``step_bound`` handed to each evaluation ``GridEpisode``.

    Returns:
        Array of length ``epoch`` holding the value returned by
        ``GridEpisode.run_all_steps()`` after each training episode.
    """
    env = Grid()
    env.pi_params = np.zeros((23, 4))
    env.softmax()
    moves = env.action
    scores = np.zeros(epoch)

    # One row per state index, one column per action.
    q_table = np.zeros((23, 4))

    for episode in range(epoch):
        state = env.d_zero()

        # NOTE: there is no step cap here; the loop only ends once the state
        # compares equal to [5, 5].
        while state != [5, 5]:
            # Sample the behaviour action from a softmax over this state's row.
            probs = pe.softmax(q_table[env.get_index(state)], moves,
                               eps(episode))
            action = np.random.choice(moves, 1, p=probs)[0]

            next_state, reward = env.P_and_R(state, action)

            # Off-policy target: bootstrap from the best next-state value.
            row = env.get_index(state)
            col = moves.index(action)
            best_next = np.max(q_table[env.get_index(next_state)])
            q_table[row, col] += lr * (
                reward + env.gamma * best_next - q_table[row, col])

            state = next_state

        # Evaluate the softmax policy induced by the current Q table.
        env.pi_params = pe.softmax(q_table, moves, eps(episode))
        evaluation = GridEpisode(env, step_bound=searchbound)
        scores[episode] = evaluation.run_all_steps()
        print('episode: ', episode, ', reward: ', scores[episode],
              'epsilon: ', eps(episode))

    return scores
Exemple #2
0
def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    """Train a linear SARSA agent on ``CartPole`` and score it every episode.

    Args:
        lr: Step size for the weight update.
        baseparams: Dict of basis settings. ``'order'`` is read for the
            ``'fourier'`` base and ``'num_tilings'`` for the ``'tile'`` base;
            the dict is also forwarded unchanged to the ``pe`` helpers.
        epoch: Number of training episodes.
        eps: Value forwarded to ``pe.softmax`` as its third argument.
        base: Basis name, either ``'fourier'`` or ``'tile'``.

    Returns:
        Array of length ``epoch`` with the value returned by
        ``CartPoleEpisode.run_with_w_softmax`` after each training episode.

    Raises:
        ValueError: If ``base`` is not one of the supported basis names.
    """
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions

    if base == 'fourier':
        order = baseparams['order']
        # Only the length of a start state is needed to size the weights.
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))
    elif base == 'tile':
        num_tilings = baseparams['num_tilings']
        w = np.zeros((1, len(actions) * num_tilings))
    else:
        # Previously `w` stayed None and the failure surfaced much later.
        raise ValueError(
            "unsupported base %r; expected 'fourier' or 'tile'" % (base,))

    for x in range(epoch):
        s = cartpole.d_zero()

        # Choose a from s using a softmax policy derived from q.
        first_q = pe.softmax(pe.qw(w, s, actions, base, baseparams), actions,
                             eps)
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        # Continue while the state is inside the track/angle limits and the
        # step cap has not been reached.
        while (np.abs(s[0]) < cartpole.edge
               and np.abs(s[1]) < cartpole.fail_angle and count < 1010):
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)

            # Choose a' from s' using the same softmax policy.
            pi_temp = pe.softmax(pe.qw(w, new_s, actions, base, baseparams),
                                 actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # On-policy TD update; the target r + q(s', a') carries no
            # discount factor.
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    """Run a tabular Monte-Carlo policy update on ``Grid``.

    Each episode is rolled out under the current softmax policy (at most
    1000 steps). Afterwards, for every visited step ``t`` the entry
    ``theta[state_t, action_t]`` is increased by ``lr * gamma**t * G_t``,
    where ``G_t`` is the discounted return from step ``t`` onward.

    Args:
        lr: Step size for the ``theta`` update.
        eps: Callable taking the episode index; its result is forwarded to
            ``estimation.softmax`` as the second argument.
        epoch: Number of training episodes.
        searchbound: ``step_bound`` handed to each evaluation ``GridEpisode``.

    Returns:
        Array of length ``epoch`` with the value returned by
        ``GridEpisode.run_all_steps()`` after each training episode.
    """
    estimated_rewards = np.zeros(epoch)

    # theta is a tabular representation of the policy.
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action

    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))

        # Roll out until the terminal state [5, 5] or the step cap.
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1

        gamma = grid.gamma

        # Discounted returns in one backward pass: G_t = r_t + gamma * G_{t+1}.
        # This replaces the former O(T^2) nested summation; results agree up
        # to floating-point rounding.
        returns = [0.0] * len(hist_r)
        g = 0
        for t in reversed(range(len(hist_r))):
            g = hist_r[t] + gamma * g
            returns[t] = g

        # decay tracks gamma**t for the step being updated.
        decay = 1
        for state, action, g in zip(hist_s, hist_a, returns):
            theta[grid.get_index(state),
                  actions.index(action)] += lr * decay * g
            decay *= gamma

        # Evaluate the updated policy.
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
Exemple #4
0
    def run_with_w_softmax(self, w, eps, base, baseparams):
        """Roll out one mountain-car episode under a softmax policy.

        Args:
            w: Weights forwarded to ``fa.qw`` to obtain action values.
            eps: Value forwarded to ``estimation.softmax`` as its second
                argument.
            base: Basis name forwarded to ``fa.qw``.
            baseparams: Basis settings forwarded to ``fa.qw``.

        Returns:
            Sum of the rewards of all steps that did not reach the right
            bound. The episode stops at the right bound or after 1000 steps.
        """
        reward = 0
        s = self.mountaincar.d_zero()

        count = 0

        while self.active == 1:
            q = fa.qw(w, s, self.mountaincar.actions, base, baseparams)
            pi = estimation.softmax(q, eps)
            a = np.random.choice(self.mountaincar.actions, 1, p=pi)[0]
            s, r = self.mountaincar.P_and_R(s, a)
            count += 1
            # Use >= rather than == so the goal is detected even if the
            # position is not exactly equal to the bound (float comparison).
            if s[0] >= self.mountaincar.right_bound:
                self.active = 0
            else:
                reward += r
            # Hard cap on episode length.
            if count >= 1e3:
                self.active = 0

        return reward
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    """Run a tabular one-step actor-critic on ``Grid``.

    At every step the TD error ``delta = r + gamma * v[s'] - v[s]`` is
    computed, then both ``v[s]`` and ``theta[s, a]`` are increased by
    ``lr * delta``.

    Args:
        lr: Step size shared by the critic and actor updates.
        eps: Callable taking the episode index; its result is forwarded to
            ``estimation.softmax`` as the second argument.
        epoch: Number of training episodes.
        searchbound: ``step_bound`` handed to each evaluation ``GridEpisode``.

    Returns:
        Array of length ``epoch`` with the value returned by
        ``GridEpisode.run_all_steps()`` after each training episode.
    """
    estimated_rewards = np.zeros(epoch)

    # Tabular state-value estimate (critic).
    v = np.zeros(23)
    # theta is a tabular representation of the policy (actor).
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action

    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        count = 0
        # Step until the terminal state [5, 5] or the 1000-step cap.
        while s != [5, 5] and count < 1000:
            # theta changes every step, so the policy is refreshed each time.
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)

            # One-step TD error (no eligibility traces are kept).
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[
                grid.get_index(s)]
            # Critic update.
            v[grid.get_index(s)] += lr * delta
            # Actor update on the entry of the action just taken.
            theta[grid.get_index(s), actions.index(a)] += lr * delta

            s = new_s
            count += 1

        # Evaluate the updated policy.
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        # Report the final episode regardless of the value of `epoch`
        # (this used to be hard-coded to episode 99).
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])

    return estimated_rewards
    def run_with_w_softmax(self, w, eps, base, baseparams):
        """Roll out one cart-pole episode under a softmax policy.

        The score counts one per step taken (the reward returned by the
        environment is not used) and the final count is reduced by one.
        The episode ends when ``step_count`` exceeds ``maxturn``, the cart
        leaves the track, or the pole exceeds the failure angle.

        Args:
            w: Weights forwarded to ``pe.qw`` to obtain action values.
            eps: Value forwarded to ``pe.softmax`` as its third argument.
            base: Basis name forwarded to ``pe.qw``.
            baseparams: Basis settings forwarded to ``pe.qw``.

        Returns:
            Number of steps taken minus one.
        """
        env = self.cartpole
        steps_survived = 0
        state = env.d_zero()

        while self.active == 1:
            action_values = pe.qw(w, state, env.actions, base, baseparams)
            probs = pe.softmax(action_values, env.actions, eps)
            action = np.random.choice(env.actions, 1, p=probs)[0]
            state, _ = env.P_and_R(state, action)

            steps_survived += 1
            self.step_count += 1

            out_of_turns = self.step_count > self.maxturn
            off_track = np.abs(state[0]) > env.edge
            pole_fell = np.abs(state[1]) > env.fail_angle
            if out_of_turns or off_track or pole_fell:
                self.active = 0

        return steps_survived - 1
def reinforce_mc(alpha, beta, l, baseparams, eps, epoch=100, base='fourier'):
    """Train a policy on ``MountainCar`` with a value baseline.

    Each episode is rolled out (at most 1000 steps) under a softmax over
    ``fa.qw(theta, ...)``. Afterwards the policy weights ``theta`` are moved
    by ``beta`` times the accumulated ``(G_t - v(s_t)) * dsoftmax(...)``
    terms, where ``G_t`` is the undiscounted sum of rewards from step ``t``.
    The baseline weights ``w`` are updated step by step from the TD error
    using a trace decayed by ``l``.

    Args:
        alpha: Step size for the baseline weights ``w``.
        beta: Step size for the policy weights ``theta``.
        l: Decay factor applied to the trace ``e`` at every step.
        baseparams: Dict of basis settings; ``'order'`` is read here and the
            dict is forwarded unchanged to the ``fa`` helpers.
        eps: Callable taking the episode index; its result is forwarded to
            ``estimation.softmax`` and to the evaluation episode.
        epoch: Number of training episodes.
        base: Basis name; only ``'fourier'`` is supported.

    Returns:
        Array of length ``epoch`` with the value returned by
        ``MountainCarEpisode.run_with_w_softmax`` after each episode.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    if base != 'fourier':
        # Previously `w` was never bound for other bases and the function
        # failed with an unrelated error on first use.
        raise ValueError("unsupported base %r; expected 'fourier'" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    order = baseparams['order']
    # Only the length of a start state is needed to size the weights.
    s = mc.d_zero()
    num_features = (order + 1)**len(s)
    theta = np.zeros((1, len(actions) * num_features))
    w = np.zeros((1, num_features))

    for x in range(epoch):
        s = mc.d_zero()
        e = np.zeros(w.shape)

        hist_s = []
        hist_a = []
        hist_r = []
        hist_pi = []

        count = 0
        dj = np.zeros(theta.shape)

        # Roll out until the right bound is reached or the step cap is hit.
        while s[0] < mc.right_bound and count < 1000:
            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            new_s, r = mc.P_and_R(s, a)

            hist_a.append(a)
            hist_s.append(s)
            hist_r.append(r)
            hist_pi.append(pi_temp)

            s = new_s
            count += 1

        # Undiscounted return-to-go for every step in one backward pass
        # (replaces the former O(T^2) nested summation).
        returns = [0] * len(hist_r)
        g = 0
        for t in reversed(range(len(hist_r))):
            g += hist_r[t]
            returns[t] = g

        last = len(hist_s) - 1
        for i in range(len(hist_a)):
            v, dv = fa.vw(w, hist_s[i], base, baseparams)
            dj += (returns[i] - v) * dsoftmax(hist_s[i], hist_a[i], order,
                                              actions, hist_pi[i])
            e = l * e + dv

            # The last recorded step bootstraps from zero; `w` has not
            # changed since `v` was computed, so `v` is reused for the
            # current state's value.
            if i == last:
                next_v = 0
            else:
                next_v = fa.vw(w, hist_s[i + 1], base, baseparams)[0]
            delta = hist_r[i] + next_v - v

            w += alpha * delta * e
        theta += beta * dj

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
def actor_critic_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    """Train an actor-critic agent on ``MountainCar`` with a Fourier basis.

    The critic ``w`` is updated from the TD error using a trace decayed by
    ``l * gamma``; the actor ``theta`` is updated with the same TD error
    times the gradient of the log of the softmax policy for the action taken.

    Args:
        lr: Step size shared by the critic and actor updates.
        l: Trace decay factor (multiplied by ``mc.gamma`` each step).
        baseparams: Dict of basis settings; ``'order'`` is read here and the
            dict is forwarded unchanged to the ``fa`` helpers.
        eps: Callable taking the episode index; its result is forwarded to
            ``estimation.softmax`` and to the evaluation episode.
        epoch: Number of training episodes.
        base: Basis name; only ``'fourier'`` is supported.

    Returns:
        Array of length ``epoch`` with the value returned by
        ``MountainCarEpisode.run_with_w_softmax`` after each episode.

    Raises:
        ValueError: If ``base`` is not ``'fourier'``.
    """
    if base != 'fourier':
        # Previously `w` and `theta` stayed None and the first use failed
        # with an unrelated attribute error.
        raise ValueError("unsupported base %r; expected 'fourier'" % (base,))

    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions

    order = baseparams['order']
    # Only the length of a start state is needed to size the weights.
    s = mc.d_zero()
    num_features = (order + 1)**len(s)
    w = np.zeros((1, num_features))
    theta = np.zeros((1, len(actions) * num_features))

    for x in range(epoch):
        s = mc.d_zero()
        # Critic trace, reset at the start of every episode.
        e = np.zeros(w.shape)

        count = 0

        # Step until the right bound is reached or the step cap is hit.
        while s[0] < mc.right_bound and count < 1000:

            pi_temp = estimation.softmax(
                fa.qw(theta, s, actions, base, baseparams), eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Gradient of ln pi(s, a) w.r.t. theta, filled one action-sized
            # slice at a time: (1 - pi) * phi for the chosen action and
            # -pi * phi for every other action. phi depends only on the
            # state, so it is computed once per step.
            phi = fa.fourier_phi_mc(s, order).T
            width = phi.shape[1]
            dtheta = np.zeros(theta.shape)
            for idx in range(len(actions)):
                if actions[idx] == a:
                    dtheta[:, idx * width:(idx + 1) * width] = (
                        1 - pi_temp[idx]) * phi
                else:
                    dtheta[:, idx * width:(idx + 1) * width] = (
                        -pi_temp[idx] * phi)

            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)

            v, dv = fa.vw(w, s, base, baseparams)
            # Do not bootstrap from a terminal state. The test uses >= so it
            # matches the loop condition above, which treats a position
            # exactly on the right bound as terminal.
            if new_s[0] >= mc.right_bound:
                new_v = 0
            else:
                new_v = fa.vw(w, new_s, base, baseparams)[0]

            # Critic: decay the trace, add the current gradient, then step
            # along the trace scaled by the TD error.
            e = l * mc.gamma * e
            e += dv
            delta = r + mc.gamma * new_v - v
            w += lr * delta * e

            # Actor: step along the log-policy gradient scaled by the TD
            # error.
            theta += lr * delta * dtheta

            s = new_s
            count += 1

        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w_softmax(theta, eps(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards