def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings * (tiles_per_tiling ** len(s))))
    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order ** len(s)))
    for x in range(epoch):
        s = cartpole.d_zero()
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, eps)
        a = np.random.choice(actions, 1, p=pi_s)[0]
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = pe.epsilon_greedy(pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: w += alpha * (r + q(s',a') - q(s,a)) * dq/dw, with gamma = 1.
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1
        # Evaluate the epsilon-greedy policy induced by the current w.
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
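
# Usage sketch (hypothetical hyperparameters, assuming this module's imports):
#
#   rewards = sarsa_cartpole(lr=1e-3, baseparams={'order': 3}, epoch=100,
#                            eps=5e-2, base='fourier')
#
# With base='tile', baseparams instead carries 'num_tilings' and
# 'tiles_per_tiling'; with base='rbf', it again carries 'order'.
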
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    q = np.zeros((23, 4))
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    for x in range(epoch):
        s = grid.d_zero()
        while s != [5, 5]:
            # Choose a from s using a policy derived from q.
            pi_temp = pe.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Q-learning update: bootstrap from the greedy action in s'.
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)])
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
        # Evaluate the epsilon-greedy policy induced by the current q.
        grid.pi_params = pe.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards
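
# Usage sketch (hypothetical schedule): eps is called with the episode index,
# so a decaying exploration rate can be passed as a lambda:
#
#   rewards = qlearning_grid(lr=0.1, eps=lambda x: 0.5 * 0.95 ** x,
#                            epoch=100, searchbound=400)
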
def sarsa_lambda_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    for x in range(epoch):
        s = mc.d_zero()
        # e <- 0
        e = np.zeros(w.shape)
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        count = 0
        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # q(terminal, .) = 0 by definition.
            if new_s == [0.5, 0]:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            # e <- gamma * lambda * e + dq(s,a)/dw, with gamma = 1 here.
            e = l * 1 * e + dqdw
            # delta <- r + gamma * q(s',a') - q(s,a)
            delta = r + 1 * new_q - q
            # w <- w + alpha * delta * e
            w += lr * delta * e
            s = new_s
            a = new_a
            count += 1
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
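
# Usage sketch (hypothetical hyperparameters): l is the trace-decay rate
# (the lambda in SARSA(lambda)) and eps an episode-indexed schedule:
#
#   rewards = sarsa_lambda_mc(lr=1e-3, l=0.9, baseparams={'order': 3},
#                             eps=lambda x: max(0.01, 0.5 * 0.9 ** x),
#                             epoch=100, base='fourier')
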
def sarsa_mountaincar(lr, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    for x in range(epoch):
        s = mc.d_zero()
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        count = 0
        # Cap the episode length so a poor early policy cannot stall training
        # (matches the 1e3 bound used by sarsa_lambda_mc and MountainCarEpisode).
        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: w += alpha * (r + q(s',a') - q(s,a)) * dq/dw, with gamma = 1.
            new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
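
# Usage sketch (hypothetical hyperparameters). Note that with base='tile' the
# weight vector here is sized by num_tilings alone, unlike sarsa_cartpole:
#
#   rewards = sarsa_mountaincar(lr=1e-3,
#                               baseparams={'num_tilings': 8, 'tiles_per_tiling': 8},
#                               eps=lambda x: 0.05, epoch=100, base='tile')
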
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    # Initialize tabular q arbitrarily.
    q = np.zeros((23, 4))
    # For each episode:
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        # e <- 0
        e = np.zeros((23, 4))
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        # For each time step, until s is the terminal absorbing state:
        while s != [5, 5]:
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # e <- gamma * lambda * e, then accumulate on the visited pair.
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # delta <- r + gamma * q(s',a') - q(s,a)
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] \
                - q[grid.get_index(s), actions.index(a)]
            # q <- q + alpha * delta * e
            q += lr * delta * e
            s = new_s
            a = new_a
        # Use the q function to estimate the episode reward.
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards
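
# One concrete step of the trace update above (hypothetical numbers, with
# gamma = 0.9 and l = 0.8): an entry e[s, a] = 0.5 first decays to
# 0.8 * 0.9 * 0.5 = 0.36; the pair just visited then gets e[s, a] += 1 -> 1.36;
# finally every entry of q moves by lr * delta * e, so the TD error delta is
# credited to the whole recently visited trajectory, not just the last pair.
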
def run_with_w(self, w, eps, base, baseparams):
    # MountainCarEpisode method: roll out one episode under the epsilon-greedy
    # policy induced by the weight vector w and return the accumulated reward.
    reward = 0
    s = self.mountaincar.d_zero()
    count = 0
    while self.active == 1:
        q = fa.qw(w, s, self.mountaincar.actions, base, baseparams)
        pi = estimation.epsilon_greedy(q, self.mountaincar.actions, eps)
        a = np.random.choice(self.mountaincar.actions, 1, p=pi)[0]
        s, r = self.mountaincar.P_and_R(s, a)
        count += 1
        if s[0] == self.mountaincar.right_bound:
            self.active = 0
        else:
            reward += r
        # Cap the rollout at 1e3 steps.
        if count >= 1e3:
            self.active = 0
    return reward
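
# Usage sketch (hypothetical values): evaluating a learned weight vector w
# outside the training loop:
#
#   epi = MountainCarEpisode(MountainCar())
#   total = epi.run_with_w(w, eps=0.01, base='fourier', baseparams={'order': 3})
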
def sarsa_grid(lr, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    q = np.zeros((23, 4))
    for x in range(epoch):
        s = grid.d_zero()
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = pe.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        while s != [5, 5]:
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = pe.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: bootstrap from the action actually chosen in s'.
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)]
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
            a = new_a
        # Use the q function to estimate the episode reward.
        grid.pi_params = pe.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards
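
# Usage sketch (hypothetical schedule): the call shape matches qlearning_grid,
# so comparing on-policy SARSA with off-policy Q-learning is a one-line swap:
#
#   sarsa_rewards = sarsa_grid(lr=0.1, eps=lambda x: 0.5 * 0.95 ** x, epoch=100)
#   q_rewards = qlearning_grid(lr=0.1, eps=lambda x: 0.5 * 0.95 ** x, epoch=100)
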
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    for x in range(epoch):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a from s using a policy derived from q.
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)
            # Q-learning update: bootstrap from the greedy action in s', with gamma = 1.
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            count += 1
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
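
# Usage sketch (hypothetical schedule): decaylambda is the exploration
# schedule, feeding both the behavior policy and the softmax evaluation:
#
#   rewards = qlearning_cartpole(lr=1e-3, baseparams={'order': 3},
#                                decaylambda=lambda x: max(0.01, 1.0 / (x + 1)),
#                                epoch=100, base='fourier')
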
def run_with_w(self, w, eps, base, baseparams):
    # CartPoleEpisode method: roll out one episode under the epsilon-greedy
    # policy induced by w; the score is the number of steps the pole stayed up.
    reward = 0
    s = self.cartpole.d_zero()
    while self.active == 1:
        q = pe.qw(w, s, self.cartpole.actions, base, baseparams)
        pi = pe.epsilon_greedy(q, self.cartpole.actions, eps)
        a = np.random.choice(self.cartpole.actions, 1, p=pi)[0]
        s, r = self.cartpole.P_and_R(s, a)
        reward += 1
        self.step_count += 1
        if self.step_count > self.maxturn:
            self.active = 0
        if np.abs(s[0]) > self.cartpole.edge:
            self.active = 0
        if np.abs(s[1]) > self.cartpole.fail_angle:
            self.active = 0
    # Drop the final step, on which the episode terminated.
    return reward - 1
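
# Usage sketch (hypothetical values; eps=0.0 makes the rollout fully greedy):
#
#   epi = CartPoleEpisode(CartPole())
#   steps = epi.run_with_w(w, eps=0.0, base='fourier', baseparams={'order': 3})
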