Python qw Examples

Programming Language: Python

Namespace/Package Name: estimation

Method/Function: qw

Examples at hotexamples.com: 3

Python qw - 3 examples found. These are the top rated real world Python examples of estimation.qw extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros(
            (1, len(actions) * num_tilings * (tiles_per_tiling**len(s))))

    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order**len(s)))

    for x in range(epoch):
        s = cartpole.d_zero()

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        first_q = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                    actions, eps)
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(
                pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards

Example #2

Show file

def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = cartpole.d_zero()

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                        actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards

Example #3

Show file

File: cartpole.py Project: stonezhng/cs687-ReinforcementLearning-Homework

    def run_with_w(self, w, eps, base, baseparams):
        reward = 0
        s = self.cartpole.d_zero()

        while self.active == 1:
            q = pe.qw(w, s, self.cartpole.actions, base, baseparams)
            pi = pe.epsilon_greedy(q, self.cartpole.actions, eps)
            a = np.random.choice(self.cartpole.actions, 1, p=pi)[0]
            s, r = self.cartpole.P_and_R(s, a)
            reward += 1
            self.step_count += 1
            if self.step_count > self.maxturn:
                self.active = 0
            if np.abs(s[0]) > self.cartpole.edge:
                self.active = 0
            if np.abs(s[1]) > self.cartpole.fail_angle:
                self.active = 0

        return reward - 1