def test_11(self):
    """1-11-hidden: Checking value iteration with complex environment."""
    sub_or_sol = self.run_with_solution_if_possible(submission, lambda sub_or_sol: sub_or_sol)
    mdp_data = observe_steps(sub_or_sol)
    # Save the initial value function so the submission run below can start
    # from the same point as the reference run.
    v = mdp_data['value']
    sub_or_sol.update_mdp_transition_probs_avg_reward(mdp_data)
    sub_or_sol.update_mdp_value(mdp_data, 0.01, 0.99)
    solution_value = mdp_data['value']
    # Regenerate the observations (observe_steps is presumably seeded so both
    # runs see identical data), restore the initial value function, and run
    # the submission's value iteration on the same inputs.
    mdp_data = observe_steps(sub_or_sol)
    mdp_data['value'] = v
    sub_or_sol.update_mdp_transition_probs_avg_reward(mdp_data)
    submission.update_mdp_value(mdp_data, 0.01, 0.99)
    submission_value = mdp_data['value']
    self.assertTrue((solution_value == submission_value).all())
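# `observe_steps` is used by the tests in this section but not defined here.
# Below is a hypothetical sketch of its assumed contract (the helper name,
# parameters, and defaults are all invented): seed the RNG, roll the cart
# pole forward a fixed number of steps, and record transitions and rewards
# through the module under test, mirroring the loop in main() further down.
def _observe_steps_sketch(module, num_states=163, num_steps=200, seed=0):
    np.random.seed(seed)
    cart_pole = CartPole(Physics())
    mdp_data = module.initialize_mdp_data(num_states)
    state_tuple = (0.0, 0.0, 0.0, 0.0)
    state = cart_pole.get_state(state_tuple)
    for _ in range(num_steps):
        action = module.choose_action(state, mdp_data)
        state_tuple = cart_pole.simulate(action, state_tuple)
        new_state = cart_pole.get_state(state_tuple)
        # Same reward convention as main(): -1 when the pole falls, else 0.
        reward = -1 if new_state == num_states - 1 else 0
        module.update_mdp_transition_counts_sum_reward(
            mdp_data, state, action, new_state, reward)
        if new_state == num_states - 1:
            # Pole fell: restart from a randomized cart position, as in main().
            state_tuple = (-1.1 + np.random.uniform() * 2.2, 0.0, 0.0, 0.0)
            state = cart_pole.get_state(state_tuple)
        else:
            state = new_state
    return mdp_data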
def test_10(self):
    """1-10-basic: Checking simple, two-step value iteration."""
    mdp_data = {}
    mdp_data['transition_probs'] = np.array(
        [[[0., 1., 0.], [0., 1., 0.]],
         [[0., 0., 1.], [0., 0., 1.]],
         [[0., 0., 1.], [0., 0., 1.]]], dtype=np.float64)
    mdp_data['transition_counts'] = np.array(
        [[[0., 1., 0.], [0., 1., 0.]],
         [[0., 0., 1.], [0., 0., 1.]],
         [[0., 0., 1.], [0., 0., 1.]]], dtype=np.float64)
    mdp_data['avg_reward'] = np.array([0., 1., 0.], dtype=np.float64)
    mdp_data['sum_reward'] = np.array([0., 2., 0.], dtype=np.float64)
    mdp_data['num_states'] = 3
    mdp_data['value'] = np.array([0., 0., 0.], dtype=np.float64)
    submission.update_mdp_value(mdp_data, 0.01, 0.99)
    self.assertTrue((mdp_data['value'] ==
                     np.array([.99, 1., 0.], dtype=np.float64)).all())
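# Worked check of the expected fixed point (hand arithmetic, assuming
# synchronous Bellman backups V(s) <- avg_reward(s) + gamma * max_a
# sum_s' P(s'|s,a) V(s')): state 0 always moves to state 1, state 1 to the
# absorbing state 2, and gamma = 0.99. Starting from V = [0, 0, 0]:
#   sweep 1: V = [0 + .99*0, 1 + .99*0, 0 + .99*0] = [0, 1, 0]
#   sweep 2: V = [0 + .99*1, 1 + .99*0, 0 + .99*0] = [.99, 1, 0]
#   sweep 3: nothing changes, so the max update (0) is below the 0.01
#   tolerance and iteration stops.
# Hence the "two-step" docstring and the exact expected value [.99, 1., 0.].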
def test_09(self):
    """1-9-basic: Checking value iteration algorithm returns the correct types and shapes"""
    mdp_data = observe_steps(submission)
    submission.update_mdp_transition_probs_avg_reward(mdp_data)
    submission.update_mdp_value(mdp_data, 0.01, 0.99)
    self.check_valid_mdp_data(mdp_data, 4)
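# The tests above exercise `update_mdp_value` without showing it. As a point
# of reference, here is a minimal value-iteration sketch consistent with the
# data layout in test_10 (transition_probs indexed [s, a, s'], avg_reward
# indexed [s]). This is an assumed reconstruction with an invented name, not
# the reference solution. Note that it rebinds mdp_data['value'] to a fresh
# array rather than mutating it in place, which is what test_11's
# save-and-restore of `v` relies on.
def _update_mdp_value_sketch(mdp_data, tolerance, gamma):
    probs = mdp_data['transition_probs']   # shape (S, A, S)
    reward = mdp_data['avg_reward']        # shape (S,)
    value = mdp_data['value']              # shape (S,)
    num_sweeps = 0
    while True:
        num_sweeps += 1
        # (probs @ value)[s, a] = sum_s' P(s'|s,a) * V(s'); take the greedy
        # backup over actions, then add the per-state average reward.
        new_value = reward + gamma * np.max(probs @ value, axis=1)
        max_change = np.max(np.abs(new_value - value))
        value = new_value
        if max_change < tolerance:
            break
    mdp_data['value'] = value
    # True iff a single sweep already satisfied the tolerance, matching the
    # `converged_in_one_iteration` flag consumed by main() below.
    return num_sweeps == 1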
def main():
    # Simulation parameters
    pause_time = 0.0001
    min_trial_length_to_start_display = 100
    display_started = min_trial_length_to_start_display == 0

    NUM_STATES = 163
    GAMMA = 0.995
    TOLERANCE = 0.01
    NO_LEARNING_THRESHOLD = 500

    # Time cycle of the simulation
    time = 0

    # These variables perform bookkeeping (how many cycles was the pole
    # balanced for before it fell). Useful for plotting learning curves.
    time_steps_to_failure = []
    num_failures = 0
    time_at_start_of_current_trial = 0

    # You should reach convergence well before this
    max_failures = 500

    # Initialize a cart pole
    cart_pole = CartPole(Physics())

    # Starting `state_tuple` is (0, 0, 0, 0)
    # x, x_dot, theta, theta_dot represents the actual continuous state vector
    x, x_dot, theta, theta_dot = 0.0, 0.0, 0.0, 0.0
    state_tuple = (x, x_dot, theta, theta_dot)

    # `state` is the number given to this state; you only need to consider
    # this representation of the state
    state = cart_pole.get_state(state_tuple)
    # if min_trial_length_to_start_display == 0 or display_started == 1:
    #     cart_pole.show_cart(state_tuple, pause_time)

    mdp_data = initialize_mdp_data(NUM_STATES)

    # This is the criterion to end the simulation. You should change it to
    # terminate when the previous 'NO_LEARNING_THRESHOLD' consecutive value
    # function computations all converged within one value function iteration.
    # Intuitively, it seems like there will be little learning after this,
    # so end the simulation here, and say the overall algorithm has converged.
    consecutive_no_learning_trials = 0
    while consecutive_no_learning_trials < NO_LEARNING_THRESHOLD:
        action = choose_action(state, mdp_data)

        # Get the next state by simulating the dynamics
        state_tuple = cart_pole.simulate(action, state_tuple)
        # x, x_dot, theta, theta_dot = state_tuple

        # Increment simulation time
        time = time + 1

        # Get the state number corresponding to new state vector
        new_state = cart_pole.get_state(state_tuple)
        # if display_started == 1:
        #     cart_pole.show_cart(state_tuple, pause_time)

        # reward function to use - do not change this!
        if new_state == NUM_STATES - 1:
            R = -1
        else:
            R = 0

        update_mdp_transition_counts_sum_reward(mdp_data, state, action, new_state, R)

        # Recompute MDP model whenever pole falls
        # Compute the value function V for the new model
        if new_state == NUM_STATES - 1:
            update_mdp_transition_probs_avg_reward(mdp_data)
            converged_in_one_iteration = update_mdp_value(
                mdp_data, TOLERANCE, GAMMA)
            if converged_in_one_iteration:
                consecutive_no_learning_trials = consecutive_no_learning_trials + 1
            else:
                consecutive_no_learning_trials = 0

        # Do NOT change this code: Controls the simulation, and handles the case
        # when the pole fell and the state must be reinitialized.
        if new_state == NUM_STATES - 1:
            num_failures += 1
            if num_failures >= max_failures:
                break
            print('[INFO] Failure number {}'.format(num_failures))
            time_steps_to_failure.append(time - time_at_start_of_current_trial)
            # time_steps_to_failure[num_failures] = time - time_at_start_of_current_trial
            time_at_start_of_current_trial = time

            if time_steps_to_failure[num_failures - 1] > min_trial_length_to_start_display:
                display_started = 1

            # Reinitialize state
            # x = 0.0
            x = -1.1 + np.random.uniform() * 2.2
            x_dot, theta, theta_dot = 0.0, 0.0, 0.0
            state_tuple = (x, x_dot, theta, theta_dot)
            state = cart_pole.get_state(state_tuple)
        else:
            state = new_state
    # plot the learning curve (time balanced vs. trial)
    # log_tstf = np.log(np.array(time_steps_to_failure))
    tstf = np.array(time_steps_to_failure)
    plt.plot(np.arange(len(time_steps_to_failure)), tstf, 'k')

    # Overlay a centered moving average: `w` is a length-`window` box filter,
    # lfilter applies it causally, and the window // 2 offset re-centers it.
    window = 30
    w = np.array([1 / window for _ in range(window)])
    weights = lfilter(w, 1, tstf)
    x = np.arange(window // 2, len(tstf) - window // 2)
    plt.plot(x, weights[window:len(tstf)], 'r--')
    plt.xlabel('Num failures')
    plt.ylabel('Num steps to failure')
    plt.yscale('log')
    plt.ylim(bottom=1, top=2000)
    plt.savefig('./control.pdf')
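# The code above assumes the file's usual imports, which are not shown in
# this section: `import numpy as np`, `import matplotlib.pyplot as plt`, and
# `from scipy.signal import lfilter`, plus the CartPole/Physics classes and
# the MDP helper functions. A conventional entry point (assumed, since none
# appears in this section) so the simulation and plot run when the file is
# executed directly:
if __name__ == '__main__':
    main()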