def test_11(self):
    """1-11-hidden: Checking value iteration with complex environment."""
    sub_or_sol = self.run_with_solution_if_possible(submission, lambda sub_or_sol: sub_or_sol)
    mdp_data = observe_steps(sub_or_sol)
    # Save the initial value function so the submission run below can start
    # from the same point as the reference run.
    v = mdp_data['value']
    sub_or_sol.update_mdp_transition_probs_avg_reward(mdp_data)
    sub_or_sol.update_mdp_value(mdp_data, 0.01, 0.99)
    solution_value = mdp_data['value']
    # Regenerate the observations (observe_steps is presumably seeded so both
    # runs see identical data), restore the initial value function, and run
    # the submission's value iteration on the same inputs.
    mdp_data = observe_steps(sub_or_sol)
    mdp_data['value'] = v
    sub_or_sol.update_mdp_transition_probs_avg_reward(mdp_data)
    submission.update_mdp_value(mdp_data, 0.01, 0.99)
    submission_value = mdp_data['value']
    self.assertTrue((solution_value == submission_value).all())
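# `observe_steps` is used by the tests in this section but not defined here.
# Below is a hypothetical sketch of its assumed contract (the helper name,
# parameters, and defaults are all invented): seed the RNG, roll the cart
# pole forward a fixed number of steps, and record transitions and rewards
# through the module under test, mirroring the loop in main() further down.
def _observe_steps_sketch(module, num_states=163, num_steps=200, seed=0):
    np.random.seed(seed)
    cart_pole = CartPole(Physics())
    mdp_data = module.initialize_mdp_data(num_states)
    state_tuple = (0.0, 0.0, 0.0, 0.0)
    state = cart_pole.get_state(state_tuple)
    for _ in range(num_steps):
        action = module.choose_action(state, mdp_data)
        state_tuple = cart_pole.simulate(action, state_tuple)
        new_state = cart_pole.get_state(state_tuple)
        # Same reward convention as main(): -1 when the pole falls, else 0.
        reward = -1 if new_state == num_states - 1 else 0
        module.update_mdp_transition_counts_sum_reward(
            mdp_data, state, action, new_state, reward)
        if new_state == num_states - 1:
            # Pole fell: restart from a randomized cart position, as in main().
            state_tuple = (-1.1 + np.random.uniform() * 2.2, 0.0, 0.0, 0.0)
            state = cart_pole.get_state(state_tuple)
        else:
            state = new_state
    return mdp_data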
def test_10(self):
    """1-10-basic: Checking simple, two-step value iteration."""
    mdp_data = {}
    mdp_data['transition_probs'] = np.array(
        [[[0., 1., 0.], [0., 1., 0.]],
         [[0., 0., 1.], [0., 0., 1.]],
         [[0., 0., 1.], [0., 0., 1.]]], dtype=np.float64)
    mdp_data['transition_counts'] = np.array(
        [[[0., 1., 0.], [0., 1., 0.]],
         [[0., 0., 1.], [0., 0., 1.]],
         [[0., 0., 1.], [0., 0., 1.]]], dtype=np.float64)
    mdp_data['avg_reward'] = np.array([0., 1., 0.], dtype=np.float64)
    mdp_data['sum_reward'] = np.array([0., 2., 0.], dtype=np.float64)
    mdp_data['num_states'] = 3
    mdp_data['value'] = np.array([0., 0., 0.], dtype=np.float64)
    submission.update_mdp_value(mdp_data, 0.01, 0.99)
    self.assertTrue((mdp_data['value'] ==
                     np.array([.99, 1., 0.], dtype=np.float64)).all())
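# Worked check of the expected fixed point (hand arithmetic, assuming
# synchronous Bellman backups V(s) <- avg_reward(s) + gamma * max_a
# sum_s' P(s'|s,a) V(s')): state 0 always moves to state 1, state 1 to the
# absorbing state 2, and gamma = 0.99. Starting from V = [0, 0, 0]:
#   sweep 1: V = [0 + .99*0, 1 + .99*0, 0 + .99*0] = [0, 1, 0]
#   sweep 2: V = [0 + .99*1, 1 + .99*0, 0 + .99*0] = [.99, 1, 0]
#   sweep 3: nothing changes, so the max update (0) is below the 0.01
#   tolerance and iteration stops.
# Hence the "two-step" docstring and the exact expected value [.99, 1., 0.].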
def test_09(self):
    """1-9-basic: Checking value iteration algorithm returns the correct types and shapes"""
    mdp_data = observe_steps(submission)
    submission.update_mdp_transition_probs_avg_reward(mdp_data)
    submission.update_mdp_value(mdp_data, 0.01, 0.99)
    self.check_valid_mdp_data(mdp_data, 4)
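# The tests above exercise `update_mdp_value` without showing it. As a point
# of reference, here is a minimal value-iteration sketch consistent with the
# data layout in test_10 (transition_probs indexed [s, a, s'], avg_reward
# indexed [s]). This is an assumed reconstruction with an invented name, not
# the reference solution. Note that it rebinds mdp_data['value'] to a fresh
# array rather than mutating it in place, which is what test_11's
# save-and-restore of `v` relies on.
def _update_mdp_value_sketch(mdp_data, tolerance, gamma):
    probs = mdp_data['transition_probs']   # shape (S, A, S)
    reward = mdp_data['avg_reward']        # shape (S,)
    value = mdp_data['value']              # shape (S,)
    num_sweeps = 0
    while True:
        num_sweeps += 1
        # (probs @ value)[s, a] = sum_s' P(s'|s,a) * V(s'); take the greedy
        # backup over actions, then add the per-state average reward.
        new_value = reward + gamma * np.max(probs @ value, axis=1)
        max_change = np.max(np.abs(new_value - value))
        value = new_value
        if max_change < tolerance:
            break
    mdp_data['value'] = value
    # True iff a single sweep already satisfied the tolerance, matching the
    # `converged_in_one_iteration` flag consumed by main() below.
    return num_sweeps == 1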
def main():
    # Simulation parameters
    pause_time = 0.0001
    min_trial_length_to_start_display = 100
    display_started = min_trial_length_to_start_display == 0

    NUM_STATES = 163
    GAMMA = 0.995
    TOLERANCE = 0.01
    NO_LEARNING_THRESHOLD = 500

    # Time cycle of the simulation
    time = 0

    # These variables perform bookkeeping (how many cycles was the pole
    # balanced for before it fell). Useful for plotting learning curves.
    time_steps_to_failure = []
    num_failures = 0
    time_at_start_of_current_trial = 0

    # You should reach convergence well before this
    max_failures = 500

    # Initialize a cart pole
    cart_pole = CartPole(Physics())

    # Starting `state_tuple` is (0, 0, 0, 0)
    # x, x_dot, theta, theta_dot represents the actual continuous state vector
    x, x_dot, theta, theta_dot = 0.0, 0.0, 0.0, 0.0
    state_tuple = (x, x_dot, theta, theta_dot)

    # `state` is the number given to this state; you only need to consider
    # this representation of the state
    state = cart_pole.get_state(state_tuple)
    # if min_trial_length_to_start_display == 0 or display_started == 1:
    #     cart_pole.show_cart(state_tuple, pause_time)

    mdp_data = initialize_mdp_data(NUM_STATES)

    # This is the criterion to end the simulation. You should change it to
    # terminate when the previous 'NO_LEARNING_THRESHOLD' consecutive value
    # function computations all converged within one value function iteration.
    # Intuitively, it seems like there will be little learning after this,
    # so end the simulation here, and say the overall algorithm has converged.
    consecutive_no_learning_trials = 0
    while consecutive_no_learning_trials < NO_LEARNING_THRESHOLD:
        action = choose_action(state, mdp_data)

        # Get the next state by simulating the dynamics
        state_tuple = cart_pole.simulate(action, state_tuple)
        # x, x_dot, theta, theta_dot = state_tuple

        # Increment simulation time
        time = time + 1

        # Get the state number corresponding to new state vector
        new_state = cart_pole.get_state(state_tuple)
        # if display_started == 1:
        #     cart_pole.show_cart(state_tuple, pause_time)

        # reward function to use - do not change this!
        if new_state == NUM_STATES - 1:
            R = -1
        else:
            R = 0

        update_mdp_transition_counts_sum_reward(mdp_data, state, action, new_state, R)

        # Recompute MDP model whenever pole falls
        # Compute the value function V for the new model
        if new_state == NUM_STATES - 1:
            update_mdp_transition_probs_avg_reward(mdp_data)
            converged_in_one_iteration = update_mdp_value(
                mdp_data, TOLERANCE, GAMMA)
            if converged_in_one_iteration:
                consecutive_no_learning_trials = consecutive_no_learning_trials + 1
            else:
                consecutive_no_learning_trials = 0

        # Do NOT change this code: Controls the simulation, and handles the case
        # when the pole fell and the state must be reinitialized.
        if new_state == NUM_STATES - 1:
            num_failures += 1
            if num_failures >= max_failures:
                break
            print('[INFO] Failure number {}'.format(num_failures))
            time_steps_to_failure.append(time - time_at_start_of_current_trial)
            # time_steps_to_failure[num_failures] = time - time_at_start_of_current_trial
            time_at_start_of_current_trial = time

            if time_steps_to_failure[num_failures - 1] > min_trial_length_to_start_display:
                display_started = 1

            # Reinitialize state
            # x = 0.0
            x = -1.1 + np.random.uniform() * 2.2
            x_dot, theta, theta_dot = 0.0, 0.0, 0.0
            state_tuple = (x, x_dot, theta, theta_dot)
            state = cart_pole.get_state(state_tuple)
        else:
            state = new_state
    # plot the learning curve (time balanced vs. trial)
    # log_tstf = np.log(np.array(time_steps_to_failure))
    tstf = np.array(time_steps_to_failure)
    plt.plot(np.arange(len(time_steps_to_failure)), tstf, 'k')

    # Overlay a centered moving average: `w` is a length-`window` box filter,
    # lfilter applies it causally, and the window // 2 offset re-centers it.
    window = 30
    w = np.array([1 / window for _ in range(window)])
    weights = lfilter(w, 1, tstf)
    x = np.arange(window // 2, len(tstf) - window // 2)
    plt.plot(x, weights[window:len(tstf)], 'r--')
    plt.xlabel('Num failures')
    plt.ylabel('Num steps to failure')
    plt.yscale('log')
    plt.ylim(bottom=1, top=2000)
    plt.savefig('./control.pdf')
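# The code above assumes the file's usual imports, which are not shown in
# this section: `import numpy as np`, `import matplotlib.pyplot as plt`, and
# `from scipy.signal import lfilter`, plus the CartPole/Physics classes and
# the MDP helper functions. A conventional entry point (assumed, since none
# appears in this section) so the simulation and plot run when the file is
# executed directly:
if __name__ == '__main__':
    main()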