def _find_farther_state(self, gamma):
    """Search for the goal state that is hardest to reach from state 0.

    Tries every candidate final state, solves the resulting MDP exactly, and
    keeps the reachable state (optimal performance above gamma**50) with the
    lowest optimal performance, i.e. the longest optimal time to goal.
    """
    argmin = -1
    min_value = 1
    rand_value = 0
    best_q_star = 0
    # Build an all-True mask so the SPIBB solver behaves as unconstrained planning.
    mask_0, thres = spibb.compute_mask(self.nb_states, self.nb_actions, 1, 1, [])
    mask_0 = ~mask_0
    rand_pi = np.ones((self.nb_states, self.nb_actions)) / self.nb_actions
    for final_state in range(1, self.nb_states):
        p, r = self._set_temporary_final_state(final_state)
        r_reshaped = spibb_utils.get_reward_model(p, r)
        # Solve the MDP exactly for this candidate goal state.
        rl = spibb.spibb(gamma, self.nb_states, self.nb_actions, mask_0, mask_0,
                         p, r_reshaped, 'default', 0, None, None, None)
        rl.fit()
        v_star, q_star = spibb.policy_evaluation_exact(
            rl.pi, r_reshaped, p, gamma)
        v_rand, q_rand = spibb.policy_evaluation_exact(
            rand_pi, r_reshaped, p, gamma)
        perf_star = v_star[0]
        perf_rand = v_rand[0]
        # Keep the worst-performing goal that is still reachable within ~50 steps.
        if perf_star < min_value and perf_star > gamma**50:
            min_value = perf_star
            argmin = final_state
            rand_value = perf_rand
            best_q_star = q_star.copy()
    avg_time_to_goal = np.log(min_value) / np.log(gamma)
    avg_time_to_goal_rand = np.log(rand_value) / np.log(gamma)
    print("Optimal performance: " + str(min_value))
    print("Optimal average time to goal: " + str(avg_time_to_goal))
    print("Random policy performance: " + str(rand_value))
    print("Random policy average time to goal: " + str(avg_time_to_goal_rand))
    return argmin, min_value, best_q_star, rand_value
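# Illustrative sketch (not part of the original file): the "average time to goal"
# printed above inverts v = gamma**t. Assuming the only reward is 1 upon reaching
# the goal, a policy that needs t steps has return gamma**t, so t = log(v) / log(gamma).
import numpy as np

gamma = 0.95
perf = gamma ** 20                    # performance of a policy reaching the goal in 20 steps
steps = np.log(perf) / np.log(gamma)  # recovers 20.0 (up to floating-point error)
print(steps)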
# Pre-compute the true reward function as a function of (state, action):
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

# Compute the baseline policy's performance:
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))

# Create a mask that is always True, for classical RL and the other
# non-policy-based SPIBB algorithms:
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

# Compute the optimal policy and its performance:
pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0,
                      current_proba, r_reshaped, 'default')
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))

# Where the results are saved:
filename = 'results/' + expname + '/results_' + str(index)
results = []
if not os.path.isdir('results'):
    os.mkdir('results')
if not os.path.isdir('results/' + expname):
    os.mkdir('results/' + expname)
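# Minimal sketch (an assumption about the library, not its actual code) of what the
# exact policy evaluation used above computes: with p of shape (S, A, S) and
# r of shape (S, A), it is expected to solve the Bellman linear system
# v = (I - gamma * P_pi)^{-1} r_pi.
import numpy as np

def policy_evaluation_exact_sketch(pi, r, p, gamma):
    # pi: (S, A) stochastic policy; r: (S, A) reward model; p: (S, A, S) transitions.
    r_pi = np.einsum('sa,sa->s', pi, r)      # expected one-step reward under pi
    p_pi = np.einsum('sa,sat->st', pi, p)    # state-to-state transition matrix under pi
    v = np.linalg.solve(np.eye(p.shape[0]) - gamma * p_pi, r_pi)
    q = r + gamma * np.einsum('sat,t->sa', p, v)
    return v, q

# Usage mirroring the calls above (hypothetical):
# v, q = policy_evaluation_exact_sketch(pi_b, r_reshaped, current_proba, gamma)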
# Hyperparameter grid swept by the experiment:
N_wedges = [5, 7, 10, 15, 20]
delta = 1
epsilons = [0.1, 0.2, 0.5, 1, 2, 5]
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000]
ratios = [0.1, 0.9]

# Seed the run with the job index so repeated runs are reproducible:
seed = index
np.random.seed(seed)

# MDP size and discount factor:
gamma = 0.95
nb_states = 50
nb_actions = 4
nb_next_state_transition = 4

# All-True mask for the unconstrained (classical RL) solvers, plus the uniform policy:
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0
rand_pi = np.ones((nb_states, nb_actions)) / nb_actions

# Where the results are saved:
filename = 'results/' + expname + '/results_' + str(index)
results = []
if not os.path.isdir('results'):
    os.mkdir('results')
if not os.path.isdir('results/' + expname):
    os.mkdir('results/' + expname)

while True:
    for ratio in ratios:
        garnet = garnets.Garnets(nb_states, nb_actions, nb_next_state_transition,
                                 self_transitions=0)