def _find_farther_state(self, gamma):
    # Search over candidate goal states for the one that is hardest to reach
    # from the start state, i.e. whose optimal value at state 0 is smallest,
    # while still being reachable in roughly 50 steps (perf_star > gamma**50).
    argmin = -1
    min_value = 1
    rand_value = 0
    best_q_star = 0
    mask_0, thres = spibb.compute_mask(self.nb_states, self.nb_actions, 1, 1, [])
    mask_0 = ~mask_0
    rand_pi = np.ones((self.nb_states, self.nb_actions)) / self.nb_actions
    for final_state in range(1, self.nb_states):
        p, r = self._set_temporary_final_state(final_state)
        r_reshaped = spibb_utils.get_reward_model(p, r)
        # Optimal policy for this candidate goal:
        rl = spibb.spibb(gamma, self.nb_states, self.nb_actions, mask_0, mask_0,
                         p, r_reshaped, 'default', 0, None, None, None)
        rl.fit()
        v_star, q_star = spibb.policy_evaluation_exact(rl.pi, r_reshaped, p, gamma)
        v_rand, q_rand = spibb.policy_evaluation_exact(rand_pi, r_reshaped, p, gamma)
        perf_star = v_star[0]
        perf_rand = v_rand[0]
        if perf_star < min_value and perf_star > gamma ** 50:
            min_value = perf_star
            argmin = final_state
            rand_value = perf_rand
            best_q_star = q_star.copy()
    avg_time_to_goal = np.log(min_value) / np.log(gamma)
    avg_time_to_goal_rand = np.log(rand_value) / np.log(gamma)
    print("Optimal performance : " + str(min_value))
    print("Optimal average time to goal: " + str(avg_time_to_goal))
    print("Random policy performance : " + str(rand_value))
    print("Random policy average time to goal: " + str(avg_time_to_goal_rand))
    return argmin, min_value, best_q_star, rand_value
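# Why log(v) / log(gamma) measures an "average time to goal" in
# _find_farther_state: the only reward is a single 1 collected at the goal, so
# a run that reaches it in exactly T steps has discounted return gamma**T, and
# inverting v = gamma**T gives T = log(v) / log(gamma). The perf_star >
# gamma**50 filter therefore discards goals farther than ~50 steps on average.
# A minimal standalone illustration (not used by the class):
def _time_to_goal_illustration(gamma=0.95, T=30):
    v = gamma ** T                       # return when the goal is hit after T steps
    return np.log(v) / np.log(gamma)     # recovers T (up to float error)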
def _generate_softmax_policy(self, q_star, p, r_reshaped, softmax_target_perf,
                             reduction_factor, gamma):
    temp = 2 * 10 ** 6  # Actually starts exploring for half its value.
    v = np.ones(1)
    # Anneal the temperature down until performance falls below the target:
    while v[0] > softmax_target_perf:
        temp *= reduction_factor
        pi = spibb.softmax(q_star, temp)
        v, q = spibb.policy_evaluation_exact(pi, r_reshaped, p, gamma)
    avg_time_to_goal = np.log(v[0]) / np.log(gamma)
    print("Softmax performance : " + str(v[0]))
    print("Softmax temperature : " + str(temp))
    print("Softmax average time to goal: " + str(avg_time_to_goal))
    return pi, v, q
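# The annealing loop above starts at a very large `temp` and shrinks it, so
# `temp` is assumed to act as an inverse temperature in spibb.softmax: large
# values give a near-greedy (near-optimal) policy, small values approach the
# uniform policy. A minimal sketch under that assumption:
def _softmax_sketch(q, temp):
    # Subtract the row-wise max before exponentiating for numerical stability.
    e = np.exp(temp * (q - q.max(axis=1, keepdims=True)))
    return e / e.sum(axis=1, keepdims=True)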
def _perturb_policy(self, pi, q_star, p, r_reshaped, baseline_target_perf,
                    reduction_factor, gamma):
    v = np.ones(1)
    # Repeatedly pick a random state, shrink the probability of its greedy
    # action, and renormalize, until performance falls below the target:
    while v[0] > baseline_target_perf:
        x = np.random.randint(self.nb_states)
        pi[x, np.argmax(q_star[x, :])] *= reduction_factor
        pi[x, :] /= np.sum(pi[x, :])
        v, q = spibb.policy_evaluation_exact(pi, r_reshaped, p, gamma)
    avg_time_to_goal = np.log(v[0]) / np.log(gamma)
    print("Perturbed policy performance : " + str(v[0]))
    print("Perturbed policy average time to goal: " + str(avg_time_to_goal))
    return pi, v, q
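# How the three helpers above are meant to chain together (a hypothetical
# usage sketch; the method name, default ratios, and reduction factor are
# assumptions, not part of this file): pick the hardest-to-reach goal, soften
# the optimal policy by softmax annealing down to an intermediate target, then
# randomly perturb it down to the baseline target.
def _generate_baseline_policy_sketch(self, gamma, softmax_target_perf_ratio=0.75,
                                     baseline_target_perf_ratio=0.5,
                                     reduction_factor=0.9):
    final_state, perf_star, q_star, perf_rand = self._find_farther_state(gamma)
    p, r = self._set_temporary_final_state(final_state)
    r_reshaped = spibb_utils.get_reward_model(p, r)
    # Targets interpolate between random and optimal performance:
    softmax_target = perf_rand + softmax_target_perf_ratio * (perf_star - perf_rand)
    baseline_target = perf_rand + baseline_target_perf_ratio * (perf_star - perf_rand)
    pi, v, q = self._generate_softmax_policy(q_star, p, r_reshaped,
                                             softmax_target, reduction_factor, gamma)
    return self._perturb_policy(pi, q_star, p, r_reshaped,
                                baseline_target, reduction_factor, gamma)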
# The batch sizes:
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
N_wedges = [5, 7, 10, 15, 20, 30, 50, 70, 100]

v = np.zeros(nb_states)

# Pre-compute the true reward function as a function of (state, action):
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

# Compute the baseline policy performance:
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))

# Create a mask that is always True, for classical RL and the other
# non-policy-based SPIBB algorithms:
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

# Compute the optimal policy and its performance:
pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0,
                      current_proba, r_reshaped, 'default')
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped,
                                             current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))

# Place to save the results:
filename = 'results/' + expname + '/results_' + str(index)
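# spibb.policy_evaluation_exact is used throughout this script; its assumed
# semantics is exact policy evaluation, solving the linear Bellman system
# (I - gamma * P_pi) v = r_pi and then q = r + gamma * P v. A minimal sketch:
def _policy_evaluation_exact_sketch(pi, r, p, gamma):
    # pi: (S, A) policy, r: (S, A) reward model, p: (S, A, S) transitions.
    r_pi = np.einsum('sa,sa->s', pi, r)      # expected immediate reward under pi
    p_pi = np.einsum('sa,sat->st', pi, p)    # state-to-state transitions under pi
    v = np.linalg.solve(np.eye(r_pi.shape[0]) - gamma * p_pi, r_pi)
    q = r + gamma * p @ v                    # (S, A) action-value function
    return v, q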
for nb_trajectories in nb_trajectories_list:
    # Generate trajectories, stored both as trajectories and as (s, a, s', r)
    # transition samples:
    trajectories, batch_traj = spibb_utils.generate_batch(nb_trajectories, garnet, pi_b)
    spibb_utils.prt("GENERATED A DATASET OF " + str(nb_trajectories) + " TRAJECTORIES")

    # Compute the maximum-likelihood model for transitions and rewards.
    # NB: the true reward function can be used for ease of implementation since
    # it is not stochastic in our environment. One should compute it from the
    # samples when it is stochastic.
    model = modelTransitions.ModelTransitions(batch_traj, nb_states, nb_actions)
    reward_model = spibb_utils.get_reward_model(model.transitions, reward_current)

    policy_error = np.sum(abs(pi_b - model.policy), 1)
    # print("policy l1 error:", policy_error)
    print("policy divergence. mean: %05.4f; std: %05.4f"
          % (np.mean(policy_error), np.std(policy_error)))
    perf_pi_hat = spibb.policy_evaluation_exact(model.policy, r_reshaped,
                                                current_proba, gamma)[0][0]
    print("perf pi_hat: " + str(perf_pi_hat))

    # Estimate the values of the baseline policy with a Monte Carlo estimation
    # from the batch data:
    # q_pib_est = spibb_utils.compute_q_pib_est(gamma, nb_states, nb_actions, trajectories)

    # Compute the RL policy:
    rl = spibb.spibb(gamma, nb_states, nb_actions, pi_b, mask_0,
                     model.transitions, reward_model, 'default')
    rl.fit()
    # Evaluate the RL policy performance:
    perfrl = spibb.policy_evaluation_exact(rl.pi, r_reshaped, current_proba, gamma)[0][0]
    print("perf RL: " + str(perfrl))

    # Compute the Reward-Adjusted MDP (RaMDP) RL policy:
    count_state_action = 0.00001 * np.ones((nb_states, nb_actions))
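    # The RaMDP computation presumably continues by counting state-action
    # visits in the batch and penalizing the estimated reward by
    # kappa / sqrt(count), as in Petrik et al. (2016); a sketch under that
    # assumption (the value of kappa and the batch_traj layout are guesses):
    kappa = 0.003
    for action, state, next_state, reward in batch_traj:
        count_state_action[state, action] += 1
    reward_model_ramdp = reward_model - kappa / np.sqrt(count_state_action)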
                           width=width, max_turbulence=max_turbulence, max_velocity=max_velocity)
P = wet_chicken.get_transition_function()
R = wet_chicken.get_reward_function()
r_reshaped = spibb_utils.get_reward_model(P, R)

# Evaluate a large sample of deterministic policies to get their performance
# distribution (a sketch of sample_deterministic_policy is given further below):
nb_samples = 1000000
list_unbiased_wet_chicken = []
for i in range(nb_samples):
    if i % 100000 == 0:
        print(f'{i} out of {nb_samples} done.')
    pi_sample = sample_deterministic_policy(nb_states, nb_actions, bias=False)
    list_unbiased_wet_chicken.append(
        spibb.policy_evaluation_exact(pi_sample, r_reshaped, P, gamma)[0][0])

# Plot the empirical CDF of the sampled performances:
df_unbiased_wet_chicken = pd.DataFrame(data=list_unbiased_wet_chicken,
                                       columns=['Performance'])
sns.set(font_scale=2)
g = sns.FacetGrid(data=df_unbiased_wet_chicken)
g.map(sns.ecdfplot, 'Performance')
g.fig.suptitle(
    'ECDF of the performance of sampled deterministic policies on wet chicken')
plt.subplots_adjust(top=0.92, right=0.9, left=0.05, bottom=0.19)

### For RandomMDPs
gamma = 0.95
nb_states = 50
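# `sample_deterministic_policy`, called in the wet-chicken loop above, is not
# defined in this excerpt; a minimal sketch of the unbiased (bias=False) case
# (the signature comes from the call site, the body is an assumption):
def _sample_deterministic_policy_sketch(nb_states, nb_actions, bias=False):
    pi = np.zeros((nb_states, nb_actions))
    # Pick one action uniformly at random in every state.
    pi[np.arange(nb_states), np.random.randint(nb_actions, size=nb_states)] = 1
    return pi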
    easter_egg = np.random.choice(potential_final_states)
    # Or pick the one with the fewest transitions:
    # current_proba_sum = current_proba.reshape(-1, current_proba.shape[-1]).sum(axis=0)
    # mask_easter = np.ma.array(current_proba_sum, mask=False)
    # mask_easter.mask[garnet.final_state] = True
    # easter_egg = np.argmin(mask_easter)
    assert garnet.final_state != easter_egg
    # Reaching the easter egg yields reward 1; it is made terminal by zeroing
    # its outgoing transitions:
    reward_current[:, easter_egg] = 1
    current_proba[easter_egg, :, :] = 0
    r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

    # Compute the optimal policy in this new environment:
    true_rl = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0,
                          current_proba, r_reshaped, 'default')
    true_rl.fit()
    pi_star_perf = spibb.policy_evaluation_exact(true_rl.pi, r_reshaped,
                                                 current_proba, gamma)[0][0]
    print("Optimal perf in easter egg environment:\t\t\t" + str(pi_star_perf))
    pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped,
                                              current_proba, gamma)[0][0]
    print("Baseline perf in easter egg environment:\t\t\t" + str(pi_b_perf))
else:
    easter_egg = None
    r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

for nb_trajectories in nb_trajectories_list:
    # Generate trajectories, stored both as trajectories and as (s, a, s', r)
    # transition samples:
    trajectories, batch_traj = spibb_utils.generate_batch(