def generate_baseline_policy(self, gamma,
                             softmax_target_perf_ratio=0.75,
                             baseline_target_perf_ratio=0.5,
                             softmax_reduction_factor=0.9,
                             perturbation_reduction_factor=0.9,
                             log=True):  # `log` is passed by the experiment scripts (e.g. log=False below)
    # The intermediate softmax target must be at least as good as the final baseline target.
    if softmax_target_perf_ratio < baseline_target_perf_ratio:
        softmax_target_perf_ratio = baseline_target_perf_ratio

    # Select the hardest-to-reach goal state and install it as the temporary final state.
    farther_state, pi_star_perf, q_star, pi_rand_perf = self._find_farther_state(gamma)
    p, r = self._set_temporary_final_state(farther_state)
    self.transition_function = p.copy()
    r_reshaped = spibb_utils.get_reward_model(p, r)

    # First soften the optimal policy with a softmax until it drops to the softmax target...
    softmax_target_perf = softmax_target_perf_ratio * (pi_star_perf - pi_rand_perf) \
                          + pi_rand_perf
    pi, _, _ = self._generate_softmax_policy(q_star, p, r_reshaped,
                                             softmax_target_perf,
                                             softmax_reduction_factor, gamma)

    # ...then perturb it further down to the baseline target performance.
    baseline_target_perf = baseline_target_perf_ratio * (pi_star_perf - pi_rand_perf) \
                           + pi_rand_perf
    pi, v, q = self._perturb_policy(pi, q_star, p, r_reshaped,
                                    baseline_target_perf,
                                    perturbation_reduction_factor, gamma)

    return pi, q, pi_star_perf, v[0], pi_rand_perf
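# Minimal usage sketch (not part of the original file), assuming the repo's `garnets`
# module is importable; the state/action counts below are placeholder values. The two
# ratios place the baseline roughly halfway between the random and optimal performances.
import garnets

example_garnet = garnets.Garnets(50, 4, 4, self_transitions=0)
pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = example_garnet.generate_baseline_policy(
    0.95, softmax_target_perf_ratio=0.75, baseline_target_perf_ratio=0.5)
print(pi_rand_perf, pi_b_perf, pi_star_perf)  # expected ordering: random <= baseline <= optimal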
def estimate_q(self, trajectories):
    # Flatten the list of trajectories into a single batch of (action, state, next_state, reward) samples.
    batch = []
    for trajectory in trajectories:
        for [action, state, next_state, reward] in trajectory:
            batch.append([action, state, next_state, reward])
    # Build the maximum-likelihood transition model from the batch and evaluate the target policy on it.
    model = ModelTransitions(batch, self.pi_b.shape[0], self.pi_b.shape[1])
    reward_model = spibb_utils.get_reward_model(model.transitions, self.R)
    return policy_evaluation_modified(self.gamma, self.pi_t, reward_model, model.transitions)
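# Hedged illustration (not from the original file) of the trajectory layout estimate_q
# consumes: a list of trajectories, each a list of [action, state, next_state, reward]
# transitions; `agent` stands in for whichever object carries this method.
example_trajectories = [
    [[0, 3, 4, 0.0], [1, 4, 7, 1.0]],   # first trajectory: two transitions
    [[2, 3, 3, 0.0]],                    # second trajectory: a single transition
]
# q_estimate = agent.estimate_q(example_trajectories)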
def _find_farther_state(self, gamma):
    argmin = -1
    min_value = 1
    rand_value = 0
    best_q_star = 0
    # Mask that is always True, so spibb.spibb behaves like unconstrained dynamic programming.
    mask_0, thres = spibb.compute_mask(self.nb_states, self.nb_actions, 1, 1, [])
    mask_0 = ~mask_0
    rand_pi = np.ones((self.nb_states, self.nb_actions)) / self.nb_actions
    for final_state in range(1, self.nb_states):
        p, r = self._set_temporary_final_state(final_state)
        r_reshaped = spibb_utils.get_reward_model(p, r)
        # Solve the MDP that has this candidate goal state.
        rl = spibb.spibb(gamma, self.nb_states, self.nb_actions, mask_0, mask_0,
                         p, r_reshaped, 'default', 0, None, None, None)
        rl.fit()
        v_star, q_star = spibb.policy_evaluation_exact(rl.pi, r_reshaped, p, gamma)
        v_rand, q_rand = spibb.policy_evaluation_exact(rand_pi, r_reshaped, p, gamma)
        perf_star = v_star[0]
        perf_rand = v_rand[0]
        # Keep the goal state that the optimal policy reaches slowest, as long as it
        # stays reachable within roughly 50 steps (perf_star > gamma**50).
        if perf_star < min_value and perf_star > gamma ** 50:
            min_value = perf_star
            argmin = final_state
            rand_value = perf_rand
            best_q_star = q_star.copy()
    # With a single unit reward at the goal, a performance of gamma**T corresponds to an
    # average time to goal of T, hence T = log(perf) / log(gamma).
    avg_time_to_goal = np.log(min_value) / np.log(gamma)
    avg_time_to_goal_rand = np.log(rand_value) / np.log(gamma)
    print("Optimal performance : " + str(min_value))
    print("Optimal average time to goal: " + str(avg_time_to_goal))
    print("Random policy performance : " + str(rand_value))
    print("Random policy average time to goal: " + str(avg_time_to_goal_rand))
    return argmin, min_value, best_q_star, rand_value
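# Quick numeric check (illustration only, not from the original file) of the
# time-to-goal conversion used above: a policy that collects the single unit reward
# exactly T steps after the start has value gamma**T, so T = log(v) / log(gamma).
import numpy as np

gamma = 0.95
T = 20
v = gamma ** T
print(np.log(v) / np.log(gamma))  # ~20.0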
pi_b = spibb_utils.compute_baseline(Q_baseline)
pi_behavioural = np.ones(pi_b.shape) / nb_actions

# The batch sizes:
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
N_wedges = [5, 7, 10, 15, 20, 30, 50, 70, 100]

v = np.zeros(nb_states)

# Pre-compute the true reward function as a function of SxA:
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

# Compute the baseline policy performance:
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))

# Create a mask that is always True for classical RL and other non-policy-based SPIBB algorithms:
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0, current_proba, r_reshaped, 'default')
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))
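# Hedged follow-up sketch (not in the original script): the same exact policy evaluation
# can be applied to the uniform behavioural policy defined above, giving the third
# reference point between which pi_b and pi_star sit.
pi_rand_perf = spibb.policy_evaluation_exact(pi_behavioural, r_reshaped, current_proba, gamma)[0][0]
print("pi_rand_perf: " + str(pi_rand_perf))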
garnet = garnets.Garnets(nb_states, nb_actions, nb_next_state_transition,
                         env_type=env_type, self_transitions=self_transitions)

softmax_target_perf_ratio = (ratio + 1) / 2
baseline_target_perf_ratio = ratio
pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = \
    garnet.generate_baseline_policy(gamma,
                                    softmax_target_perf_ratio=softmax_target_perf_ratio,
                                    baseline_target_perf_ratio=baseline_target_perf_ratio,
                                    log=False)

reward_current = garnet.compute_reward()
current_proba = garnet.transition_function
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

results_traj = []
for nb_trajectories in nb_trajectories_list:
    # Generate trajectories, both stored as trajectories and as (s, a, s', r) transition samples
    trajectories, batch_traj = spibb_utils.generate_batch(nb_trajectories, garnet, pi_b)

    # Computation of the transition errors
    # These are the e_q
    # errors = spibb.compute_errors(nb_states, nb_actions, delta, batch_traj)
    errors = compute_errors_p(nb_states, nb_actions, delta, batch_traj, unvisited=2)
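    # Hedged sketch (not in the original loop): assuming `ModelTransitions` from the repo is
    # importable here and that `batch_traj` uses the same [action, state, next_state, reward]
    # layout consumed by estimate_q above, the MLE model built from the batch can be used to
    # evaluate the baseline and compare the result against its true performance pi_b_perf.
    model_mle = ModelTransitions(batch_traj, nb_states, nb_actions)
    reward_mle = spibb_utils.get_reward_model(model_mle.transitions, reward_current)
    pi_b_perf_mle = spibb.policy_evaluation_exact(pi_b, reward_mle, model_mle.transitions, gamma)[0][0]
    print("pi_b performance on the MLE model: " + str(pi_b_perf_mle))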
max_turbulence = 3.5
max_velocity = 3
nb_states = length * width
nb_actions = 5
gamma = 0.95

wet_chicken = WetChicken(length=length, width=width,
                         max_turbulence=max_turbulence, max_velocity=max_velocity)
P = wet_chicken.get_transition_function()
R = wet_chicken.get_reward_function()
r_reshaped = spibb_utils.get_reward_model(P, R)

# Evaluate a large sample of uniformly drawn deterministic policies.
nb_samples = 1000000
list_unbiased_wet_chicken = []
for i in range(nb_samples):
    if i % 100000 == 0:
        print(f'{i} out of {nb_samples} done.')
    pi_sample = sample_deterministic_policy(nb_states, nb_actions, bias=False)
    list_unbiased_wet_chicken.append(
        spibb.policy_evaluation_exact(pi_sample, r_reshaped, P, gamma)[0][0])

df_unbiased_wet_chicken = pd.DataFrame(data=list_unbiased_wet_chicken, columns=['Performance'])

# Plot the empirical CDF of the sampled policies' performances.
sns.set(font_scale=2)
g = sns.FacetGrid(data=df_unbiased_wet_chicken)
g.map(sns.ecdfplot, 'Performance')
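# Hedged plotting follow-up (not in the original snippet): a FacetGrid does not display by
# itself in a plain script, so the figure is labelled, saved and shown explicitly here;
# `matplotlib.pyplot` is assumed to be available alongside seaborn, and the filename is a placeholder.
import matplotlib.pyplot as plt

g.set(xlabel='Policy performance', ylabel='Empirical CDF')
plt.tight_layout()
plt.savefig('wet_chicken_unbiased_policy_performance_ecdf.pdf')
plt.show()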