Example #1
    def generate_baseline_policy(self,
                                 gamma,
                                 softmax_target_perf_ratio=0.75,
                                 baseline_target_perf_ratio=0.5,
                                 softmax_reduction_factor=0.9,
                                 perturbation_reduction_factor=0.9):
        # Ensure the softmax target is at least as high as the baseline target.
        if softmax_target_perf_ratio < baseline_target_perf_ratio:
            softmax_target_perf_ratio = baseline_target_perf_ratio

        # Pick the reachable state farthest from the start and use it as the goal.
        farther_state, pi_star_perf, q_star, pi_rand_perf = self._find_farther_state(
            gamma)
        p, r = self._set_temporary_final_state(farther_state)
        self.transition_function = p.copy()
        r_reshaped = spibb_utils.get_reward_model(p, r)

        # Target performances are expressed as a fraction of the gap between
        # the random policy and the optimal policy.
        softmax_target_perf = softmax_target_perf_ratio * (pi_star_perf - pi_rand_perf) \
                              + pi_rand_perf
        pi, _, _ = self._generate_softmax_policy(q_star, p, r_reshaped,
                                                 softmax_target_perf,
                                                 softmax_reduction_factor,
                                                 gamma)

        # Then perturb the softmax policy down to the (lower) baseline performance target.
        baseline_target_perf = baseline_target_perf_ratio * (pi_star_perf - pi_rand_perf) \
                               + pi_rand_perf
        pi, v, q = self._perturb_policy(pi, q_star, p, r_reshaped,
                                        baseline_target_perf,
                                        perturbation_reduction_factor, gamma)

        return pi, q, pi_star_perf, v[0], pi_rand_perf
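A small sanity check one might run on the returned values (illustrative only: garnet stands for an instance of the class above, called as in Example #5 further down, and the target formula simply restates the one inside the method):

pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = garnet.generate_baseline_policy(gamma)
# With the default ratios the baseline is aimed at roughly the midpoint
# between the random and the optimal performance.
baseline_target = 0.5 * (pi_star_perf - pi_rand_perf) + pi_rand_perf
print(pi_b_perf, baseline_target)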
Example #2
    def estimate_q(self, trajectories):
        # Flatten the trajectories into a single batch of (action, state, next_state, reward) samples.
        batch = []
        for trajectory in trajectories:
            for [action, state, next_state, reward] in trajectory:
                batch.append([action, state, next_state, reward])
        # Build an empirical transition model from the batch and evaluate the target policy on it.
        model = ModelTransitions(batch, self.pi_b.shape[0], self.pi_b.shape[1])
        reward_model = spibb_utils.get_reward_model(model.transitions, self.R)
        return policy_evaluation_modified(self.gamma, self.pi_t, reward_model, model.transitions)
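This method appears to amount to model-based policy evaluation on an empirical transition model estimated from the batch. A self-contained numpy sketch of that idea, independent of ModelTransitions and policy_evaluation_modified (all names below are illustrative):

import numpy as np

def empirical_model(batch, nb_states, nb_actions):
    # Count (state, action, next_state) visits and normalize into transition probabilities.
    counts = np.zeros((nb_states, nb_actions, nb_states))
    for action, state, next_state, _ in batch:
        counts[state, action, next_state] += 1
    totals = counts.sum(axis=2, keepdims=True)
    return np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)

def evaluate_q(pi, p, r_sa, gamma):
    # Exact evaluation on the estimated model: v = (I - gamma * P_pi)^-1 r_pi, then q from v.
    r_pi = np.einsum('sa,sa->s', pi, r_sa)
    p_pi = np.einsum('sa,sap->sp', pi, p)
    v = np.linalg.solve(np.eye(len(r_pi)) - gamma * p_pi, r_pi)
    q = r_sa + gamma * np.einsum('sap,p->sa', p, v)
    return q, v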
Example #3
    def _find_farther_state(self, gamma):
        argmin = -1
        min_value = 1
        rand_value = 0
        best_q_star = 0
        # Mask that allows every state-action pair, i.e. plain RL without bootstrapping.
        mask_0, thres = spibb.compute_mask(self.nb_states, self.nb_actions, 1,
                                           1, [])
        mask_0 = ~mask_0
        rand_pi = np.ones((self.nb_states, self.nb_actions)) / self.nb_actions
        # Try every non-initial state as a candidate goal and evaluate the optimal and random policies for it.
        for final_state in range(1, self.nb_states):
            p, r = self._set_temporary_final_state(final_state)
            r_reshaped = spibb_utils.get_reward_model(p, r)

            rl = spibb.spibb(gamma, self.nb_states, self.nb_actions, mask_0,
                             mask_0, p, r_reshaped, 'default', 0, None, None,
                             None)
            rl.fit()
            v_star, q_star = spibb.policy_evaluation_exact(
                rl.pi, r_reshaped, p, gamma)
            v_rand, q_rand = spibb.policy_evaluation_exact(
                rand_pi, r_reshaped, p, gamma)

            perf_star = v_star[0]
            perf_rand = v_rand[0]

            # Keep the candidate with the lowest optimal performance that is
            # still reachable within roughly 50 steps (gamma**50).
            if perf_star < min_value and perf_star > gamma**50:
                min_value = perf_star
                argmin = final_state
                rand_value = perf_rand
                best_q_star = q_star.copy()

        # With a single discounted reward at the goal, v0 ~= gamma**t, so t ~= log(v0) / log(gamma).
        avg_time_to_goal = np.log(min_value) / np.log(gamma)
        avg_time_to_goal_rand = np.log(rand_value) / np.log(gamma)
        print("Optimal performance : " + str(min_value))
        print("Optimal average time to goal: " + str(avg_time_to_goal))
        print("Random policy performance : " + str(rand_value))
        print("Random policy average time to goal: " +
              str(avg_time_to_goal_rand))

        return argmin, min_value, best_q_star, rand_value
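The time-to-goal conversion above inverts the relation v0 = gamma**t that holds when a single unit reward is collected upon reaching the goal; a tiny illustrative check (numbers arbitrary):

import numpy as np
gamma = 0.95
t = 20
v0 = gamma ** t
print(np.log(v0) / np.log(gamma))  # recovers t = 20 up to floating-point error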
Example #4
pi_b = spibb_utils.compute_baseline(Q_baseline)

pi_behavioural = np.ones(pi_b.shape)/nb_actions


# The batch sizes and the N_wedge hyperparameter values to sweep over:
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
N_wedges = [5, 7, 10, 15, 20, 30, 50, 70, 100]
v = np.zeros(nb_states)

# Pre-compute the true reward function over the state-action space:
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)


# Compute the baseline policy performance:
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))


# Creates a mask that is always True, for classical RL and other non-policy-based SPIBB algorithms
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0, current_proba, r_reshaped, 'default')
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))
Example #5
    garnet = garnets.Garnets(nb_states,
                             nb_actions,
                             nb_next_state_transition,
                             env_type=env_type,
                             self_transitions=self_transitions)

    softmax_target_perf_ratio = (ratio + 1) / 2
    baseline_target_perf_ratio = ratio
    pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = \
        garnet.generate_baseline_policy(gamma,
                                        softmax_target_perf_ratio=softmax_target_perf_ratio,
                                        baseline_target_perf_ratio=baseline_target_perf_ratio, log=False)

    reward_current = garnet.compute_reward()
    current_proba = garnet.transition_function
    r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)
    results_traj = []

    for nb_trajectories in nb_trajectories_list:
        # Generate the data, stored both as full trajectories and as a flat batch of (s,a,s',r) transition samples
        trajectories, batch_traj = spibb_utils.generate_batch(
            nb_trajectories, garnet, pi_b)

        # Computation of the transition errors (these are the e_q)
        # errors = spibb.compute_errors(nb_states, nb_actions, delta, batch_traj)
        errors = compute_errors_p(nb_states,
                                  nb_actions,
                                  delta,
                                  batch_traj,
                                  unvisited=2)
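For intuition about how these error terms behave, here is a count-based sketch: it only tallies state-action visits in the batch and applies a generic 1/sqrt(N) concentration-style bound with a fixed value for unvisited pairs. It is not the compute_errors_p implementation used above, whose exact bound is defined in the codebase; the names, constants and the assumed (action, state, next_state, reward) batch layout (as in Example #2) are placeholders:

import numpy as np

def sketch_errors(nb_states, nb_actions, delta, batch, unvisited=2):
    # Tally the number of times each state-action pair appears in the batch.
    counts = np.zeros((nb_states, nb_actions))
    for action, state, _, _ in batch:
        counts[state, action] += 1
    # Generic concentration-style bound: the error shrinks as 1 / sqrt(N(s, a)),
    # and unvisited pairs get the fixed pessimistic value passed as unvisited.
    errors = np.full((nb_states, nb_actions), float(unvisited))
    visited = counts > 0
    errors[visited] = np.sqrt(2 * np.log(2 / delta) / counts[visited])
    return errors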
Example #6
max_turbulence = 3.5
max_velocity = 3

nb_states = length * width
nb_actions = 5

gamma = 0.95

wet_chicken = WetChicken(length=length,
                         width=width,
                         max_turbulence=max_turbulence,
                         max_velocity=max_velocity)

P = wet_chicken.get_transition_function()
R = wet_chicken.get_reward_function()
r_reshaped = spibb_utils.get_reward_model(P, R)

# Monte-Carlo estimate of the performance distribution of randomly sampled (unbiased) deterministic policies:
nb_samples = 1000000
list_unbiased_wet_chicken = []
for i in range(nb_samples):
    if i % 100000 == 0:
        print(f'{i} out of {nb_samples} done.')
    pi_sample = sample_deterministic_policy(nb_states, nb_actions, bias=False)
    list_unbiased_wet_chicken.append(
        spibb.policy_evaluation_exact(pi_sample, r_reshaped, P, gamma)[0][0])
df_unbiased_wet_chicken = pd.DataFrame(data=list_unbiased_wet_chicken,
                                       columns=['Performance'])

sns.set(font_scale=2)
g = sns.FacetGrid(data=df_unbiased_wet_chicken)
g.map(sns.ecdfplot, 'Performance')
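As a numeric companion to the ECDF, the same list can be summarized with a few percentiles (plain numpy, nothing specific to the plotting code):

perf = np.array(list_unbiased_wet_chicken)
print("mean performance: " + str(perf.mean()))
print("percentiles (5/50/95): " + str(np.percentile(perf, [5, 50, 95])))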