Exemple #1
0
    def _find_farther_state(self, gamma):
        """Pick the candidate final state whose fitted policy performs worst.

        Every state index from 1 to ``self.nb_states - 1`` is installed in
        turn through ``self._set_temporary_final_state``.  For each candidate
        a policy is fitted with ``spibb.spibb`` and evaluated exactly, and the
        uniform random policy is evaluated on the same model.  Performance is
        the first entry of the returned value array.  A candidate replaces
        the current best when its fitted performance is strictly below the
        best seen so far and strictly above ``gamma**50``.

        The retained performances and the corresponding
        ``log(perf) / log(gamma)`` figures are printed before returning.

        Args:
            gamma: Discount factor passed to the solver and the evaluations;
                also the log base for the printed "time to goal" figures.

        Returns:
            Tuple ``(argmin, min_value, best_q_star, rand_value)``: retained
            state index, its fitted-policy performance, a copy of the
            matching Q-values, and the uniform-policy performance for that
            candidate.  If no candidate qualifies, the initial values
            ``(-1, 1, 0, 0)`` are returned as they are.
        """
        farthest_state = -1
        lowest_perf = 1
        uniform_perf_at_best = 0
        q_at_best = 0

        # The inverted mask is used for both mask arguments of the solver;
        # presumably it is all-True (plain RL) -- confirm in compute_mask.
        inverted_mask, _ = spibb.compute_mask(
            self.nb_states, self.nb_actions, 1, 1, [])
        inverted_mask = ~inverted_mask
        uniform_pi = (np.ones((self.nb_states, self.nb_actions)) /
                      self.nb_actions)

        for candidate in range(1, self.nb_states):
            transitions, rewards = self._set_temporary_final_state(candidate)
            reward_model = spibb_utils.get_reward_model(transitions, rewards)

            solver = spibb.spibb(gamma, self.nb_states, self.nb_actions,
                                 inverted_mask, inverted_mask, transitions,
                                 reward_model, 'default', 0, None, None, None)
            solver.fit()
            v_fit, q_fit = spibb.policy_evaluation_exact(
                solver.pi, reward_model, transitions, gamma)
            v_uniform, _ = spibb.policy_evaluation_exact(
                uniform_pi, reward_model, transitions, gamma)

            fitted_perf = v_fit[0]
            uniform_perf = v_uniform[0]

            # Skip candidates that are not a new minimum, or whose
            # performance does not exceed the gamma**50 floor.
            if not (fitted_perf < lowest_perf and fitted_perf > gamma**50):
                continue

            lowest_perf = fitted_perf
            farthest_state = candidate
            uniform_perf_at_best = uniform_perf
            q_at_best = q_fit.copy()

        steps_fitted = np.log(lowest_perf) / np.log(gamma)
        steps_uniform = np.log(uniform_perf_at_best) / np.log(gamma)
        print(f"Optimal performance : {lowest_perf!s}")
        print(f"Optimal average time to goal: {steps_fitted!s}")
        print(f"Random policy performance : {uniform_perf_at_best!s}")
        print(f"Random policy average time to goal: {steps_uniform!s}")

        return farthest_state, lowest_perf, q_at_best, uniform_perf_at_best
Exemple #2
0
    def _generate_softmax_policy(self, q_star, p, r_reshaped,
                                 softmax_target_perf, reduction_factor, gamma):
        """Lower the softmax temperature until the policy meets a target.

        Starting from ``2 * 10**6``, the temperature is multiplied by
        ``reduction_factor``, a softmax policy over ``q_star`` is built and
        it is evaluated exactly.  This repeats while the first entry of the
        value array is strictly above ``softmax_target_perf``.

        At least one temperature is always tried, so ``pi``, ``v`` and ``q``
        are always bound on return.  (Previously a target of 1 or more
        skipped the loop entirely and the ``return`` raised
        ``UnboundLocalError``.)

        Args:
            q_star: Q-values handed to ``spibb.softmax``.
            p: Passed through to ``spibb.policy_evaluation_exact``
                (presumably the transition probabilities).
            r_reshaped: Reward model passed to the exact evaluation.
            softmax_target_perf: Threshold; iteration stops as soon as
                ``v[0]`` is no longer strictly above it.
            reduction_factor: Multiplier applied to the temperature before
                every evaluation.
            gamma: Discount factor passed to the evaluation; also the log
                base for the printed "time to goal".

        Returns:
            ``(pi, v, q)`` for the last temperature tried.

        Note:
            There is no iteration cap: the loop ends only when the evaluated
            performance stops exceeding the target.
        """
        # The first temperature actually tried is temp * reduction_factor.
        temp = 2 * 10**6
        while True:
            temp *= reduction_factor
            pi = spibb.softmax(q_star, temp)
            v, q = spibb.policy_evaluation_exact(pi, r_reshaped, p, gamma)
            # Same continuation test as before, checked after the body so
            # that the body always runs at least once.
            if not v[0] > softmax_target_perf:
                break

        avg_time_to_goal = np.log(v[0]) / np.log(gamma)
        print("Softmax performance : " + str(v[0]))
        print("Softmax temperature : " + str(temp))
        print("Softmax average time to goal: " + str(avg_time_to_goal))
        return pi, v, q
Exemple #3
0
    def _perturb_policy(self, pi, q_star, p, r_reshaped, baseline_target_perf,
                        reduction_factor, gamma):
        """Degrade a policy at random states until it meets a target.

        Each round draws a state index uniformly from
        ``range(self.nb_states)``, multiplies the probability of that state's
        ``q_star``-greedy action by ``reduction_factor``, renormalises the
        row to sum to one, and re-evaluates the policy exactly.  Rounds
        repeat while the first entry of the value array is strictly above
        ``baseline_target_perf``.

        At least one round is always performed, so ``v`` and ``q`` always
        come from a real evaluation.  (Previously a target of 1 or more
        skipped the loop, printed the placeholder performance 1.0 and then
        raised ``UnboundLocalError`` on ``q``.)

        Args:
            pi: Policy array indexed as ``pi[state, action]``.  It is
                modified in place and is also the first returned element.
            q_star: Q-values indexed as ``q_star[state, action]``; used only
                to locate the greedy action of the drawn state.
            p: Passed through to ``spibb.policy_evaluation_exact``
                (presumably the transition probabilities).
            r_reshaped: Reward model passed to the exact evaluation.
            baseline_target_perf: Threshold; perturbation stops as soon as
                ``v[0]`` is no longer strictly above it.
            reduction_factor: Multiplier applied to the greedy action's
                probability in the drawn state.
            gamma: Discount factor passed to the evaluation; also the log
                base for the printed "time to goal".

        Returns:
            ``(pi, v, q)`` after the last perturbation.

        Note:
            There is no iteration cap: the loop ends only when the evaluated
            performance stops exceeding the target.
        """
        while True:
            x = np.random.randint(self.nb_states)
            pi[x, np.argmax(q_star[x, :])] *= reduction_factor
            pi[x, :] /= np.sum(pi[x, :])
            v, q = spibb.policy_evaluation_exact(pi, r_reshaped, p, gamma)
            # Same continuation test as before, checked after the body so
            # that the body always runs at least once.
            if not v[0] > baseline_target_perf:
                break

        avg_time_to_goal = np.log(v[0]) / np.log(gamma)
        print("Perturbed policy performance : " + str(v[0]))
        print("Perturbed policy average time to goal: " +
              str(avg_time_to_goal))
        return pi, v, q
# Experiment set-up: batch sizes, true reward model, reference performances
# (baseline and fitted "pi_star" policy) and the results path.
# NOTE(review): nb_states, nb_actions, gamma, maze, pi_b, expname and index
# are defined outside this excerpt.

# The batch sizes (numbers of trajectories per generated dataset):
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
# NOTE(review): N_wedges is not used in the visible part of the script.
N_wedges = [5,7,10,15,20,30,50,70,100]
# Zero vector with one entry per state.
v = np.zeros(nb_states)

# Pre-compute the true reward function as a function of S x A:
current_proba = maze.transition_function
# A Garnets instance is created and its transition function overwritten with
# the maze's, so that compute_reward() runs on the maze dynamics.
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)


# Compute the baseline policy performance:
# ([0][0] selects the first entry of the first array returned by the exact
# evaluation.)
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))


# Creates a mask that is always True for classical RL and other non policy-based SPIBB algorithms
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0

# Fit a policy on the true model with the 'default' solver and report its
# performance, evaluated the same way as the baseline above.
pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0, current_proba, r_reshaped, 'default')
pi_star.fit()
pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))

# Place to save the results (path prefix built from the experiment name and
# the run index):
filename = 'results/' + expname + '/results_' + str(index)
Exemple #5
0
		for nb_trajectories in nb_trajectories_list:
			# Generate trajectories, both stored as trajectories and (s,a,s',r) transition samples
			trajectories, batch_traj = spibb_utils.generate_batch(nb_trajectories, garnet, pi_b)
			spibb_utils.prt("GENERATED A DATASET OF " + str(nb_trajectories) + " TRAJECTORIES")

			# Compute the maximal likelihood model for transitions and rewards.
			# NB: the true reward function can be used for ease of implementation since it is not stochastic in our environment.
			# One should compute it from the samples when it is stochastic.
			model = modelTransitions.ModelTransitions(batch_traj, nb_states, nb_actions)
			reward_model = spibb_utils.get_reward_model(model.transitions, reward_current)

			policy_error = np.sum(abs(pi_b - model.policy), 1)
			# print("policy l1 error:", policy_error)
			print("policy divergence. mean: %05.4f; std: %05.4f" % (np.mean(policy_error), np.std(policy_error)))
			perf_pi_hat = spibb.policy_evaluation_exact(model.policy, r_reshaped, current_proba, gamma)[0][0]
			print("perf pi_hat: " + str(perf_pi_hat))

			# Estimates the values of the baseline policy with a monte-carlo estimation from the batch data:
			# q_pib_est = spibb_utils.compute_q_pib_est(gamma, nb_states, nb_actions, trajectories)

			# Computes the RL policy
			rl = spibb.spibb(gamma, nb_states, nb_actions, pi_b, mask_0, model.transitions, reward_model, 'default')
			rl.fit()
			# Evaluates the RL policy performance
			perfrl = spibb.policy_evaluation_exact(rl.pi, r_reshaped, current_proba, gamma)[0][0]
			print("perf RL: " + str(perfrl))


			# Computes the Reward-adjusted MDP RL policy:
			count_state_action = 0.00001 * np.ones((nb_states, nb_actions))
Exemple #6
0
                         width=width,
                         max_turbulence=max_turbulence,
                         max_velocity=max_velocity)

# Transition and reward functions supplied by the wet_chicken object, and the
# reward model derived from them.
P = wet_chicken.get_transition_function()
R = wet_chicken.get_reward_function()
r_reshaped = spibb_utils.get_reward_model(P, R)

# Sample deterministic policies (bias=False) and record the exact performance
# of each one; [0][0] selects the first entry of the first array returned by
# the evaluation.  Progress is printed every 100000 samples.
nb_samples = 1000000
list_unbiased_wet_chicken = []
for i in range(nb_samples):
    if i % 100000 == 0:
        print(f'{i} out of {nb_samples} done.')
    pi_sample = sample_deterministic_policy(nb_states, nb_actions, bias=False)
    list_unbiased_wet_chicken.append(
        spibb.policy_evaluation_exact(pi_sample, r_reshaped, P, gamma)[0][0])
df_unbiased_wet_chicken = pd.DataFrame(data=list_unbiased_wet_chicken,
                                       columns=['Performance'])

# Plot the empirical CDF of the sampled performances.
sns.set(font_scale=2)
g = sns.FacetGrid(data=df_unbiased_wet_chicken)
g.map(sns.ecdfplot, 'Performance')
g.fig.suptitle(
    'ECDF of the performance of sampled deterministic policies on wet chicken'
)
plt.subplots_adjust(top=0.92, right=0.9, left=0.05, bottom=0.19)

### For RandomMDPs

gamma = 0.95
nb_states = 50
            easter_egg = np.random.choice(potential_final_states)
            # Or pick the one with the least transitions
            # current_proba_sum = current_proba.reshape(-1, current_proba.shape[-1]).sum(axis=0)
            # mask_easter = np.ma.array(current_proba_sum, mask=False)
            # mask_easter.mask[garnet.final_state] = True
            # easter_egg = np.argmin(mask_easter)
            assert (garnet.final_state != easter_egg)
            reward_current[:, easter_egg] = 1
            current_proba[easter_egg, :, :] = 0
            r_reshaped = spibb_utils.get_reward_model(current_proba,
                                                      reward_current)
            # Compute optimal policy in this new environment
            true_rl = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0,
                                  current_proba, r_reshaped, 'default')
            true_rl.fit()
            pi_star_perf = spibb.policy_evaluation_exact(
                true_rl.pi, r_reshaped, current_proba, gamma)[0][0]
            print("Optimal perf in easter egg environment:\t\t\t" +
                  str(pi_star_perf))
            pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped,
                                                      current_proba,
                                                      gamma)[0][0]
            print("Baseline perf in easter egg environment:\t\t\t" +
                  str(pi_b_perf))
        else:
            easter_egg = None
            r_reshaped = spibb_utils.get_reward_model(current_proba,
                                                      reward_current)

        for nb_trajectories in nb_trajectories_list:
            # Generate trajectories, both stored as trajectories and (s,a,s',r) transition samples
            trajectories, batch_traj = spibb_utils.generate_batch(