def _run_one_iteration(self):
    """
    Runs one iteration of the Random MDPs benchmark, i.e. iterates through the different baseline
    and data set parameters and then starts the computation for each algorithm.
    """
    path_config = configparser.ConfigParser()
    path_config.read(os.path.join(directory, 'paths.ini'))
    spibb_path = path_config['PATHS']['spibb_path']
    sys.path.append(spibb_path)
    import garnets
    for baseline_target_perf_ratio in self.baseline_target_perf_ratios:
        print(f'Process with seed {self.seed} starting with baseline_target_perf_ratio {baseline_target_perf_ratio}'
              f' out of {self.baseline_target_perf_ratios}')
        self.garnet = garnets.Garnets(self.nb_states, self.nb_actions, self.nb_next_state_transition,
                                      env_type=self.env_type, self_transitions=self.self_transitions)
        softmax_target_perf_ratio = (baseline_target_perf_ratio + 1) / 2
        self.to_append_run_one_iteration = self.to_append_run + [softmax_target_perf_ratio,
                                                                 baseline_target_perf_ratio]
        self.pi_b, self._q_pi_b, self.pi_star_perf, self.pi_b_perf, self.pi_rand_perf = \
            self.garnet.generate_baseline_policy(self.gamma,
                                                 softmax_target_perf_ratio=softmax_target_perf_ratio,
                                                 baseline_target_perf_ratio=baseline_target_perf_ratio,
                                                 log=self.log)
        self.R_state_state = self.garnet.compute_reward()
        self.P = self.garnet.transition_function
        if self.env_type == 2:  # easter egg environment with a positive reward
            self._set_easter_egg(reward=1)
        elif self.env_type == 3:  # easter egg environment with a negative reward
            self._set_easter_egg(reward=-1)
        else:
            self.easter_egg = None
        self.R_state_action = compute_r_state_action(self.P, self.R_state_state)
        self.to_append_run_one_iteration += [self.pi_b_perf, self.pi_rand_perf, self.pi_star_perf]
        for nb_trajectories in self.nb_trajectories_list:
            print(f'Process with seed {self.seed} starting with nb_trajectories {nb_trajectories} out of '
                  f'{self.nb_trajectories_list}')
            # Generate trajectories, stored both as trajectories and as (s, a, s', r) transition samples
            self.data, batch_traj = self.generate_batch(nb_trajectories, self.garnet, self.pi_b,
                                                        easter_egg=self.easter_egg)
            self.to_append = self.to_append_run_one_iteration + [nb_trajectories]
            self._run_algorithms()
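
# compute_r_state_action is used above but not shown in this excerpt. Below is a minimal sketch of
# such a helper, assuming P has shape (nb_states, nb_actions, nb_states) and R_state_state has shape
# (nb_states, nb_states); it mirrors what spibb_utils.get_reward_model is used for in the scripts
# further down and is an illustration, not the original implementation.
import numpy as np

def compute_r_state_action(P, R_state_state):
    # Expected immediate reward of each state-action pair: R(s, a) = sum_s' P(s' | s, a) * R(s, s')
    return np.einsum('ijk,ik->ij', P, R_state_state)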
Q_baseline = np.load(npy_filename)
# Compute the baseline policy:
pi_b = spibb_utils.compute_baseline(Q_baseline)
pi_behavioural = np.ones(pi_b.shape) / nb_actions
# The batch sizes:
nb_trajectories_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
N_wedges = [5, 7, 10, 15, 20, 30, 50, 70, 100]
v = np.zeros(nb_states)
# Pre-compute the true reward function as a function of (s, a):
current_proba = maze.transition_function
garnet = garnets.Garnets(nb_states, nb_actions, 1, self_transitions=0)
garnet.transition_function = current_proba
reward_current = garnet.compute_reward()
r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)
# Compute the baseline policy performance:
pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
print("baseline_perf: " + str(pi_b_perf))
# Create a mask that is always True, for classical RL and other non-policy-based SPIBB algorithms:
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0
pi_star = spibb.spibb(gamma, nb_states, nb_actions, mask_0, mask_0, current_proba, r_reshaped, 'default')
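
# Usage sketch (not part of the original script): the uniform behavioural policy defined above can
# be scored with the same exact policy-evaluation call used for the baseline. The variable name
# pi_behavioural_perf is illustrative only.
pi_behavioural_perf = spibb.policy_evaluation_exact(pi_behavioural, r_reshaped, current_proba, gamma)[0][0]
print("behavioural_perf: " + str(pi_behavioural_perf))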
mask_0, thres = spibb.compute_mask(nb_states, nb_actions, 1, 1, [])
mask_0 = ~mask_0
rand_pi = np.ones((nb_states, nb_actions)) / nb_actions

filename = 'results/' + expname + '/results_' + str(index)
results = []
if not os.path.isdir('results'):
    os.mkdir('results')
if not os.path.isdir('results/' + expname):
    os.mkdir('results/' + expname)

while True:
    for ratio in ratios:
        garnet = garnets.Garnets(nb_states, nb_actions, nb_next_state_transition, self_transitions=0)
        softmax_target_perf_ratio = (ratio + 1) / 2
        baseline_target_perf_ratio = ratio
        pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = \
            garnet.generate_baseline_policy(gamma,
                                            softmax_target_perf_ratio=softmax_target_perf_ratio,
                                            baseline_target_perf_ratio=baseline_target_perf_ratio)
        reward_current = garnet.compute_reward()
        current_proba = garnet.transition_function
        r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)
        for nb_trajectories in nb_trajectories_list:
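            # The loop body is elided in this excerpt. As a hedged sketch (the exact calls are an
            # assumption, not taken from the original script): following the pattern of
            # _run_one_iteration above, each batch size would generate nb_trajectories trajectories
            # under the baseline pi_b, e.g. via spibb_utils.generate_batch(nb_trajectories, garnet,
            # pi_b), then run the individual algorithms and append their outcomes to results.
            pass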
        for action in range(nb_actions):
            if count_state_action[state, action] == 0:
                # Unvisited state-action pairs fall back to the `unvisited` sentinel value.
                errors[state, action] = unvisited
            else:
                # Hoeffding-style error bound, shrinking with the visit count of (state, action).
                errors[state, action] = np.sqrt(
                    2 * (np.log(2 * (nb_states * nb_actions * 2**nb_actions) / delta))
                    / count_state_action[state, action])
    return errors


results = []
for ratio in ratios:
    garnet = garnets.Garnets(nb_states, nb_actions, nb_next_state_transition,
                             env_type=env_type, self_transitions=self_transitions)
    softmax_target_perf_ratio = (ratio + 1) / 2
    baseline_target_perf_ratio = ratio
    pi_b, q_pi_b, pi_star_perf, pi_b_perf, pi_rand_perf = \
        garnet.generate_baseline_policy(gamma,
                                        softmax_target_perf_ratio=softmax_target_perf_ratio,
                                        baseline_target_perf_ratio=baseline_target_perf_ratio,
                                        log=False)
    reward_current = garnet.compute_reward()
    current_proba = garnet.transition_function
    r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)
    results_traj = []
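
# The count_state_action table consumed by the error computation above is built from a batch of
# transitions. A minimal sketch, assuming each transition is stored as an (action, state,
# next_state, reward) tuple; the helper name and the tuple order are assumptions, not taken from
# the original scripts.
import numpy as np

def compute_count_state_action(batch, nb_states, nb_actions):
    # Count how often each (state, action) pair occurs in the batch.
    count_state_action = np.zeros((nb_states, nb_actions))
    for action, state, next_state, reward in batch:
        count_state_action[state, action] += 1
    return count_state_action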