pi_star_perf = spibb.policy_evaluation_exact(pi_star.pi, r_reshaped, current_proba, gamma)[0][0]
print("pi_star_perf: " + str(pi_star_perf))

# Place to save the results
filename = 'results/' + expname + '/results_' + str(index)

results = []
if not os.path.isdir('results'):
    os.mkdir('results')
if not os.path.isdir('results/' + expname):
    os.mkdir('results/' + expname)

while True:
    for nb_trajectories in nb_trajectories_list:
        # Generate trajectories, both stored as trajectories and (s,a,s',r) transition samples
        trajectories, batch_traj = spibb_utils.generate_batch(nb_trajectories, garnet, pi_behavioural)
        spibb_utils.prt("GENERATED A DATASET OF " + str(nb_trajectories) + " TRAJECTORIES")

        # Compute the maximum likelihood model for transitions and rewards.
        # NB: the true reward function can be used for ease of implementation since it is not stochastic in our environment.
        # One should compute it from the samples when it is stochastic.
        model = modelTransitions.ModelTransitions(batch_traj, nb_states, nb_actions)
        reward_model = spibb_utils.get_reward_model(model.transitions, reward_current)

        # Computes the RL policy
        rl = spibb.spibb(gamma, nb_states, nb_actions, pi_b, mask_0, model.transitions, reward_model, 'default')
        rl.fit()

        # Evaluates the RL policy performance
        perfrl = spibb.policy_evaluation_exact(rl.pi, r_reshaped, current_proba, gamma)[0][0]
        print("perf RL: " + str(perfrl))
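# --- Illustrative sketch, not part of the original script ---
# The NB comment above notes that the reward model should be estimated from the samples
# when the reward is stochastic. A minimal sketch of that estimate, assuming each sample
# in batch_traj is an (s, a, s', r) tuple as described above; the helper name below is
# hypothetical and not an existing spibb_utils function.
import numpy as np

def estimate_reward_model_from_samples(batch_traj, nb_states, nb_actions):
    # Empirical mean reward per (state, action) pair; zero where the pair was never observed.
    reward_sum = np.zeros((nb_states, nb_actions))
    counts = np.zeros((nb_states, nb_actions))
    for s, a, next_s, r in batch_traj:
        reward_sum[s, a] += r
        counts[s, a] += 1
    return np.divide(reward_sum, counts, out=np.zeros_like(reward_sum), where=counts > 0)

# Hypothetical usage: reward_model = estimate_reward_model_from_samples(batch_traj, nb_states, nb_actions)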
    true_rl.pi, r_reshaped, current_proba, gamma)[0][0]
    print("Optimal perf in easter egg environment:\t\t\t" + str(pi_star_perf))
    pi_b_perf = spibb.policy_evaluation_exact(pi_b, r_reshaped, current_proba, gamma)[0][0]
    print("Baseline perf in easter egg environment:\t\t\t" + str(pi_b_perf))
else:
    easter_egg = None
    r_reshaped = spibb_utils.get_reward_model(current_proba, reward_current)

for nb_trajectories in nb_trajectories_list:
    # Generate trajectories, both stored as trajectories and (s,a,s',r) transition samples
    trajectories, batch_traj = spibb_utils.generate_batch(nb_trajectories, garnet, pi_b, easter_egg)
    print("GENERATED A DATASET OF " + str(nb_trajectories) + " TRAJECTORIES")

    # Compute the maximum likelihood model for transitions and rewards.
    # NB: the true reward function can be used for ease of implementation since it is not stochastic in our environment.
    # One should compute it from the samples when it is stochastic.
    model = modelTransitions.ModelTransitions(batch_traj, nb_states, nb_actions)
    reward_model = spibb_utils.get_reward_model(model.transitions, reward_current)

    # Estimates the values of the baseline policy with a Monte Carlo estimation from the batch data:
    # q_pib_est = spibb_utils.compute_q_pib_est(gamma, nb_states, nb_actions, trajectories)

    # Computes the RL policy