def fun(algorithm):
    perf = evaluate_policy(algorithm._mdp,
                           algorithm._policy,
                           criterion=criterion,
                           n_episodes=n_episodes,
                           initial_states=initial_states,
                           n_threads=n_threads)
    fields = {}
    fields[field_name + "_mean"] = perf[0]
    fields[field_name + "_std"] = perf[1]
    algorithm._result.update_step(**fields)
def fun(algorithm):
    policy = EpsilonGreedy(algorithm._actions, algorithm._policy.Q, 0)
    perf = evaluate_policy(algorithm._mdp,
                           policy,
                           criterion=criterion,
                           n_episodes=n_episodes,
                           initial_states=initial_states,
                           n_threads=n_threads)
    fields = {}
    fields[field_name + "_mean"] = perf[0]
    fields[field_name + "_std"] = perf[1]
    fields[field_name + "_steps"] = perf[2]
    algorithm._result.update_step(**fields)
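# Editorial sketch: `field_name`, `criterion`, `n_episodes`, `initial_states`
# and `n_threads` are free variables in both `fun` definitions above, so each
# is presumably returned by an enclosing factory that closes over them. A
# minimal sketch of that pattern (the name `eval_policy_callback_factory` is
# an assumption, not taken from the source):
def eval_policy_callback_factory(field_name, criterion='discounted',
                                 n_episodes=1, initial_states=None,
                                 n_threads=1):
    def fun(algorithm):
        perf = evaluate_policy(algorithm._mdp,
                               algorithm._policy,
                               criterion=criterion,
                               n_episodes=n_episodes,
                               initial_states=initial_states,
                               n_threads=n_threads)
        algorithm._result.update_step(**{field_name + "_mean": perf[0],
                                         field_name + "_std": perf[1]})
    return fun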
################ TEST ################

pi_fqi = pickle.load(open(filename, 'rb'))
# filename = ('TEST_' + str(test_days) + ' days - ' + str(year_test) + ' - '
#             + str(minsplit_opt) + ' ms_' + str(max_iterations) + ' it')
print(filename)

target_mdp_test = target_mdp_test_1  # FIXME: change mdp

mean_scores_test = []
# actions_test = []

ris_test, act_test = evaluate_policy(test_days, target_mdp_test, pi_fqi,
                                     criterion='discounted', n_episodes=1,
                                     initial_states=None, n_threads=1)
mean_scores_test.append(ris_test)
mean_scores_test = np.transpose(np.reshape(mean_scores_test, (1, test_days)))

# actions_test.append(act_test)
# act_test_day = np.reshape(actions_test, (test_days, 1167))

# Pack the ragged per-day action sequences into a NaN-padded
# (test_days, 1167) matrix.
m = np.full((test_days, 1167), np.nan)
for i in range(test_days):
    for j in range(len(act_test[0, i])):
        m[i, j] = int(act_test[0, i][j])
act_test_day = m
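# Illustration (hypothetical values, not from the source): with test_days = 2
# and action traces of different lengths, the padding above yields
#   act_test[0, 0] = [1, 0, 2]  ->  act_test_day[0, :3] = [1., 0., 2.],
#                                   act_test_day[0, 3:] all NaN
#   act_test[0, 1] = [0, 1]     ->  act_test_day[1, :2] = [0., 1.],
#                                   act_test_day[1, 2:] all NaN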
                  regressor_type=ExtraTreesRegressor,
                  **regressor_params)

initial_states = [np.array([0., 0.]) for _ in range(5)]

callback_list = []
callback_list.append(
    get_callback_list_entry("eval_greedy_policy_callback",
                            field_name="perf_disc_greedy",
                            criterion='discounted',
                            initial_states=initial_states))

experiment = RepeatExperiment("FQI Experiment", fqi, n_steps=5, n_runs=1,
                              callback_list=callback_list)
result = experiment.run(1)

plot_average([result], "n_episodes", "perf_disc_greedy_mean", names=["FQI"])
plot_average([result], "n_episodes", "n_samples", names=["FQI"])

policy = EpsilonGreedy(actions, pi.Q, 0)
print(evaluate_policy(mdp, policy,
                      criterion='discounted',
                      initial_states=initial_states))
save_object(policy, file_name)
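# Editorial sketch (assumption, not the library's verified implementation):
# get_callback_list_entry presumably maps the callback name string to a
# factory like the one sketched earlier and forwards the keyword arguments:
def get_callback_list_entry_sketch(name, **params):
    factories = {"eval_greedy_policy_callback": eval_policy_callback_factory}
    return factories[name](**params)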
def test_eval(self):

    mdp = MockMDP()
    pi = MockPolicy(1)

    # Discounted return, single episode.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "discounted", n_episodes=1, initial_states=None, n_threads=1)
    self.assertEqual(1.0 * 1.0 + 2.0 * 0.99 + 3.0 * 0.99 * 0.99, score_mean)
    self.assertEqual(0, score_std)
    self.assertEqual(3, score_steps)

    # Identical episodes leave the mean unchanged and the std at zero.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "discounted", n_episodes=10, initial_states=None, n_threads=1)
    self.assertEqual(1.0 * 1.0 + 2.0 * 0.99 + 3.0 * 0.99 * 0.99, score_mean)
    self.assertEqual(0, score_std)
    self.assertEqual(3, score_steps)

    # Average-reward criterion.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "average", n_episodes=1, initial_states=None, n_threads=1)
    self.assertEqual(2.0, score_mean)
    self.assertEqual(0, score_std)
    self.assertEqual(3, score_steps)

    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "average", n_episodes=10, initial_states=None, n_threads=1)
    self.assertEqual(2.0, score_mean)
    self.assertEqual(0, score_std)
    self.assertEqual(3, score_steps)

    # Multi-threaded evaluation must agree with the single-threaded result.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "discounted", n_episodes=10, initial_states=None, n_threads=2)
    self.assertEqual(1.0 * 1.0 + 2.0 * 0.99 + 3.0 * 0.99 * 0.99, score_mean)
    self.assertEqual(0, score_std)
    self.assertEqual(3, score_steps)

    # A single explicit initial state shifts every reward by 10.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "discounted", n_episodes=10,
        initial_states=np.array([10.0, 10.0]), n_threads=1)
    self.assertTrue(
        np.linalg.norm(11.0 * 1.0 + 12.0 * 0.99 + 13.0 * 0.99 * 0.99
                       - score_mean) < 1e-7)
    self.assertTrue(np.linalg.norm(0 - score_std) < 1e-7)
    self.assertEqual(3, score_steps)

    # A list of initial states: one episode per state, scores aggregated.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "discounted", n_episodes=10,
        initial_states=[np.array([0.0, 0.0]), np.array([10.0, 10.0])],
        n_threads=1)
    scores = np.array([
        1.0 * 1.0 + 2.0 * 0.99 + 3.0 * 0.99 * 0.99,
        11.0 * 1.0 + 12.0 * 0.99 + 13.0 * 0.99 * 0.99
    ])
    self.assertTrue(np.linalg.norm(np.mean(scores) - score_mean) < 1e-7)
    self.assertTrue(
        np.linalg.norm(np.std(scores) / np.sqrt(2) - score_std) < 1e-7)
    self.assertEqual(3, score_steps)

    # Same check with two threads.
    score_mean, score_std, score_steps = evaluate_policy(
        mdp, pi, "discounted", n_episodes=10,
        initial_states=[np.array([0.0, 0.0]), np.array([10.0, 10.0])],
        n_threads=2)
    self.assertTrue(np.linalg.norm(np.mean(scores) - score_mean) < 1e-7)
    self.assertTrue(
        np.linalg.norm(np.std(scores) / np.sqrt(2) - score_std) < 1e-7)
    self.assertEqual(3, score_steps)
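# Editorial sketch (hypothetical, inferred only from the assertions above):
# a MockMDP consistent with this test runs 3-step episodes with gamma = 0.99,
# starts at [0, 0] by default, and pays rewards s0 + 1, s0 + 2, s0 + 3, where
# s0 is the first component of the initial state. That gives a discounted
# return of 1 + 2*0.99 + 3*0.99^2 from [0, 0], 11 + 12*0.99 + 13*0.99^2 from
# [10, 10], and an average reward of 2.0 from [0, 0]. The class and method
# names below are assumptions, not the repository's actual mocks.
import numpy as np

class MockMDPSketch:
    gamma = 0.99
    horizon = 3

    def __init__(self):
        self._state = np.zeros(2)
        self._t = 0

    def reset(self, state=None):
        self._state = np.zeros(2) if state is None \
            else np.asarray(state, dtype=float)
        self._t = 0
        return self._state

    def step(self, action):
        self._t += 1
        reward = self._state[0] + self._t  # rewards s0 + 1, s0 + 2, s0 + 3
        done = self._t >= self.horizon
        return self._state, reward, done, {}

class MockPolicySketch:
    # Always plays the same action; the sketch MDP's rewards ignore it.
    def __init__(self, action):
        self._action = action

    def sample_action(self, state):
        return self._action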