def perform(self):
    # Value iteration
    self._details.env.reset()

    grid_file_name = '{}/VI/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    discount_factors = np.array([0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99])
    dims = len(discount_factors)
    self.log("Searching VI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in discount_factors:
        t = int(round(time.time() * 1000))
        self.log("{}/{} Processing VI with discount factor {}".format(runs, dims, discount_factor))

        v = solvers.ValueIterationSolver(self._details.env, discount_factor=discount_factor)

        stats = self.run_solver_and_collect(v, self.convergence_check_fn, self._details.state_to_track)
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv('{}/VI/{}_{}.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))

        optimal_policy_stats = self.run_policy_and_collect(v, stats.optimal_policy)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv('{}/VI/{}_{}_optimal.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                int(round(time.time() * 1000)) - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
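
# The convergence_check_fn passed to run_solver_and_collect above is not reproduced in
# this excerpt. Purely as an illustration of the usual value-iteration stopping rule
# (not the project's actual implementation), convergence can be declared once the
# largest per-state change between successive value functions falls below a tolerance:

def example_value_iteration_converged(previous_values, new_values, theta=1e-4):
    """Hypothetical helper: True once max_s |V_new(s) - V_old(s)| < theta."""
    delta = np.max(np.abs(np.asarray(new_values) - np.asarray(previous_values)))
    return delta < theta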
def perform(self):
    # Value iteration
    self._details.env.reset()

    map_desc = self._details.env.unwrapped.desc
    grid_file_name = '{}/VI/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    discount_factors = np.round(np.linspace(0, 0.9, num=10), 2)
    dims = len(discount_factors)
    self.log("Searching VI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in discount_factors:
        t = time.perf_counter()  # wall-clock timer (time.clock() is gone in Python 3.8+)
        self.log("{}/{} Processing VI with discount factor {}".format(runs, dims, discount_factor))

        v = solvers.ValueIterationSolver(self._details.env, discount_factor=discount_factor)

        stats = self.run_solver_and_collect(v, self.convergence_check_fn)
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv('{}/VI/{}_{}.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))
        stats.pickle_results('{}/VI/pkl/{}_{}_{}.pkl'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                             discount_factor, '{}'), map_desc.shape)
        stats.plot_policies_on_map('{}/images/VI/{}_{}_{}.png'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                                      discount_factor, '{}_{}'),
                                   map_desc,
                                   self._details.env.colors(),
                                   self._details.env.directions(),
                                   'Value Iteration',
                                   'Step',
                                   self._details,
                                   only_last=True)

        optimal_policy_stats = self.run_policy_and_collect(v, stats.optimal_policy)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv('{}/VI/{}_{}_optimal.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                time.perf_counter() - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
def perform(self):
    # Value iteration
    self._details.env.reset()

    map_desc = self._details.env.unwrapped.desc
    grid_file_name = os.path.join(VI_DIR, '{}_grid.csv'.format(self._details.env_name))
    with open(grid_file_name, 'w') as f:
        f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    discount_factors = np.round(np.linspace(DISCOUNT_MIN, max(DISCOUNT_MIN, DISCOUNT_MAX), num=NUM_DISCOUNTS), 2)
    dims = len(discount_factors)
    self.log("Searching VI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in discount_factors:
        t = time.perf_counter()  # wall-clock timer (time.clock() is gone in Python 3.8+)
        self.log("{}/{} Processing VI with discount factor {}".format(runs, dims, discount_factor))

        v = solvers.ValueIterationSolver(self._details.env, discount_factor=discount_factor, theta=self._theta)

        # Run VI for up to 200 iterations (or until the convergence check passes)
        stats = self.run_solver_and_collect(v, self.convergence_check_fn)
        self.log("Took {} steps".format(len(stats.steps)))  # number of iterations actually run
        stats.to_csv(os.path.join(VI_DIR, '{}_{}.csv'.format(self._details.env_name, discount_factor)))
        stats.pickle_results(os.path.join(PKL_DIR, '{}_{}_{}.pkl'.format(self._details.env_name, discount_factor, '{}')),
                             map_desc.shape)
        # Plot the final policy and the final utilities computed by VI
        stats.plot_policies_on_map(os.path.join(IMG_DIR, '{}_{}_{}.png'.format(self._details.env_name, discount_factor, '{}_{}')),
                                   map_desc,
                                   self._details.env.colors(),
                                   self._details.env.directions(),
                                   'Value Iteration',
                                   'Step',
                                   self._details,
                                   only_last=True)

        # Run the learned policy for num_trials episodes; the returned object holds the
        # average step reward of each trial and the average over all trials
        optimal_policy_stats = self.run_policy_and_collect(v, stats.optimal_policy, self._num_trials)
        self.log('{}'.format(optimal_policy_stats))
        # Average step reward as it accumulates over an increasing number of trials
        optimal_policy_stats.to_csv(os.path.join(VI_DIR, '{}_{}_optimal.csv'.format(self._details.env_name, discount_factor)))

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                time.perf_counter() - t,
                len(optimal_policy_stats.rewards),   # number of trials
                optimal_policy_stats.reward_mean,    # average step reward over all trials
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
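
# run_policy_and_collect (called above) lives in the experiment base class and is not
# shown in this excerpt. As a rough sketch of the kind of on-policy evaluation described
# in the comments above, assuming the classic gym step API and a policy expressed as a
# state -> action mapping (both assumptions, not taken from this code):

def example_average_step_reward(env, policy, max_steps=1000):
    """Hypothetical helper: run one episode under `policy`, return its average per-step reward."""
    state = env.reset()
    total_reward = 0.0
    steps = 0
    for _ in range(max_steps):
        state, reward, done, _ = env.step(policy[state])
        total_reward += reward
        steps += 1
        if done:
            break
    return total_reward / max(steps, 1)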
def perform(self):
    """
    Value iteration.

    :Outputs:
        - VI_DIR/{env_name}_grid.csv - one summary row per discount factor:
            - steps is the number of full walks through the environment used to evaluate rewards
            - time covers the entire simulation for that discount factor, including training
              and the on-policy evaluation of rewards
            - the reward columns are per-step rewards (set by self.run_policy_and_collect)
    """
    self._details.env.reset()

    map_desc = self._details.env.unwrapped.desc
    grid_file_name = os.path.join(VI_DIR, '{}_grid.csv'.format(self._details.env_name))
    with open(grid_file_name, 'w') as f:
        f.write("params,discount_factor,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

    dims = len(self._discount_factors)
    self.log("Searching VI in {} dimensions".format(dims))

    runs = 1
    for discount_factor in self._discount_factors:
        t = time.perf_counter()  # wall-clock timer (time.clock() is gone in Python 3.8+)
        self.log("{}/{} Processing VI with discount factor {}".format(runs, dims, discount_factor))

        v = solvers.ValueIterationSolver(self._details.env, discount_factor=discount_factor, theta=self._theta)

        stats = self.run_solver_and_collect(v, self.convergence_check_fn)
        self.log("Took {} steps".format(len(stats.steps)))
        stats.to_csv(os.path.join(VI_DIR, '{}_{}.csv'.format(self._details.env_name, discount_factor)))
        stats.pickle_results(os.path.join(PKL_DIR, '{}_{}_{}.pkl'.format(self._details.env_name, discount_factor, '{}')),
                             map_desc.shape)
        stats.plot_policies_on_map(os.path.join(IMG_DIR, '{}_{}_{}.png'.format(self._details.env_name, discount_factor, '{}_{}')),
                                   map_desc,
                                   self._details.env.colors(),
                                   self._details.env.directions(),
                                   'Value Iteration',
                                   'Step',
                                   self._details,
                                   only_last=True)

        optimal_policy_stats = self.run_policy_and_collect(v, stats.best_policy, self._num_trials)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv(os.path.join(VI_DIR, '{}_{}_optimal.csv'.format(self._details.env_name, discount_factor)))

        with open(grid_file_name, 'a') as f:
            # Nine values to match the nine columns in the header written above
            f.write('"{}",{},{},{},{},{},{},{},{}\n'.format(
                json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                discount_factor,
                time.perf_counter() - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        runs += 1
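
# Once the sweep finishes, the grid CSV written above holds one summary row per discount
# factor. A minimal sketch of how it could be compared afterwards, assuming pandas and
# matplotlib are installed (neither is required by the experiment code itself):

def example_plot_grid(grid_file_name):
    """Hypothetical helper: plot mean per-step reward against the discount factor."""
    import pandas as pd
    import matplotlib.pyplot as plt

    df = pd.read_csv(grid_file_name).sort_values('discount_factor')
    plt.errorbar(df['discount_factor'], df['reward_mean'], yerr=df['reward_std'], marker='o')
    plt.xlabel('discount factor')
    plt.ylabel('mean per-step reward')
    plt.title('Value iteration: reward vs. discount factor')
    plt.savefig(grid_file_name.replace('.csv', '.png'))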