    def perform(self):
        # Value iteration
        self._details.env.reset()

        grid_file_name = '{}/VI/{}_grid.csv'.format(OUTPUT_DIRECTORY,
                                                    self._details.env_name)
        with open(grid_file_name, 'w') as f:
            f.write(
                "params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
            )

        discount_factors = np.array([0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99])
        dims = len(discount_factors)
        self.log("Searching VI in {} dimensions".format(dims))

        runs = 1
        for discount_factor in discount_factors:
            t = int(round(time.time() * 1000))
            self.log("{}/{} Processing VI with discount factor {}".format(
                runs, dims, discount_factor))

            v = solvers.ValueIterationSolver(self._details.env,
                                             discount_factor=discount_factor)

            stats = self.run_solver_and_collect(v, self.convergence_check_fn,
                                                self._details.state_to_track)

            self.log("Took {} steps".format(len(stats.steps)))
            stats.to_csv('{}/VI/{}_{}.csv'.format(OUTPUT_DIRECTORY,
                                                  self._details.env_name,
                                                  discount_factor))

            optimal_policy_stats = self.run_policy_and_collect(
                v, stats.optimal_policy)
            self.log('{}'.format(optimal_policy_stats))
            optimal_policy_stats.to_csv('{}/VI/{}_{}_optimal.csv'.format(
                OUTPUT_DIRECTORY, self._details.env_name, discount_factor))
            with open(grid_file_name, 'a') as f:
                f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                    json.dumps({
                        'discount_factor': discount_factor
                    }).replace('"', '""'),
                    int(round(time.time() * 1000)) - t,
                    len(optimal_policy_stats.rewards),
                    optimal_policy_stats.reward_mean,
                    optimal_policy_stats.reward_median,
                    optimal_policy_stats.reward_min,
                    optimal_policy_stats.reward_max,
                    optimal_policy_stats.reward_std,
                ))
            runs += 1
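The params column is written as JSON with its double quotes doubled, which is standard CSV escaping for a quoted field. A minimal sketch of reading such a grid file back with the standard library (the path and environment name here are illustrative, not from the code above):

import csv
import json

# Illustrative path: the real file is written to OUTPUT_DIRECTORY/VI/<env_name>_grid.csv.
with open('output/VI/frozen_lake_grid.csv', newline='') as f:
    for row in csv.DictReader(f):
        # The csv module undoes the doubled quotes, leaving plain JSON in the params field.
        params = json.loads(row['params'])
        print(params['discount_factor'], float(row['reward_mean']))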
Example 2
    def perform(self):
        # Value iteration
        self._details.env.reset()
        map_desc = self._details.env.unwrapped.desc

        grid_file_name = '{}/VI/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
        with open(grid_file_name, 'w') as f:
            f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

        discount_factors = np.round(np.linspace(0, 0.9, num=10), 2)
        dims = len(discount_factors)
        self.log("Searching VI in {} dimensions".format(dims))

        runs = 1
        for discount_factor in discount_factors:
            t = time.perf_counter()  # time.clock() was removed in Python 3.8
            self.log("{}/{} Processing VI with discount factor {}".format(runs, dims, discount_factor))

            v = solvers.ValueIterationSolver(self._details.env, discount_factor=discount_factor)

            stats = self.run_solver_and_collect(v, self.convergence_check_fn)

            self.log("Took {} steps".format(len(stats.steps)))
            stats.to_csv('{}/VI/{}_{}.csv'.format(OUTPUT_DIRECTORY, self._details.env_name, discount_factor))
            stats.pickle_results('{}/VI/pkl/{}_{}_{}.pkl'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                                 discount_factor, '{}'),
                                 map_desc.shape)
            stats.plot_policies_on_map('{}/images/VI/{}_{}_{}.png'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                                          discount_factor, '{}_{}'),
                                       map_desc, self._details.env.colors(), self._details.env.directions(),
                                       'Value Iteration', 'Step', self._details, only_last=True)

            optimal_policy_stats = self.run_policy_and_collect(v, stats.optimal_policy)
            self.log('{}'.format(optimal_policy_stats))
            optimal_policy_stats.to_csv('{}/VI/{}_{}_optimal.csv'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                                         discount_factor))
            with open(grid_file_name, 'a') as f:
                f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                    json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                    time.perf_counter() - t,
                    len(optimal_policy_stats.rewards),
                    optimal_policy_stats.reward_mean,
                    optimal_policy_stats.reward_median,
                    optimal_policy_stats.reward_min,
                    optimal_policy_stats.reward_max,
                    optimal_policy_stats.reward_std,
                ))
            runs += 1
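solvers.ValueIterationSolver itself is not shown in these snippets. For reference, a generic tabular value-iteration sketch over a Gym-style discrete transition model P; this is the textbook update the solver is presumed to perform, not the project's actual implementation:

import numpy as np

def value_iteration(P, n_states, n_actions, discount_factor=0.9, theta=1e-8, max_iters=10000):
    """Tabular value iteration. P[s][a] is a list of (prob, next_state, reward, done)
    tuples, the transition format used by Gym's discrete environments."""
    V = np.zeros(n_states)
    for _ in range(max_iters):
        delta = 0.0
        for s in range(n_states):
            # One-step lookahead: expected return of each action under the current V.
            q = [sum(p * (r + discount_factor * V[s2] * (not done))
                     for p, s2, r, done in P[s][a])
                 for a in range(n_actions)]
            best = max(q)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:  # largest Bellman update fell below theta -> converged
            break
    # Greedy policy with respect to the converged value function.
    policy = [int(np.argmax([sum(p * (r + discount_factor * V[s2] * (not done))
                                 for p, s2, r, done in P[s][a])
                             for a in range(n_actions)]))
              for s in range(n_states)]
    return V, policy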
Example 3
    def perform(self):
        # Value iteration
        self._details.env.reset()
        map_desc = self._details.env.unwrapped.desc

        grid_file_name = os.path.join(VI_DIR, '{}_grid.csv'.format(self._details.env_name))
        with open(grid_file_name, 'w') as f:
            f.write("params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n")

        discount_factors = np.round(np.linspace(DISCOUNT_MIN, max(DISCOUNT_MIN, DISCOUNT_MAX), num=NUM_DISCOUNTS), 2)
        dims = len(discount_factors)
        self.log("Searching VI in {} dimensions".format(dims))

        runs = 1
        for discount_factor in discount_factors:
            t = time.perf_counter()
            self.log("{}/{} Processing VI with discount factor {}".format(runs, dims, discount_factor))

            v = solvers.ValueIterationSolver(self._details.env, discount_factor=discount_factor, theta=self._theta)

            stats = self.run_solver_and_collect(v, self.convergence_check_fn) # up to 200 iterations, run VI

            self.log("Took {} steps".format(len(stats.steps))) # number of iterations ran
            stats.to_csv(os.path.join(VI_DIR, '{}_{}.csv'.format(self._details.env_name, discount_factor)))
            stats.pickle_results(os.path.join(PKL_DIR, '{}_{}_{}.pkl'.format(self._details.env_name, discount_factor, '{}')), map_desc.shape)
            stats.plot_policies_on_map(os.path.join(IMG_DIR, '{}_{}_{}.png'.format(self._details.env_name, discount_factor, '{}_{}')),
                                       map_desc, self._details.env.colors(), self._details.env.directions(),
                                       'Value Iteration', 'Step', self._details, only_last=True) # plot final policy and final utilities computed by VI

            # Run the learned policy num_trials times; the returned stats hold the average
            # step reward in each trial and the average over all trials.
            optimal_policy_stats = self.run_policy_and_collect(v, stats.optimal_policy, self._num_trials)
            self.log('{}'.format(optimal_policy_stats))
            # Average step reward, accumulated over an increasing number of trials.
            optimal_policy_stats.to_csv(os.path.join(VI_DIR, '{}_{}_optimal.csv'.format(self._details.env_name, discount_factor)))
            with open(grid_file_name, 'a') as f:
                f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                    json.dumps({'discount_factor': discount_factor}).replace('"', '""'),
                    time.perf_counter() - t,
                    len(optimal_policy_stats.rewards),  # number of trials
                    optimal_policy_stats.reward_mean,  # average step reward over the num_trials trials
                    optimal_policy_stats.reward_median,
                    optimal_policy_stats.reward_min,
                    optimal_policy_stats.reward_max,
                    optimal_policy_stats.reward_std,
                ))
            runs += 1
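Once a grid CSV has been written, the per-discount-factor summaries can be compared directly. A small sketch using pandas and matplotlib (both assumptions, as is the path):

import json
import pandas as pd
import matplotlib.pyplot as plt

grid = pd.read_csv('output/VI/frozen_lake_grid.csv')  # illustrative path
# Recover the discount factor from the JSON-encoded params column.
grid['discount_factor'] = grid['params'].apply(lambda s: json.loads(s)['discount_factor'])

ax = grid.plot(x='discount_factor', y='reward_mean', marker='o', legend=False)
ax.set_xlabel('discount factor')
ax.set_ylabel('mean per-step reward')
plt.savefig('vi_discount_sweep.png')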
Example 4
    def perform(self):
        """

        :Outputs:
        -   OUTPUT_DIRECTORY/env_name_grid.csv
            -   Summary of each discount factor:
                -   Steps indicates number of full walks through the environment used to evaluate rewards
                -   Times are for the entire simulation on this discount factor, including training time and time to do
                    on-policy evaluation of rewards
                -   Rewards are per-step reward (set by self.run_policy_and_collect).
        """

        # Value iteration
        self._details.env.reset()
        map_desc = self._details.env.unwrapped.desc

        grid_file_name = os.path.join(
            VI_DIR, '{}_grid.csv'.format(self._details.env_name))
        with open(grid_file_name, 'w') as f:
            f.write(
                "params,discount_factor,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
            )

        dims = len(self._discount_factors)
        self.log("Searching VI in {} dimensions".format(dims))

        runs = 1
        for discount_factor in self._discount_factors:
            t = time.perf_counter()
            self.log("{}/{} Processing VI with discount factor {}".format(
                runs, dims, discount_factor))

            v = solvers.ValueIterationSolver(self._details.env,
                                             discount_factor=discount_factor,
                                             theta=self._theta)

            stats = self.run_solver_and_collect(v, self.convergence_check_fn)

            self.log("Took {} steps".format(len(stats.steps)))
            stats.to_csv(
                os.path.join(
                    VI_DIR, '{}_{}.csv'.format(self._details.env_name,
                                               discount_factor)))
            stats.pickle_results(
                os.path.join(
                    PKL_DIR, '{}_{}_{}.pkl'.format(self._details.env_name,
                                                   discount_factor, '{}')),
                map_desc.shape)
            stats.plot_policies_on_map(os.path.join(
                IMG_DIR, '{}_{}_{}.png'.format(self._details.env_name,
                                               discount_factor, '{}_{}')),
                                       map_desc,
                                       self._details.env.colors(),
                                       self._details.env.directions(),
                                       'Value Iteration',
                                       'Step',
                                       self._details,
                                       only_last=True)

            optimal_policy_stats = self.run_policy_and_collect(
                v, stats.best_policy, self._num_trials)
            self.log('{}'.format(optimal_policy_stats))
            optimal_policy_stats.to_csv(
                os.path.join(
                    VI_DIR, '{}_{}_optimal.csv'.format(self._details.env_name,
                                                       discount_factor)))
            with open(grid_file_name, 'a') as f:
                f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                    json.dumps({
                        'discount_factor': discount_factor
                    }).replace('"', '""'),
                    discount_factor,
                    time.perf_counter() - t,
                    len(optimal_policy_stats.rewards),
                    optimal_policy_stats.reward_mean,
                    optimal_policy_stats.reward_median,
                    optimal_policy_stats.reward_min,
                    optimal_policy_stats.reward_max,
                    optimal_policy_stats.reward_std,
                ))
            runs += 1
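None of the examples show convergence_check_fn. A common stopping rule for value iteration is to halt once the largest per-state value change falls below theta; the helper below is a hypothetical sketch of that rule, not the project's actual function:

def make_convergence_check_fn(theta=1e-3):
    """Hypothetical factory: returns a callable that reports convergence once the
    largest per-state change between successive value estimates is below theta."""
    def convergence_check(old_values, new_values):
        delta = max(abs(new - old) for old, new in zip(old_values, new_values))
        return delta < theta
    return convergence_check

# e.g. check = make_convergence_check_fn(theta=1e-4); done = check(prev_V, curr_V)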