Example #1
    def perform(self):
        # Q-Learner
        self._details.env.reset()
        map_desc = self._details.env.unwrapped.desc

        grid_file_name = os.path.join(
            QL_DIR, '{}_grid.csv'.format(self._details.env_name))
        with open(grid_file_name, 'w') as f:
            f.write(
                "params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
            )

        alphas = ALPHAS
        q_inits = Q_INITS
        epsilons = EPSILONS
        discount_factors = np.round(
            np.linspace(DISCOUNT_MIN,
                        max(DISCOUNT_MIN, DISCOUNT_MAX),
                        num=NUM_DISCOUNTS), 2)
        dims = len(discount_factors) * len(alphas) * len(q_inits) * len(
            epsilons) * len(self._epsilon_decays)
        self.log("Searching Q in {} dimensions".format(dims))

        runs = 1
        for alpha in alphas:
            for q_init in q_inits:
                for epsilon in epsilons:
                    for epsilon_decay in self._epsilon_decays:
                        for discount_factor in discount_factors:
                            t = time.perf_counter()
                            self.log(
                                "{}/{} Processing QL with alpha {}, q_init {}, epsilon {}, epsilon_decay {},"
                                " discount_factor {}".format(
                                    runs, dims, alpha, q_init, epsilon,
                                    epsilon_decay, discount_factor))

                            qs = solvers.QLearningSolver(
                                self._details.env,
                                self._max_episodes,
                                discount_factor=discount_factor,
                                alpha=alpha,
                                epsilon=epsilon,
                                epsilon_decay=epsilon_decay,
                                q_init=q_init,
                                min_consecutive_sub_theta_episodes=self._min_sub_thetas,
                                verbose=self._verbose,
                                theta=self._theta)

                            stats = self.run_solver_and_collect(
                                qs, self.convergence_check_fn)

                            self.log("Took {} episodes".format(len(
                                stats.steps)))
                            stats.to_csv(
                                os.path.join(
                                    QL_DIR, '{}_{}_{}_{}_{}_{}.csv'.format(
                                        self._details.env_name, alpha, q_init,
                                        epsilon, epsilon_decay,
                                        discount_factor)))
                            stats.pickle_results(
                                os.path.join(
                                    PKL_DIR, '{}_{}_{}_{}_{}_{}_{}.pkl'.format(
                                        self._details.env_name, alpha, q_init,
                                        epsilon, epsilon_decay, discount_factor,
                                        '{}')),
                                map_desc.shape,
                                step_size=self._max_episodes / 20.0)
                            stats.plot_policies_on_map(
                                os.path.join(
                                    IMG_DIR, '{}_{}_{}_{}_{}_{}_{}.png'.format(
                                        self._details.env_name, alpha, q_init,
                                        epsilon, epsilon_decay,
                                        discount_factor, '{}_{}')),
                                map_desc,
                                self._details.env.colors(),
                                self._details.env.directions(),
                                'Q-Learner',
                                'Episode',
                                self._details,
                                step_size=self._max_episodes / 20.0,
                                only_last=True)

                            # We have extra stats about the episode we might want to look at later
                            episode_stats = qs.get_stats()
                            episode_stats.to_csv(
                                os.path.join(
                                    QL_DIR,
                                    '{}_{}_{}_{}_{}_{}_episode.csv'.format(
                                        self._details.env_name, alpha, q_init,
                                        epsilon, epsilon_decay,
                                        discount_factor)))

                            optimal_policy_stats = self.run_policy_and_collect(
                                qs, stats.optimal_policy, self._num_trials)
                            self.log('{}'.format(optimal_policy_stats))
                            optimal_policy_stats.to_csv(
                                os.path.join(
                                    QL_DIR,
                                    '{}_{}_{}_{}_{}_{}_optimal.csv'.format(
                                        self._details.env_name, alpha, q_init,
                                        epsilon, epsilon_decay,
                                        discount_factor)))

                            with open(grid_file_name, 'a') as f:
                                f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                                    json.dumps({
                                        'alpha': alpha,
                                        'q_init': q_init,
                                        'epsilon': epsilon,
                                        'epsilon_decay': epsilon_decay,
                                        'discount_factor': discount_factor,
                                    }).replace('"', '""'),
                                    time.perf_counter() - t,
                                    len(optimal_policy_stats.rewards),
                                    optimal_policy_stats.reward_mean,
                                    optimal_policy_stats.reward_median,
                                    optimal_policy_stats.reward_min,
                                    optimal_policy_stats.reward_max,
                                    optimal_policy_stats.reward_std,
                                ))
                            runs += 1
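
The five nested hyperparameter loops in this example can also be expressed as a single flattened loop; the fragment below is only an equivalent sketch of the iteration (same combinations, same order), not part of the original code, and the per-combination body is elided.

from itertools import product

# Sketch only: equivalent to the nested loops above. The names
# (alphas, q_inits, epsilons, self._epsilon_decays, discount_factors)
# are the ones defined in the example.
for alpha, q_init, epsilon, epsilon_decay, discount_factor in product(
        alphas, q_inits, epsilons, self._epsilon_decays, discount_factors):
    ...  # identical per-combination body as above
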
    def run_q(self, params):
        grid_file_name = '{}/Q/{}_grid.csv'.format(OUTPUT_DIRECTORY, self._details.env_name)
        alpha, q_init, epsilon, epsilon_decay, discount_factor, runs, dims, map_desc = params
        print("Processing run {}".format(runs))
        t = time.perf_counter()
        self.log("{}/{} Processing Q with alpha {}, q_init {}, epsilon {}, epsilon_decay {},"
                 " discount_factor {}".format(
            runs, dims, alpha, q_init, epsilon, epsilon_decay, discount_factor
        ))

        qs = solvers.QLearningSolver(self._details.env, self.max_episodes,
                                     discount_factor=discount_factor,
                                     alpha=alpha,
                                     epsilon=epsilon, epsilon_decay=epsilon_decay,
                                     q_init=q_init, verbose=self._verbose, theta=0.001)

        stats = self.run_solver_and_collect(qs, self.convergence_check_fn)

        self.log("Took {} episodes".format(len(stats.steps)))
        stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}.csv'.format(OUTPUT_DIRECTORY, self._details.env_name,
                                                      alpha, q_init, epsilon, epsilon_decay,
                                                      discount_factor))
        stats.pickle_results('{}/Q/pkl/{}_{}_{}_{}_{}_{}_{}.pkl'.format(OUTPUT_DIRECTORY,
                                                                        self._details.env_name,
                                                                        alpha, q_init, epsilon,
                                                                        epsilon_decay,
                                                                        discount_factor,
                                                                        '{}'), map_desc.shape,
                              step_size=self.max_episodes/20.0)
        stats.plot_policies_on_map('{}/images/Q/{}_{}_{}_{}_{}_{}_{}.png'.format(OUTPUT_DIRECTORY,
                                                                              self._details.env_name,
                                                                              alpha, q_init, epsilon,
                                                                              epsilon_decay,
                                                                              discount_factor,
                                                                              '{}_{}'),
                                   map_desc, self._details.env.colors(),
                                   self._details.env.directions(),
                                   'Q-Learner', 'Episode', self._details,
                                   step_size=self.max_episodes / 20.0,
                                   only_last=True)

        # We have extra stats about the episode we might want to look at later
        episode_stats = qs.get_stats()
        episode_stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}_episode.csv'.format(OUTPUT_DIRECTORY,
                                                                         self._details.env_name,
                                                                         alpha, q_init, epsilon,
                                                                         epsilon_decay,
                                                                         discount_factor))

        optimal_policy_stats = self.run_policy_and_collect(qs, stats.optimal_policy)
        self.log('{}'.format(optimal_policy_stats))
        optimal_policy_stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}_optimal.csv'.format(OUTPUT_DIRECTORY,
                                                                             self._details.env_name,
                                                                             alpha, q_init, epsilon,
                                                                             epsilon_decay,
                                                                             discount_factor))

        with open(grid_file_name, 'a') as f:
            f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                json.dumps({
                    'alpha': alpha,
                    'q_init': q_init,
                    'epsilon': epsilon,
                    'epsilon_decay': epsilon_decay,
                    'discount_factor': discount_factor,
                }).replace('"', '""'),
                time.perf_counter() - t,
                len(optimal_policy_stats.rewards),
                optimal_policy_stats.reward_mean,
                optimal_policy_stats.reward_median,
                optimal_policy_stats.reward_min,
                optimal_policy_stats.reward_max,
                optimal_policy_stats.reward_std,
            ))
        return runs
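
run_q packs every hyperparameter into a single params tuple and returns the run index, which suggests it is meant to be mapped over a worker pool. The dispatch below is only an illustrative assumption of how a caller might use it; `experiment` (the object providing run_q) and the hyperparameter lists are assumed to exist in the calling scope.

from itertools import product
from multiprocessing.pool import ThreadPool

# Hypothetical caller (assumption, not from the example): build one params
# tuple per combination and map run_q over a thread pool. A thread pool avoids
# pickling the environment held by `experiment`.
combos = list(product(alphas, q_inits, epsilons, epsilon_decays, discount_factors))
param_tuples = [(a, q, e, ed, df, i + 1, len(combos), map_desc)
                for i, (a, q, e, ed, df) in enumerate(combos)]
with ThreadPool(processes=4) as pool:
    pool.map(experiment.run_q, param_tuples)
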
Example #3
    def perform(self):
        # Q-Learner
        self._details.env.reset()
        map_desc = self._details.env.unwrapped.desc

        grid_file_name = '{}/Q/{}_grid.csv'.format(OUTPUT_DIRECTORY,
                                                   self._details.env_name)
        with open(grid_file_name, 'w') as f:
            f.write(
                "params,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
            )

        alphas = [0.1, 0.3, 0.5, 0.7, 0.9]
        q_inits = ['random', 0]
        epsilons = [0.1, 0.3, 0.5, 0.7, 0.9]
        #epsilon_decays = [0.0001]
        epsilon_decays = [0.001]
        #discount_factors = np.round(np.linspace(0, 0.9, num=10), 2)
        discount_factors = np.round(np.linspace(0.1, 0.9, num=5), 2)
        dims = len(discount_factors) * len(alphas) * len(q_inits) * len(
            epsilons) * len(epsilon_decays)
        self.log("Searching Q in {} dimensions".format(dims))

        runs = 1
        for alpha in alphas:
            for q_init in q_inits:
                for epsilon in epsilons:
                    for epsilon_decay in epsilon_decays:
                        for discount_factor in discount_factors:
                            t = time.perf_counter()
                            self.log(
                                "{}/{} Processing Q with alpha {}, q_init {}, epsilon {}, epsilon_decay {},"
                                " discount_factor {}".format(
                                    runs, dims, alpha, q_init, epsilon,
                                    epsilon_decay, discount_factor))

                            qs = solvers.QLearningSolver(
                                self._details.env,
                                self.max_episodes,
                                discount_factor=discount_factor,
                                alpha=alpha,
                                epsilon=epsilon,
                                epsilon_decay=epsilon_decay,
                                q_init=q_init,
                                verbose=self._verbose)

                            stats = self.run_solver_and_collect(
                                qs, self.convergence_check_fn)

                            self.log("Took {} episodes".format(len(
                                stats.steps)))
                            stats.to_csv('{}/Q/{}_{}_{}_{}_{}_{}.csv'.format(
                                OUTPUT_DIRECTORY, self._details.env_name,
                                alpha, q_init, epsilon, epsilon_decay,
                                discount_factor))
                            stats.pickle_results(
                                '{}/Q/pkl/{}_{}_{}_{}_{}_{}_{}.pkl'.format(
                                    OUTPUT_DIRECTORY, self._details.env_name,
                                    alpha, q_init, epsilon, epsilon_decay,
                                    discount_factor, '{}'),
                                map_desc.shape,
                                step_size=self.max_episodes / 20.0)
                            stats.plot_policies_on_map(
                                '{}/images/Q/{}_{}_{}_{}_{}_{}_{}.png'.format(
                                    OUTPUT_DIRECTORY, self._details.env_name,
                                    alpha, q_init, epsilon, epsilon_decay,
                                    discount_factor, '{}_{}'),
                                map_desc,
                                self._details.env.colors(),
                                self._details.env.directions(),
                                'Q-Learner',
                                'Episode',
                                self._details,
                                step_size=self.max_episodes / 20.0,
                                only_last=True)

                            # We have extra stats about the episode we might want to look at later
                            episode_stats = qs.get_stats()
                            episode_stats.to_csv(
                                '{}/Q/{}_{}_{}_{}_{}_{}_episode.csv'.format(
                                    OUTPUT_DIRECTORY, self._details.env_name,
                                    alpha, q_init, epsilon, epsilon_decay,
                                    discount_factor))

                            optimal_policy_stats = self.run_policy_and_collect(
                                qs, stats.optimal_policy)
                            self.log('{}'.format(optimal_policy_stats))
                            optimal_policy_stats.to_csv(
                                '{}/Q/{}_{}_{}_{}_{}_{}_optimal.csv'.format(
                                    OUTPUT_DIRECTORY, self._details.env_name,
                                    alpha, q_init, epsilon, epsilon_decay,
                                    discount_factor))

                            with open(grid_file_name, 'a') as f:
                                f.write('"{}",{},{},{},{},{},{},{}\n'.format(
                                    json.dumps({
                                        'alpha': alpha,
                                        'q_init': q_init,
                                        'epsilon': epsilon,
                                        'epsilon_decay': epsilon_decay,
                                        'discount_factor': discount_factor,
                                    }).replace('"', '""'),
                                    time.perf_counter() - t,
                                    len(optimal_policy_stats.rewards),
                                    optimal_policy_stats.reward_mean,
                                    optimal_policy_stats.reward_median,
                                    optimal_policy_stats.reward_min,
                                    optimal_policy_stats.reward_max,
                                    optimal_policy_stats.reward_std,
                                ))
                            runs += 1
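
The QLearningSolver internals are not shown in any of these examples; as a point of reference, the tabular update the solver is named after looks roughly like the sketch below (the actual epsilon-decay schedule varies between implementations).

import numpy as np

# Generic tabular Q-learning step (illustrative only, not the solver's code).
# Q is an (n_states, n_actions) array; alpha and discount_factor are the
# hyperparameters being grid-searched above.
def q_update(Q, state, action, reward, next_state, alpha, discount_factor):
    td_target = reward + discount_factor * np.max(Q[next_state])
    Q[state, action] += alpha * (td_target - Q[state, action])

# Between episodes, epsilon is typically decayed toward a floor, e.g.:
# epsilon = max(epsilon_min, epsilon * (1.0 - epsilon_decay))
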
    def perform(self):
        # Q-Learner
        self._details.env.reset()
        map_desc = self._details.env.unwrapped.desc

        grid_file_name = os.path.join(
            QL_DIR, '{}_grid.csv'.format(self._details.env_name))
        with open(grid_file_name, 'w') as f:
            f.write(
                "params,q_init,alpha_initial,alpha_min,alpha_decay,epsilon_initial,epsilon_min,epsilon_decay,"
                "discount_factor,time,steps,reward_mean,reward_median,reward_min,reward_max,reward_std\n"
            )

        dims = len(self._discount_factors) * len(self._alphas) * len(
            self._q_inits) * len(self._epsilons)
        self.log("Searching Q in {} dimensions".format(dims))

        runs = 1
        for alpha in self._alphas:
            for q_init in self._q_inits:
                for epsilon in self._epsilons:
                    for discount_factor in self._discount_factors:
                        t = time.perf_counter()
                        self.log(
                            f"{runs}/{dims} Processing QL with alpha {alpha['initial']}->{alpha['min']} "
                            f"(decay={alpha['decay']}), q_init {q_init}, epsilon {epsilon['initial']}->"
                            f"{epsilon['min']} (decay={epsilon['decay']}), discount_factor {discount_factor}"
                        )

                        # Build a QLearningSolver object
                        qs = solvers.QLearningSolver(
                            self._details.env,
                            self._max_episodes,
                            self._min_episodes,
                            max_steps_per_episode=self._max_episode_steps,
                            discount_factor=discount_factor,
                            alpha_initial=alpha['initial'],
                            alpha_decay=alpha['decay'],
                            alpha_min=alpha['min'],
                            epsilon_initial=epsilon['initial'],
                            epsilon_decay=epsilon['decay'],
                            epsilon_min=epsilon['min'],
                            q_init=q_init,
                            min_consecutive_sub_theta_episodes=self._min_sub_thetas,
                            verbose=self._verbose,
                            theta=self._theta)

                        # Run the solver to generate an optimal policy. The returned stats
                        # object records per-episode details and the best policy found.
                        stats = self.run_solver_and_collect(
                            qs, self.convergence_check_fn)

                        self.log("Took {} episodes".format(len(stats.steps)))

                        filename_base = params_to_filename_base(
                            self._details.env_name, alpha["initial"],
                            alpha["min"], alpha["decay"], q_init,
                            epsilon["initial"], epsilon["min"],
                            epsilon["decay"], discount_factor)

                        stats.to_csv(
                            os.path.join(QL_DIR, f'{filename_base}.csv'))
                        stats.pickle_results(
                            os.path.join(PKL_DIR, f'{filename_base}_{{}}.pkl'),
                            map_desc.shape,
                            step_size=self._max_episodes / 20.0)
                        stats.plot_policies_on_map(
                            os.path.join(IMG_DIR,
                                         f'{filename_base}_{{}}_{{}}.png'),
                            map_desc,
                            self._details.env.colors(),
                            self._details.env.directions(),
                            'Q-Learner',
                            'Episode',
                            self._details,
                            step_size=self._max_episodes / 20.0,
                            only_last=True)

                        # We have extra stats about the episode we might want to look at later
                        episode_stats = qs.get_stats()
                        episode_stats.to_csv(
                            os.path.join(QL_DIR,
                                         f'{filename_base}_episode.csv'))

                        optimal_policy_stats = self.run_policy_and_collect(
                            qs, stats.best_policy, self._num_trials)
                        self.log('{}'.format(optimal_policy_stats))
                        optimal_policy_stats.to_csv(
                            os.path.join(QL_DIR,
                                         f'{filename_base}_optimal.csv'))

                        with open(grid_file_name, 'a') as f:
                            # Collect the row values as an iterable.
                            # TODO: replace this pattern (header string written above, values
                            # joined by hand here) with csv.writer or pandas DataFrame.to_csv.
                            # Single-group version (kept for legacy support).
                            params = json.dumps({
                                'q_init': q_init,
                                'alpha_initial': alpha['initial'],
                                'alpha_min': alpha['min'],
                                'alpha_decay': alpha['decay'],
                                'epsilon_initial': epsilon['initial'],
                                'epsilon_min': epsilon['min'],
                                'epsilon_decay': epsilon['decay'],
                                'discount_factor': discount_factor,
                            }).replace('"', '""')
                            data = [
                                f'"{params}"',
                                q_init,
                                alpha['initial'],
                                alpha['min'],
                                alpha['decay'],
                                epsilon['initial'],
                                epsilon['min'],
                                epsilon['decay'],
                                discount_factor,
                                time.perf_counter() - t,
                                len(optimal_policy_stats.rewards),
                                optimal_policy_stats.reward_mean,
                                optimal_policy_stats.reward_median,
                                optimal_policy_stats.reward_min,
                                optimal_policy_stats.reward_max,
                                optimal_policy_stats.reward_std,
                            ]
                            # Convert to a single csv string
                            data_as_string = ",".join([str(d) for d in data])
                            f.write(f'{data_as_string}\n')
                        runs += 1
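
The TODO in this last example asks whether the hand-built header and row strings could be replaced with a standard CSV writer; a minimal sketch of that approach is shown below, using the same field order as the grid file header (ideally the grid file is opened with newline='').

import csv
import json
import time

# Sketch only: write the same grid row with csv.writer instead of manual joins.
# `f` is the open grid file; the remaining names come from the loop above.
writer = csv.writer(f)
writer.writerow([
    json.dumps({'q_init': q_init, 'discount_factor': discount_factor}),  # params (abbreviated)
    q_init, alpha['initial'], alpha['min'], alpha['decay'],
    epsilon['initial'], epsilon['min'], epsilon['decay'], discount_factor,
    time.perf_counter() - t,
    len(optimal_policy_stats.rewards),
    optimal_policy_stats.reward_mean, optimal_policy_stats.reward_median,
    optimal_policy_stats.reward_min, optimal_policy_stats.reward_max,
    optimal_policy_stats.reward_std,
])

csv.writer quotes the JSON params field automatically when needed, so the manual '"' to '""' escaping used in the examples would no longer be necessary.
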