Example No. 1
    def add_features(self, feat_func: RadialBasisFunction, approximator_list, optimizer_list, alpha, iteration_number):

        if self.baseline:
            return False

        # Add features halfway through training
        if self.phased_training and iteration_number + 1 == MIDPOINT:
            num_good_features = 0
            num_bad_features = 0

            # add 5 new good centers to the feature function
            if self.experiment_type in ['add_good_feats','add_5_good_5_bad','add_5_good_20_bad','add_5_good_100_bad']:
                feat_func.add_feature(5, noise_mean=0, noise_var=0, fake_feature=False)
                num_good_features += 5

            # add irrelevant features to the feature function
            if self.experiment_type in ['add_bad_feats','add_5_good_5_bad','add_5_good_20_bad','add_5_good_100_bad']:
                if self.experiment_type in ['add_bad_feats', 'add_5_good_5_bad']:
                    num_bad_features += 5
                elif self.experiment_type == 'add_5_good_20_bad':
                    num_bad_features += 20
                elif self.experiment_type == 'add_5_good_100_bad':
                    num_bad_features += 100
                feat_func.add_feature(num_bad_features, noise_mean=0.0, noise_var=5*self.config.init_noise_var,
                                      fake_feature=self.add_fake_features)

            # add features to the optimizer and the function approximator
            num_new_features = num_good_features + num_bad_features
            for k in range(self.num_actions):
                approximator_list[k].increase_num_features(num_new_features)

                if self.method != 'sgd':    # if method is not sgd, stepsizes are set automatically
                    optimizer_list[k].increase_size(num_new_features)
                else:                       # otherwise, set the stepsize manually
                    if isinstance(alpha, tuple):
                        # if adding good and bad features, add good features with stepsize = alpha[0]
                        # and bad features with stepsize = alpha[1]
                        good_features_stepsize, bad_features_stepsize = alpha
                        optimizer_list[k].increase_size(num_good_features, good_features_stepsize)
                        optimizer_list[k].increase_size(num_bad_features, bad_features_stepsize)
                    else:
                        # otherwise, alpha is a scalar
                        optimizer_list[k].increase_size(num_new_features, alpha)
            return True     # True indicates that features were added to the representation

        # Add features every ADD_FEATURE_INTERVAL steps
        elif self.experiment_type == 'continuously_add_bad' and (iteration_number + 1) % ADD_FEATURE_INTERVAL == 0:
            # add bad features
            feat_func.add_feature(1, noise_mean=0.0, noise_var=5*self.config.init_noise_var,
                                  fake_feature=self.add_fake_features)

            for k in range(self.num_actions):   # extend function approximator and optimizer
                approximator_list[k].increase_num_features(1)
                if self.method != 'sgd':        # if not sgd, stepsizes are set automatically
                    optimizer_list[k].increase_size(1)
                else:                           # if sgd, stepsizes are set manually
                    optimizer_list[k].increase_size(1, init_stepsize=alpha)
            return True     # True indicates that features were added to the representation

        return False        # False indicates that no feature was added to the representation
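The add_features method above relies on two hooks defined elsewhere in the repo: approximator.increase_num_features(n) and optimizer.increase_size(n, init_stepsize). Below is a minimal sketch of what such hooks could look like, assuming zero-initialized weights and a per-feature stepsize vector; the repo's actual LinearFunctionApproximator and optimizer classes may differ.

import numpy as np

# Hypothetical, simplified versions of the interfaces add_features relies on.
class ToyLinearApproximator:
    def __init__(self, num_features):
        self.weights = np.zeros(num_features, dtype=np.float64)

    def increase_num_features(self, num_new_features):
        # append zero-initialized weights for the newly added features
        self.weights = np.concatenate([self.weights, np.zeros(num_new_features)])

class ToySGDOptimizer:
    def __init__(self, num_features, stepsize):
        self.stepsizes = np.full(num_features, stepsize, dtype=np.float64)

    def increase_size(self, num_new_features, init_stepsize):
        # new features can receive their own stepsize (e.g., alpha[0] vs alpha[1] above)
        self.stepsizes = np.concatenate(
            [self.stepsizes, np.full(num_new_features, init_stepsize)])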
Example No. 2
    def run(self):
        results_dir = {'sample_size': self.sample_size}
        alphas, names = self.get_alphas_and_names()

        for a, alpha in enumerate(alphas):
            self._print("Currently working on: {0}".format(names[a]))
            self.setup_baseline(alpha)

            # For measuring performance
            avg_mse_per_checkpoint = np.zeros(
                (self.sample_size, self.num_transitions // CHECKPOINT),
                dtype=np.float64)
            diverging_runs = np.zeros(self.sample_size, dtype=np.int8)
            action_counter = np.zeros(
                (self.sample_size, self.num_transitions // CHECKPOINT,
                 self.num_actions),
                dtype=np.int32)
            weight_sum_per_checkpoint = np.zeros(
                (self.sample_size, self.num_transitions // CHECKPOINT,
                 self.num_actions, self.config.max_num_features),
                dtype=np.float64)
            # For keeping track of stepsizes
            if self.method not in ['sgd', 'dense_baseline']:
                stepsize_sum_per_checkpoint = np.zeros(
                    (self.sample_size, self.num_transitions // CHECKPOINT,
                     self.num_actions, self.config.max_num_features),
                    dtype=np.float64)
            # start processing samples
            for i in range(self.sample_size):
                seed_number = self.init_seed + i
                self._print("\tCurrent Seed: {0}".format(seed_number))

                ff = RadialBasisFunction(self.config)  # feature function
                approximators = []  # one function approximator per action
                optimizers = []  # one optimizer per action
                for _ in range(self.num_actions):
                    approximators.append(
                        LinearFunctionApproximator(self.config))
                    optimizers.append(OPTIMIZER_DICT[self.method](self.config))

                curr_checkpoint = 0
                # load data
                states, actions, rewards, terminations, avg_disc_return = self.load_data(
                    seed_number=seed_number)
                """ Start of Training """
                curr_obs_feats = ff.get_observable_features(
                    states[0])  # current observable features
                j = 0
                while j < self.num_transitions:
                    curr_a = actions[j]  # current action
                    curr_av = approximators[curr_a].get_prediction(
                        curr_obs_feats)  # current value
                    next_s = states[j + 1]  # next state
                    next_r = rewards[j + 1]  # next reward
                    next_term = terminations[j + 1]  # next termination
                    next_a = actions[j + 1]  # next action

                    # get next observable features and action-value
                    next_obs_feats = ff.get_observable_features(next_s)
                    next_av = approximators[next_a].get_prediction(
                        next_obs_feats)
                    # compute TD error for Sarsa(0)
                    td_error = next_r + self.gamma * (
                        1 - next_term) * next_av - curr_av
                    # update weight vector
                    _, ss, new_weights = optimizers[
                        curr_a].update_weight_vector(
                            td_error, curr_obs_feats,
                            approximators[curr_a].get_weight_vector())
                    approximators[curr_a].update_weight_vector(new_weights)

                    # handle cases where weights diverge
                    if np.sum(np.isnan(new_weights)) > 0 or np.sum(
                            np.isinf(new_weights)) > 0:
                        print(
                            "\tThe weights diverged on iteration: {0}!".format(
                                j + 1))
                        avg_mse_per_checkpoint[i][curr_checkpoint:] += 1000000
                        diverging_runs[i] += np.int8(1)
                        break

                    # update state information and progress
                    curr_obs_feats = next_obs_feats
                    avg_mse_per_checkpoint[i][curr_checkpoint] += np.square(
                        curr_av - avg_disc_return[j]) / CHECKPOINT
                    action_counter[i][curr_checkpoint][curr_a] += np.int32(1)
                    weight_sum_per_checkpoint[i][curr_checkpoint][
                        curr_a][:new_weights.size] += new_weights
                    if self.method not in ['sgd', 'dense_baseline']:
                        stepsize_sum_per_checkpoint[i][curr_checkpoint][
                            curr_a][:ss.size] += ss

                    # increase iteration number and process checkpoints
                    j += 1
                    if j % CHECKPOINT == 0: curr_checkpoint += 1

                    # add features
                    if self.add_features(ff, approximators, optimizers, alpha,
                                         j):
                        curr_obs_feats = ff.get_observable_features(states[j])

                    # handle terminal states
                    if next_term and j < self.num_transitions:
                        j += 1  # skips terminal state
                        if j % CHECKPOINT == 0: curr_checkpoint += 1
                        curr_obs_feats = ff.get_observable_features(states[j])

            results_dir[names[a]] = {
                'avg_mse_per_checkpoint': avg_mse_per_checkpoint,
                'diverging_runs': diverging_runs,
                'action_counter': action_counter,
                'weight_sum_per_checkpoint': weight_sum_per_checkpoint
            }
            if self.method not in ['sgd', 'dense_baseline']:
                results_dir[names[a]][
                    'stepsize_sum_per_checkpoint'] = stepsize_sum_per_checkpoint

            if DEBUG:
                agg_results = np.average(avg_mse_per_checkpoint, axis=0)
                import matplotlib.pyplot as plt
                plt.plot((np.arange(agg_results.size) + 1) * CHECKPOINT,
                         agg_results)
                plt.vlines(MIDPOINT,
                           ymin=agg_results.min(),
                           ymax=agg_results.max())
                plt.show()
                plt.close()

        self.store_results(results_dir)
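The arrays stored in results_dir hold per-checkpoint sums rather than averages. A small post-processing sketch (the function name and the analysis below are assumptions, not part of the repo) showing how they might be turned into per-checkpoint means:

import numpy as np

def summarize(avg_mse_per_checkpoint, weight_sum_per_checkpoint, action_counter):
    # mean squared error averaged over runs: one value per checkpoint
    mse_curve = avg_mse_per_checkpoint.mean(axis=0)
    # weight sums are accumulated once per transition in which the action was
    # taken, so divide by the per-action visit counts to recover a mean weight
    counts = np.maximum(action_counter, 1)[..., np.newaxis]  # avoid division by zero
    mean_weights = weight_sum_per_checkpoint / counts
    return mse_curve, mean_weights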
Example No. 3
    def run(self):
        results_dir = {'sample_size': self.sample_size}
        alphas, names = self.get_alphas_and_names()

        for a, alpha in enumerate(alphas):
            self._print("Currently working on: {0}".format(names[a]))
            self.setup_baseline(alpha)

            # For measuring performance
            reward_per_step = np.zeros((self.sample_size, self.num_transitions), dtype=np.int8)
            diverging_runs = np.zeros(self.sample_size, dtype=np.int8)
            action_counter = np.zeros((self.sample_size, self.num_transitions // CHECKPOINT,
                                       self.num_actions), dtype=np.int32)
            weight_sum_per_checkpoint = np.zeros((self.sample_size, self.num_transitions // CHECKPOINT,
                                                  self.num_actions, self.config.max_num_features), dtype=np.float64)
            # For keeping track of stepsizes
            if self.method != 'sgd':
                stepsize_sum_per_checkpoint = np.zeros((self.sample_size, self.num_transitions // CHECKPOINT,
                                                        self.num_actions, self.config.max_num_features),
                                                       dtype=np.float64)
            # start processing samples
            for i in range(self.sample_size):
                np.random.seed(i)
                self._print("\tCurrent sample: {0}".format(i+1))

                ff = RadialBasisFunction(self.config)   # feature function
                env = MountainCar(self.config)          # environment: Mountain Car
                approximators = []                      # one function approximator per action
                optimizers = []                         # one optimizer per action
                for _ in range(self.num_actions):
                    approximators.append(LinearFunctionApproximator(self.config))
                    optimizers.append(OPTIMIZER_DICT[self.method](self.config))
                # learning_approximators = approximators
                # pe_phase = False    # pe = policy evaluation
                # curr_pe_iterations = 0
                # total_pe_iterations = 100

                curr_checkpoint = 0
                """ Start of Training """
                curr_s = env.get_current_state()                                    # current state
                curr_obs_feats = ff.get_observable_features(curr_s)                 # current observable features
                for j in range(self.num_transitions):
                    curr_avs = self.get_action_values(curr_obs_feats, approximators)    # current action values
                    curr_a = self.epsilon_greedy_policy(curr_avs)                       # current action
                    # execute action
                    next_s, r, terminal = env.step(curr_a)               # r = reward
                    # compute next action values and action
                    next_obs_feats = ff.get_observable_features(next_s)
                    next_avs = self.get_action_values(next_obs_feats, approximators)
                    next_a = self.epsilon_greedy_policy(next_avs)
                    # compute TD error of Sarsa(0)
                    # curr_av = learning_approximators[curr_a].get_prediction(curr_obs_feats)
                    # next_av = learning_approximators[next_a].get_prediction(next_obs_feats)
                    # td_error = r + self.gamma * (1-terminal) * next_av - curr_av
                    # _, ss, new_weights = optimizers[curr_a].update_weight_vector(td_error, curr_obs_feats,
                    #                                                 learning_approximators[curr_a].get_weight_vector())
                    # learning_approximators[curr_a].update_weight_vector(new_weights)
                    # if pe_phase:
                    #     curr_pe_iterations += 1
                    #     if curr_pe_iterations == total_pe_iterations:
                    #         pe_phase = False
                    #         curr_pe_iterations = 0
                    #         approximators = learning_approximators#copy.deepcopy(learning_approximators)

                    td_error = r + self.gamma * (1-terminal) * next_avs[next_a] - curr_avs[curr_a]
                    _, ss, new_weights = optimizers[curr_a].update_weight_vector(td_error, curr_obs_feats,
                                                                        approximators[curr_a].get_weight_vector())
                    # update weight vector
                    approximators[curr_a].update_weight_vector(new_weights)
                    # update state information and progress
                    curr_obs_feats = next_obs_feats
                    reward_per_step[i][j] += np.int8(r)
                    action_counter[i][curr_checkpoint][curr_a] += np.int32(1)
                    weight_sum_per_checkpoint[i][curr_checkpoint][curr_a][:new_weights.size] += new_weights
                    if self.method != 'sgd':
                        stepsize_sum_per_checkpoint[i][curr_checkpoint][curr_a][:ss.size] += ss

                    # handle cases where weights diverge
                    if np.sum(np.isnan(new_weights)) > 0 or np.sum(np.isinf(new_weights)) > 0:
                        print("\tThe weights diverged on iteration: {0}!".format(j+1))
                        reward_per_step[i][j+1:] += np.int8(-1)
                        diverging_runs[i] += np.int8(1)
                        break

                    # check if terminal state
                    if terminal:
                        env.reset()
                        curr_s = env.get_current_state()
                        curr_obs_feats = ff.get_observable_features(curr_s)

                    # process checkpoints
                    if (j + 1) % CHECKPOINT == 0:
                        curr_checkpoint += 1

                    if self.add_features(ff, approximators, optimizers, alpha, j):
                        curr_obs_feats = ff.get_observable_features(curr_s)
                        # learning_approximators = copy.deepcopy(approximators)
                        # pe_phase = True

            results_dir[names[a]] = {'reward_per_step': reward_per_step,
                                     'diverging_runs': diverging_runs,
                                     'action_counter': action_counter,
                                     'weight_sum_per_checkpoint': weight_sum_per_checkpoint}
            if self.method != 'sgd':
                results_dir[names[a]]['stepsize_sum_per_checkpoint'] = stepsize_sum_per_checkpoint

            if DEBUG:
                agg_results = np.average(reward_per_step, axis=0)
                ms = moving_sum(agg_results, n=CHECKPOINT) + CHECKPOINT
                import matplotlib.pyplot as plt
                plt.plot(np.arange(ms.size)+1, ms)
                plt.vlines(MIDPOINT, ymin=ms.min(), ymax=ms.max())
                plt.show()
                plt.close()

        self.store_results(results_dir)
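The DEBUG block above calls a moving_sum helper defined elsewhere in the repo. One plausible implementation, offered only as a sketch of what it likely computes (a sliding-window sum over the per-step rewards):

import numpy as np

def moving_sum(x, n):
    # sum over each window of n consecutive entries (valid positions only)
    return np.convolve(x, np.ones(n), mode='valid')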
Example No. 4
    def __init__(self, exp_arguments, results_path, tunable_parameter_values):

        self.results_path = results_path
        self.verbose = exp_arguments.verbose
        self.tunable_parameter_values = tunable_parameter_values
        self.stepsize_method = exp_arguments.stepsize_method
        self.config = Config()
        """ Feature Function Setup """
        self.config.state_dims = 2  # number of dimensions in mountain car
        self.config.state_lims = np.array(
            ((-1, 1), (-1, 1)),
            dtype=np.float64)  # state bounds in mountain car
        self.config.initial_centers = np.array(
            ((0, 0), (.25, .25), (.25, -.25), (-.25, -.25), (-.25, .25)),
            dtype=np.float64)
        self.config.sigma = 0.5
        self.config.init_noise_mean = 0.0
        self.config.init_noise_var = 0.01
        self.feature_function = RadialBasisFunction(
            self.config)  # stays constant regardless of the parameter value
        """ Environment and Policy Setup """
        self.num_actions = 3  # number of actions in mountain car
        self.config.norm_state = True
        self.config.num_obs_features = self.config.initial_centers.shape[
            0]  # number of initial centers
        self.config.max_num_features = self.config.initial_centers.shape[
            0] + 1  # arbitrary since features are fixed
        self.training_data = exp_arguments.training_data_size
        self.epsilon = 0.1  # reasonable choice in mountain car environment
        self.gamma = 0.99  # discount factor
        self.checkpoint = 5000
        assert self.training_data % self.checkpoint == 0
        """ Experiment Setup """
        self.sample_size = exp_arguments.sample_size
        """ Stepsize adaptation settings"""
        self.config.parameter_size = self.config.num_obs_features
        if self.stepsize_method == 'idbd':
            # non-tunable parameters
            self.config.init_beta = np.log(0.001)
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = IDBD
        elif self.stepsize_method == 'sidbd':
            # non-tunable parameters
            self.config.init_beta = -np.log(
                (1 / 0.001) -
                1)  # equivalent to starting with a stepsize of 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = SIDBD
        elif self.stepsize_method in ['adam', 'slow_adam']:
            # non-tunable parameters
            self.config.beta1 = 0.9 if self.stepsize_method == 'adam' else 0.0
            self.config.beta2 = 0.99
            self.config.eps = 1e-08
            self.parameter_name = 'initial_stepsize'
            self.stepsize_method_class = Adam
            self.config.restart_ma = False
        elif self.stepsize_method == 'autostep':
            # non-tunable parameters
            self.config.tau = 10000.0
            self.config.init_stepsize = 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = AutoStep
        elif self.stepsize_method in ['sgd', 'rescaled_sgd']:
            # non-tunable parameters
            self.parameter_name = 'stepsize'
            self.stepsize_method_class = SGD
            self.config.rescale = (self.stepsize_method == 'rescaled_sgd')
        else:
            raise ValueError("Unrecognized stepsize adaptation method.")
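The SIDBD branch above sets init_beta to the inverse sigmoid of 0.001, which the inline comment says is equivalent to starting with a stepsize of 0.001. A quick sanity check of that comment, assuming SIDBD recovers the stepsize by passing beta through a sigmoid:

import numpy as np

init_beta = -np.log((1 / 0.001) - 1)
stepsize = 1.0 / (1.0 + np.exp(-init_beta))  # sigmoid(init_beta)
print(stepsize)  # ~0.001, up to floating-point rounding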
Example No. 5
class Experiment:
    def __init__(self, exp_arguments, results_path, tunable_parameter_values):

        self.results_path = results_path
        self.verbose = exp_arguments.verbose
        self.tunable_parameter_values = tunable_parameter_values
        self.stepsize_method = exp_arguments.stepsize_method
        self.config = Config()
        """ Feature Function Setup """
        self.config.state_dims = 2  # number of dimensions in mountain car
        self.config.state_lims = np.array(
            ((-1, 1), (-1, 1)),
            dtype=np.float64)  # state bounds in mountain car
        self.config.initial_centers = np.array(
            ((0, 0), (.25, .25), (.25, -.25), (-.25, -.25), (-.25, .25)),
            dtype=np.float64)
        self.config.sigma = 0.5
        self.config.init_noise_mean = 0.0
        self.config.init_noise_var = 0.01
        self.feature_function = RadialBasisFunction(
            self.config)  # stays constant regardless of the parameter value
        """ Environment and Policy Setup """
        self.num_actions = 3  # number of actions in mountain car
        self.config.norm_state = True
        self.config.num_obs_features = self.config.initial_centers.shape[
            0]  # number of initial centers
        self.config.max_num_features = self.config.initial_centers.shape[
            0] + 1  # arbitrary since features are fixed
        self.training_data = exp_arguments.training_data_size
        self.epsilon = 0.1  # reasonable choice in mountain car environment
        self.gamma = 0.99  # discount factor
        self.checkpoint = 5000
        assert self.training_data % self.checkpoint == 0
        """ Experiment Setup """
        self.sample_size = exp_arguments.sample_size
        """ Stepsize adaptation settings"""
        self.config.parameter_size = self.config.num_obs_features
        if self.stepsize_method == 'idbd':
            # non-tunable parameters
            self.config.init_beta = np.log(0.001)
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = IDBD
        elif self.stepsize_method == 'sidbd':
            # non-tunable parameters
            self.config.init_beta = -np.log(
                (1 / 0.001) -
                1)  # equivalent to starting with a stepsize of 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = SIDBD
        elif self.stepsize_method in ['adam', 'slow_adam']:
            # non-tunable parameters
            self.config.beta1 = 0.9 if self.stepsize_method == 'adam' else 0.0
            self.config.beta2 = 0.99
            self.config.eps = 1e-08
            self.parameter_name = 'initial_stepsize'
            self.stepsize_method_class = Adam
            self.config.restart_ma = False
        elif self.stepsize_method == 'autostep':
            # non-tunable parameters
            self.config.tau = 10000.0
            self.config.init_stepsize = 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = AutoStep
        elif self.stepsize_method in ['sgd', 'rescaled_sgd']:
            # non-tunable parameters
            self.parameter_name = 'stepsize'
            self.stepsize_method_class = SGD
            self.config.rescale = (self.stepsize_method == 'rescaled_sgd')
        else:
            raise ValueError("Unrecognized stepsize adaptation method.")

    def _print(self, astring):
        if self.verbose:
            print(astring)

    def set_tunable_parameter_value(self, val):
        if self.stepsize_method in ['idbd', 'sidbd']:
            self.config.theta = val
        elif self.stepsize_method in ['adam', 'slow_adam']:
            self.config.init_alpha = val
        elif self.stepsize_method == 'autostep':
            self.config.mu = val
        elif self.stepsize_method in ['sgd', 'rescaled_sgd']:
            self.config.alpha = val
        else:
            raise ValueError("Unrecognized stepsize adaptation method.")

    def run(self):
        np.random.seed(0)
        results = {
            'parameter_name': self.parameter_name,
            'sample_size': self.sample_size,
            'parameter_values': self.tunable_parameter_values,
            'avg_return': np.zeros(len(self.tunable_parameter_values)),
            'avg_episodes_per_run':
            np.zeros(len(self.tunable_parameter_values))
        }

        for j, pv in enumerate(self.tunable_parameter_values):
            self._print("Parameter value: {0}".format(pv))

            self.set_tunable_parameter_value(pv)
            avg_return_per_run = np.zeros(self.sample_size, dtype=np.float64)
            episodes_per_run = np.zeros(self.sample_size, dtype=np.int32)

            for i in range(self.sample_size):
                self._print("\tRun number: {0}".format(i + 1))
                env = MountainCar(self.config)
                approximators = []
                stepsize_method = []
                for _ in range(self.num_actions):
                    approximators.append(
                        LinearFunctionApproximator(self.config))
                    stepsize_method.append(
                        self.stepsize_method_class(self.config))

                avg_return_per_checkpoint = np.zeros(self.training_data //
                                                     self.checkpoint,
                                                     dtype=np.float64)

                return_per_episode = []
                curr_checkpoint = 0
                current_return = 0.0

                # initial features and action
                curr_s = env.get_current_state()
                curr_obs_feats = self.feature_function.get_observable_features(
                    curr_s)  # current observable features
                for k in range(self.training_data):
                    # get current action values
                    curr_avs = self.get_action_values(
                        curr_obs_feats, approximators)  # current action values
                    curr_a = self.epsilon_greedy_policy(
                        curr_avs)  # current action
                    # execute action
                    next_s, r, term = env.step(curr_a)  # r = reward
                    # get next observable features
                    next_obs_feats = self.feature_function.get_observable_features(
                        next_s)
                    # get next action values and action
                    next_avs = self.get_action_values(next_obs_feats,
                                                      approximators)
                    next_a = self.epsilon_greedy_policy(next_avs)
                    # compute the TD error for Sarsa(0)
                    td_error = r + self.gamma * (
                        1 - term) * next_avs[next_a] - curr_avs[curr_a]
                    # update weight vector
                    _, _, new_weights = stepsize_method[
                        curr_a].update_weight_vector(
                            td_error, curr_obs_feats,
                            approximators[curr_a].get_weight_vector())
                    # store new weights
                    approximators[curr_a].update_weight_vector(new_weights)
                    # update feature and action information, and keep track of progress
                    curr_obs_feats = next_obs_feats
                    current_return += r
                    # handle cases where weights diverge
                    if np.sum(np.isnan(new_weights)) > 0 or np.sum(
                            np.isinf(new_weights)) > 0:
                        print("The weights diverged!")
                        avg_return_per_checkpoint[
                            curr_checkpoint:] -= self.checkpoint
                        break
                    # check if terminal state
                    if term:
                        # store summaries
                        episodes_per_run[i] += 1
                        return_per_episode.append(current_return)
                        current_return = 0.0
                        # reset environment
                        env.reset()
                        curr_s = env.get_current_state()
                        curr_obs_feats = self.feature_function.get_observable_features(
                            curr_s)

                    if (k + 1) % self.checkpoint == 0:
                        if len(return_per_episode) == 0:
                            avg_return_per_checkpoint[
                                curr_checkpoint] -= self.checkpoint
                        else:
                            avg_return_per_checkpoint[
                                curr_checkpoint] += np.average(
                                    return_per_episode)
                            return_per_episode = []
                        curr_checkpoint += 1

                if DEBUG:
                    import matplotlib.pyplot as plt
                    x = np.arange(self.training_data // self.checkpoint)
                    plt.plot(x, avg_return_per_checkpoint)
                    plt.show()
                    plt.close()

                avg_return_per_run[i] = np.average(avg_return_per_checkpoint)
                self._print("\t\tAverage Return per Run: {0:.4f}".format(
                    avg_return_per_run[i]))
                self._print("\t\tEpisodes Completed: {0}".format(
                    episodes_per_run[i]))

            results['avg_return'][j] += np.average(avg_return_per_run)
            results['avg_episodes_per_run'][j] += np.average(episodes_per_run)
            self._print("Average Return: {0:.4f}".format(
                results['avg_return'][j]))
            self._print("Average Episodes Completed: {0:.4f}".format(
                results['avg_episodes_per_run'][j]))

        self.store_results(results)

    def get_action_values(self, features, approximators):
        action_values = np.zeros(self.num_actions, dtype=np.float64)
        for k in range(self.num_actions):
            action_values[k] += approximators[k].get_prediction(features)
        return action_values

    def epsilon_greedy_policy(self, action_values: np.ndarray):
        p = np.random.rand()
        if p > self.epsilon:
            argmax_av = np.random.choice(
                np.flatnonzero(action_values == action_values.max()))
            return argmax_av
        else:
            return np.random.randint(self.num_actions)

    def store_results(self, results):
        file_path = os.path.join(self.results_path,
                                 'parameter_tuning_results.p')
        with open(file_path, mode='wb') as results_file:
            pickle.dump(results, results_file)
        print("Results successfully stored.")
Example No. 6
    def __init__(self, exp_arguments, results_path, tunable_parameter_values):

        self.results_path = results_path
        self.verbose = exp_arguments.verbose
        self.tunable_parameter_values = tunable_parameter_values
        self.stepsize_method = exp_arguments.stepsize_method
        self.sample_size = exp_arguments.sample_size
        self.num_transitions = 200000
        self.checkpoint = 1000
        self.dense = exp_arguments.dense
        self.config = Config()

        self.data_path = os.path.join(
            os.getcwd(), 'mountain_car_prediction_data_30evaluations')
        assert len(os.listdir(self.data_path)) >= self.sample_size
        """ Feature Function Setup """
        self.config.state_dims = 2  # number of dimensions in mountain car
        self.config.state_lims = np.array(
            ((-1, 1), (-1, 1)),
            dtype=np.float64)  # state bounds in mountain car
        if self.dense:
            x = np.arange(-1, 1.2, 2 / 10)
            self.config.initial_centers = np.transpose(
                [np.tile(x, len(x)), np.repeat(x, len(x))])
        else:
            self.config.initial_centers = np.array(
                ((0, 0), (.25, .25), (.25, -.25), (-.25, -.25), (-.25, .25)),
                dtype=np.float64)
        self.config.sigma = 0.5
        self.config.init_noise_mean = 0.0
        self.config.init_noise_var = 0.01 if not self.dense else 0.0
        self.feature_function = RadialBasisFunction(
            self.config)  # stays constant regardless of the parameter value
        """ Environment and Policy Setup """
        self.num_actions = 3  # number of actions in mountain car
        self.config.num_obs_features = self.config.initial_centers.shape[
            0]  # number of initial centers
        self.config.max_num_features = self.config.initial_centers.shape[
            0] + 1  # arbitrary since features are fixed
        self.gamma = 0.99  # discount factor
        """ Stepsize adaptation settings"""
        self.config.parameter_size = self.config.num_obs_features
        if self.stepsize_method == 'idbd':
            # non-tunable parameters
            self.config.init_beta = np.log(0.001)
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = IDBD
        elif self.stepsize_method == 'sidbd':
            # non-tunable parameters
            self.config.init_beta = -np.log(
                (1 / 0.001) -
                1)  # equivalent to starting with a stepsize of 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = SIDBD
            self.increase_setting = 'reset'  # not used since no new features are added during tuning
        elif self.stepsize_method in ['adam', 'slow_adam']:
            # non-tunable parameters
            self.config.beta1 = 0.9 if self.stepsize_method == 'adam' else 0.0
            self.config.beta2 = 0.99
            self.config.eps = 1e-08
            self.parameter_name = 'initial_stepsize'
            self.stepsize_method_class = Adam
            self.config.restart_ma = False
        elif self.stepsize_method == 'autostep':
            # non-tunable parameters
            self.config.tau = 10000.0
            self.config.init_stepsize = 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = AutoStep
        elif self.stepsize_method in ['sgd', 'rescaled_sgd']:
            # non-tunable parameters
            self.parameter_name = 'stepsize'
            self.stepsize_method_class = SGD
            self.config.rescale = (self.stepsize_method == 'rescaled_sgd')
        else:
            raise ValueError("Unrecognized stepsize adaptation method.")
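When exp_arguments.dense is set, the constructor above tiles a 1-D grid into every (x, y) pairing. The following reproduces just that computation to show the shape of the resulting grid of RBF centers:

import numpy as np

x = np.arange(-1, 1.2, 2 / 10)  # evenly spaced 1-D grid over the normalized state range
centers = np.transpose([np.tile(x, len(x)), np.repeat(x, len(x))])
print(len(x), centers.shape)  # centers.shape == (len(x) ** 2, 2): one 2-D center per grid point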
Example No. 7
class Experiment:
    def __init__(self, exp_arguments, results_path, tunable_parameter_values):

        self.results_path = results_path
        self.verbose = exp_arguments.verbose
        self.tunable_parameter_values = tunable_parameter_values
        self.stepsize_method = exp_arguments.stepsize_method
        self.sample_size = exp_arguments.sample_size
        self.num_transitions = 200000
        self.checkpoint = 1000
        self.dense = exp_arguments.dense
        self.config = Config()

        self.data_path = os.path.join(
            os.getcwd(), 'mountain_car_prediction_data_30evaluations')
        assert len(os.listdir(self.data_path)) >= self.sample_size
        """ Feature Function Setup """
        self.config.state_dims = 2  # number of dimensions in mountain car
        self.config.state_lims = np.array(
            ((-1, 1), (-1, 1)),
            dtype=np.float64)  # state bounds in mountain car
        if self.dense:
            x = np.arange(-1, 1.2, 2 / 10)
            self.config.initial_centers = np.transpose(
                [np.tile(x, len(x)), np.repeat(x, len(x))])
        else:
            self.config.initial_centers = np.array(
                ((0, 0), (.25, .25), (.25, -.25), (-.25, -.25), (-.25, .25)),
                dtype=np.float64)
        self.config.sigma = 0.5
        self.config.init_noise_mean = 0.0
        self.config.init_noise_var = 0.01 if not self.dense else 0.0
        self.feature_function = RadialBasisFunction(
            self.config)  # stays constant regardless of the parameter value
        """ Environment and Policy Setup """
        self.num_actions = 3  # number of actions in mountain car
        self.config.num_obs_features = self.config.initial_centers.shape[
            0]  # number of initial centers
        self.config.max_num_features = self.config.initial_centers.shape[
            0] + 1  # arbitrary since features are fixed
        self.gamma = 0.99  # discount factor
        """ Stepsize adaptation settings"""
        self.config.parameter_size = self.config.num_obs_features
        if self.stepsize_method == 'idbd':
            # non-tunable parameters
            self.config.init_beta = np.log(0.001)
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = IDBD
        elif self.stepsize_method == 'sidbd':
            # non-tunable parameters
            self.config.init_beta = -np.log(
                (1 / 0.001) -
                1)  # equivalent to starting with a stepsize of 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = SIDBD
            self.increase_setting = 'reset'  # not used since no new features are added during tuning
        elif self.stepsize_method in ['adam', 'slow_adam']:
            # non-tunable parameters
            self.config.beta1 = 0.9 if self.stepsize_method == 'adam' else 0.0
            self.config.beta2 = 0.99
            self.config.eps = 1e-08
            self.parameter_name = 'initial_stepsize'
            self.stepsize_method_class = Adam
            self.config.restart_ma = False
        elif self.stepsize_method == 'autostep':
            # non-tunable parameters
            self.config.tau = 10000.0
            self.config.init_stepsize = 0.001
            self.parameter_name = 'meta_stepsize'
            self.stepsize_method_class = AutoStep
        elif self.stepsize_method in ['sgd', 'rescaled_sgd']:
            # non-tunable parameters
            self.parameter_name = 'stepsize'
            self.stepsize_method_class = SGD
            self.config.rescale = (self.stepsize_method == 'rescaled_sgd')
        else:
            raise ValueError("Unrecognized stepsize adaptation method.")

    def _print(self, astring):
        if self.verbose:
            print(astring)

    def set_tunable_parameter_value(self, val):
        if self.stepsize_method in ['idbd', 'sidbd']:
            self.config.theta = val
        elif self.stepsize_method in ['adam', 'slow_adam']:
            self.config.init_alpha = val
        elif self.stepsize_method == 'autostep':
            self.config.mu = val
        elif self.stepsize_method in ['sgd', 'rescaled_sgd']:
            self.config.alpha = val
        else:
            raise ValueError("Unrecognized stepsize adaptation method.")

    def run(self):
        results = {
            'parameter_name': self.parameter_name,
            'sample_size': self.sample_size,
            'parameter_values': self.tunable_parameter_values,
            'avg_mse': np.zeros(len(self.tunable_parameter_values))
        }

        for j, pv in enumerate(self.tunable_parameter_values):
            self._print("Parameter value: {0}".format(pv))

            self.set_tunable_parameter_value(pv)
                avg_mse_per_run = np.zeros(self.sample_size, dtype=np.float64)  # float dtype so per-run MSEs are not truncated

            for i in range(self.sample_size):
                self._print("\tRun number: {0}".format(i + 1))

                approximators = []
                stepsize_method = []
                for _ in range(self.num_actions):
                    approximators.append(
                        LinearFunctionApproximator(self.config))
                    stepsize_method.append(
                        self.stepsize_method_class(self.config))

                mse_per_checkpoint = np.zeros(self.num_transitions //
                                              self.checkpoint,
                                              dtype=np.float64)
                curr_checkpoint = 0

                # load data
                states, actions, rewards, terminations, avg_disc_return = self.load_data(
                    seed_number=i)
                """ Start of Training"""
                curr_obs_feats = self.feature_function.get_observable_features(
                    states[0])  # current observable features
                k = 0
                while k < self.num_transitions:
                    curr_a = actions[k]  # current action
                    curr_av = approximators[curr_a].get_prediction(
                        curr_obs_feats)  # current value
                    next_s = states[k + 1]  # next state
                    next_r = rewards[k + 1]  # next reward
                    next_term = terminations[k + 1]  # next termination
                    next_a = actions[k + 1]  # next action

                    # get next observable features
                    next_obs_feats = self.feature_function.get_observable_features(
                        next_s)
                    # get next action values and action
                    next_av = approximators[next_a].get_prediction(
                        next_obs_feats)
                    # compute the TD error for Sarsa(0)
                    td_error = next_r + self.gamma * (
                        1 - next_term) * next_av - curr_av
                    # update weight vector
                    _, _, new_weights = stepsize_method[
                        curr_a].update_weight_vector(
                            td_error, curr_obs_feats,
                            approximators[curr_a].get_weight_vector())
                    approximators[curr_a].update_weight_vector(new_weights)
                    # handle cases where weights diverge
                    if np.sum(np.isnan(new_weights)) > 0 or np.sum(
                            np.isinf(new_weights)) > 0:
                        mse_per_checkpoint[curr_checkpoint:] += 1000000
                        print("The weights diverged!")
                        break

                    # update feature information, checkpoint, and k, and keep track of progress
                    curr_obs_feats = next_obs_feats
                    mse_per_checkpoint[curr_checkpoint] += np.square(
                        curr_av - avg_disc_return[k]) / self.checkpoint
                    k += 1
                    if k % self.checkpoint == 0: curr_checkpoint += 1

                    # handle terminal states
                    if next_term and k < self.num_transitions:
                        k += 1  # skips terminal states
                        if k % self.checkpoint == 0: curr_checkpoint += 1
                        curr_obs_feats = self.feature_function.get_observable_features(
                            states[k])

                if DEBUG:
                    import matplotlib.pyplot as plt
                    x = np.arange(self.num_transitions // self.checkpoint)
                    plt.plot(x, mse_per_checkpoint)
                    plt.show()
                    plt.close()

                avg_mse_per_run[i] = np.average(mse_per_checkpoint)
                self._print("\t\tAverage Mean Squared Error: {0:.4f}".format(
                    avg_mse_per_run[i]))

            results['avg_mse'][j] += np.average(avg_mse_per_run)
            self._print("Average MSE: {0:.4f}".format(results['avg_mse'][j]))

        self.store_results(results)

    def store_results(self, results):
        file_path = os.path.join(self.results_path,
                                 'parameter_tuning_results.p')
        with open(file_path, mode='wb') as results_file:
            pickle.dump(results, results_file)
        print("Results successfully stored.")

    def load_data(self, seed_number):
        with open(os.path.join(self.data_path,
                               'seed' + str(seed_number) + '.p'),
                  mode='rb') as data_file:
            data_dict = pickle.load(data_file)
        states = data_dict['states']
        actions = data_dict['actions']
        rewards = data_dict['rewards']
        terminations = data_dict['terminations']
        avg_discounted_return = data_dict['avg_discounted_return']
        return states, actions, rewards, terminations, avg_discounted_return
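load_data expects each seedN.p file to contain a dictionary with the five keys unpacked above. Purely as an illustration of that layout, the sketch below writes a dummy file with matching keys; the shapes are inferred from the indexing in run, and the real data files are produced elsewhere in the repo.

import os
import pickle
import numpy as np

num_transitions = 200000
dummy_data = {
    'states': np.zeros((num_transitions + 1, 2), dtype=np.float64),  # 2-D mountain car states
    'actions': np.zeros(num_transitions + 1, dtype=np.int32),
    'rewards': np.zeros(num_transitions + 1, dtype=np.float64),
    'terminations': np.zeros(num_transitions + 1, dtype=bool),
    'avg_discounted_return': np.zeros(num_transitions + 1, dtype=np.float64),
}
with open(os.path.join(os.getcwd(), 'seed0.p'), mode='wb') as data_file:
    pickle.dump(dummy_data, data_file)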