Code example #1
import numpy as np


class PEPG:
    '''Extension of PEPG with bells and whistles.'''
    def __init__(
        self,
        num_params,  # number of model parameters
        sigma_init=0.10,  # initial standard deviation
        sigma_alpha=0.20,  # learning rate for std
        sigma_decay=0.999,  # anneal standard deviation
        sigma_limit=0.01,  # stop annealing if less than
        sigma_max_change=0.2,  # clips adaptive sigma to 20%
        learning_rate=0.01,  # learning rate for the mean (mu)
        learning_rate_decay=0.9999,  # annealing the learning rate
        learning_rate_limit=0.01,  # stop annealing learning rate
        elite_ratio=0,  # if >0 then ignore learning_rate
        pop_size=256,  # population size
        average_baseline=True,  # set baseline to average
        weight_decay=0.01,  # weight decay coefficient
        rank_fitness=True,  # use rank rather than fitness
        forget_best=True):  # don't keep the historical best solution

        self.num_params = num_params
        self.sigma_init = sigma_init
        self.sigma_alpha = sigma_alpha
        self.sigma_decay = sigma_decay
        self.sigma_limit = sigma_limit
        self.sigma_max_change = sigma_max_change
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.pop_size = pop_size
        self.average_baseline = average_baseline
        if self.average_baseline:
            assert (self.pop_size % 2 == 0), "Population size must be even"
            self.pop_size = int(self.pop_size / 2)
        else:
            assert (self.pop_size & 1), "Population size must be odd"
            self.pop_size = int((self.pop_size - 1) / 2)

        # option to use a greedy ES method to select the next mu,
        # rather than using the drift parameter
        self.elite_ratio = elite_ratio
        self.elite_pop_size = int(self.pop_size * self.elite_ratio)
        self.use_elite = False
        if self.elite_pop_size > 0:
            self.use_elite = True

        self.forget_best = forget_best
        self.batch_reward = np.zeros(self.pop_size * 2)
        self.mu = np.zeros(self.num_params)
        self.sigma = np.ones(self.num_params) * self.sigma_init
        self.curr_best_mu = np.zeros(self.num_params)
        self.best_mu = np.zeros(self.num_params)
        self.best_reward = 0
        self.first_iteration = True
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True  # always forget the best one if we rank
        # choose optimizer
        self.optimizer = Adam(self, learning_rate)

    def rms_stdev(self):
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))

    def ask(self):
        '''returns a list of parameters'''
        # antithetic sampling
        self.epsilon = np.random.randn(self.pop_size, self.num_params)
        self.epsilon *= self.sigma.reshape(1, self.num_params)
        self.epsilon_full = np.concatenate([self.epsilon, -self.epsilon])
        if self.average_baseline:
            epsilon = self.epsilon_full
        else:
            # the first solution is mu itself, followed by the positive
            # epsilons, then the negative epsilons
            epsilon = np.concatenate(
                [np.zeros((1, self.num_params)), self.epsilon_full])
        solutions = self.mu.reshape(1, self.num_params) + epsilon
        self.solutions = solutions
        return solutions

    def tell(self, scores):
        # scores must be a numpy float array with one entry per solution
        # returned by ask()
        assert (len(scores) == len(self.solutions)
                ), "Inconsistent reward_table size reported."

        reward_table = np.array(scores)

        if self.rank_fitness:
            reward_table = compute_centered_ranks(reward_table)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, self.solutions)
            reward_table += l2_decay

        reward_offset = 1
        if self.average_baseline:
            b = np.mean(reward_table)
            reward_offset = 0
        else:
            b = reward_table[0]  # baseline

        reward = reward_table[reward_offset:]
        if self.use_elite:
            idx = np.argsort(reward)[::-1][0:self.elite_pop_size]
        else:
            idx = np.argsort(reward)[::-1]

        best_reward = reward[idx[0]]
        if (best_reward > b or self.average_baseline):
            best_mu = self.mu + self.epsilon_full[idx[0]]
            best_reward = reward[idx[0]]
        else:
            best_mu = self.mu
            best_reward = b

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_iteration:
            self.sigma = np.ones(self.num_params) * self.sigma_init
            self.first_iteration = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward > self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # short hand
        epsilon = self.epsilon
        sigma = self.sigma

        # update the mean

        # move mean to the average of the best idx means
        if self.use_elite:
            self.mu += self.epsilon_full[idx].mean(axis=0)
        else:
            rT = (reward[:self.pop_size] - reward[self.pop_size:])
            change_mu = np.dot(rT, epsilon)
            self.optimizer.stepsize = self.learning_rate
            # adam, rmsprop, momentum, etc.
            update_ratio = self.optimizer.update(-change_mu)
            # self.mu += (change_mu * self.learning_rate) # normal SGD method

        # adaptive sigma
        # normalization
        if (self.sigma_alpha > 0):
            stdev_reward = 1.0
            if not self.rank_fitness:
                stdev_reward = reward.std()
            S = epsilon * epsilon - (sigma * sigma).reshape(1, self.num_params)
            S /= sigma.reshape(1, self.num_params)
            reward_avg = (reward[:self.pop_size] +
                          reward[self.pop_size:]) / 2.0
            rS = reward_avg - b
            delta_sigma = (np.dot(rS, S)) / \
                (2 * self.pop_size * stdev_reward)

            # adjust sigma according to the adaptive sigma calculation
            # for stability, don't let sigma move by more than
            # sigma_max_change (20% by default) of its current value
            change_sigma = self.sigma_alpha * delta_sigma
            change_sigma = np.minimum(change_sigma,
                                      self.sigma_max_change * self.sigma)
            change_sigma = np.maximum(change_sigma,
                                      -self.sigma_max_change * self.sigma)
            self.sigma += change_sigma

        if (self.sigma_decay < 1):
            self.sigma[self.sigma > self.sigma_limit] *= self.sigma_decay

        if (self.learning_rate_decay < 1
                and self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def current_param(self):
        return self.curr_best_mu

    def set_mu(self, mu):
        self.mu = np.array(mu)

    def best_param(self):
        return self.best_mu

    def result(self):
        # return best params so far, along with historically
        # best reward, curr reward, sigma
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)
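Code examples #1 and #2 call compute_centered_ranks, compute_weight_decay and an Adam optimizer that the excerpts do not include. Below is a minimal sketch of compatible implementations, modeled on the estool-style helpers these interfaces suggest; the originals in each project may differ. (Code example #3 further down uses a different Adam interface that is not sketched here.)

import numpy as np


def compute_ranks(x):
    """Ranks in [0, len(x)); unlike scipy.stats.rankdata, which starts at 1."""
    assert x.ndim == 1
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks


def compute_centered_ranks(x):
    """Map raw fitness values to centered ranks in [-0.5, 0.5]."""
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y


def compute_weight_decay(weight_decay, model_param_list):
    """Return an L2 penalty (one value per solution) to add to the rewards."""
    model_param_grid = np.array(model_param_list)
    return -weight_decay * np.mean(model_param_grid * model_param_grid, axis=1)


class Adam:
    """Adam with the interface the ES classes above expect:
    Adam(es, stepsize), a mutable .stepsize, and .update(grad) that
    applies the step to es.mu in place and returns the update ratio."""

    def __init__(self, pi, stepsize, beta1=0.99, beta2=0.999, epsilon=1e-8):
        self.pi = pi  # the ES object whose .mu we update
        self.stepsize = stepsize
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon
        self.m = np.zeros(pi.num_params)
        self.v = np.zeros(pi.num_params)
        self.t = 0

    def update(self, globalg):
        self.t += 1
        # bias-corrected step size
        a = self.stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
        ratio = np.linalg.norm(step) / (np.linalg.norm(self.pi.mu) + self.epsilon)
        self.pi.mu = self.pi.mu + step
        return ratio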
Code example #2
import numpy as np


class OpenES:
    """
    Basic Version of OpenAI Evolution Strategies
    """
    def __init__(
        self,
        num_params,  # number of model parameters
        mu_init=None,  # initial mean
        sigma_init=1,  # initial standard deviation
        sigma_decay=0.999,  # anneal standard deviation
        sigma_limit=0.01,  # stop annealing if less than
        learning_rate=0.01,  # learning rate for the mean (mu)
        learning_rate_decay=0.9999,  # annealing the learning rate
        learning_rate_limit=0.001,  # stop annealing learning rate
        pop_size=256,  # population size
        antithetic=False,  # whether to use antithetic sampling
        weight_decay=0.01,  # weight decay coefficient
        rank_fitness=True,  # use rank rather than fitness
        forget_best=True):  # forget historical best

        # misc
        self.num_params = num_params
        self.first_iteration = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            self.mu = np.array(mu_init)
        self.sigma_decay = sigma_decay
        self.sigma = sigma_init
        self.sigma_init = sigma_init
        self.sigma_limit = sigma_limit

        # optimization stuff
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.optimizer = Adam(self, learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"
        self.forget_best = forget_best
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True

    def ask(self, pop_size):
        """
        Returns a list of candidate parameters
        """

        if self.antithetic:
            epsilon_half = np.random.randn(pop_size // 2, self.num_params)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])
        else:
            epsilon = np.random.randn(pop_size, self.num_params)

        return self.mu.reshape(1, self.num_params) + epsilon * self.sigma

    def tell(self, solutions, scores):
        """
        Updates the distribution
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        reward = np.array(scores)

        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # TBD check if ok
        epsilon = (solutions -
                   self.mu.reshape(1, self.num_params)) / self.sigma

        # standardize the rewards to zero mean and unit variance
        normalized_reward = (reward - np.mean(reward)) / np.std(reward)
        change_mu = 1. / (self.pop_size * self.sigma) * \
            np.dot(epsilon.T, normalized_reward)

        # updating stuff
        idx = np.argsort(reward)[::-1]
        best_reward = reward[idx[0]]
        best_mu = solutions[idx[0]]

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_iteration:
            self.first_iteration = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward > self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # optimization step
        self.optimizer.stepsize = self.learning_rate
        self.optimizer.update(-change_mu)

        # anneal sigma (simple exponential decay, no adaptive update here)
        if (self.sigma > self.sigma_limit):
            self.sigma *= self.sigma_decay

        if (self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def get_distrib_params(self):
        """
        Returns the parameters of the distribution:
        the mean and sigma
        """
        return self.mu, self.sigma

    def result(self):
        """
        Returns best params so far, best score, current score
        and sigma
        """
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)

    def rms_stdev(self):
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))
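A minimal usage sketch of the ask/tell loop on a toy fitness function. This driver is illustrative only and is not part of either project; it assumes the OpenES class above plus the Adam and helper sketch after example #1. PEPG is driven the same way, except that ask() takes no argument and tell() takes only the scores.

import numpy as np

# toy problem (hypothetical): recover a fixed 10-dimensional target vector
target = np.linspace(-1.0, 1.0, 10)


def fitness(params):
    return -np.sum((params - target) ** 2)


es = OpenES(num_params=10, pop_size=64, sigma_init=0.5,
            weight_decay=0.0, rank_fitness=False, forget_best=False)

for generation in range(300):
    solutions = es.ask(es.pop_size)                     # sample candidates around mu
    scores = np.array([fitness(s) for s in solutions])  # evaluate each candidate
    es.tell(solutions, scores)                          # update mu and anneal sigma

best_mu, best_reward, curr_best_reward, sigma = es.result()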
Code example #3
File: Model.py  Project: bradleybauer/mario
import math as m
import random

import numpy as np
import tensorflow as tf


class Parameters:
    def __init__(self, population_size=1, sigma=0, alpha=0, filename=''):
        self.population_size = population_size
        self.sigma = sigma
        self.alpha = alpha
        self.optimizer = Adam()

        if filename:
            npz = np.load(filename)
            self.F1 = Param(npz['arr_0'], population_size, sigma)
            self.F2 = Param(npz['arr_1'], population_size, sigma)
            self.F3 = Param(npz['arr_2'], population_size, sigma)
            self.F4 = Param(npz['arr_3'], population_size, sigma)
            self.F5 = Param(npz['arr_4'], population_size, sigma)
            self.F6 = Param(npz['arr_5'], population_size, sigma)

            self.g3 = Param(npz['arr_6'], population_size, sigma)
            self.b3 = Param(npz['arr_7'], population_size, sigma)
            self.g4 = Param(npz['arr_8'], population_size, sigma)
            self.b4 = Param(npz['arr_9'], population_size, sigma)
            self.g5 = Param(npz['arr_10'], population_size, sigma)
            self.b5 = Param(npz['arr_11'], population_size, sigma)
            self.g6 = Param(npz['arr_12'], population_size, sigma)
            self.b6 = Param(npz['arr_13'], population_size, sigma)

            self.Wx0 = Param(npz['arr_14'], population_size, sigma)
            self.bx0 = Param(npz['arr_15'], population_size, sigma)
            self.Wx1 = Param(npz['arr_16'], population_size, sigma)
            self.bx1 = Param(npz['arr_17'], population_size, sigma)
            self.Wx2 = Param(npz['arr_18'], population_size, sigma)
            self.bx2 = Param(npz['arr_19'], population_size, sigma)
            self.Wv = Param(npz['arr_20'], population_size, sigma)
            self.bv = Param(npz['arr_21'], population_size, sigma)

            self.lg0 = Param(npz['arr_22'], population_size, sigma)
            self.lb0 = Param(npz['arr_23'], population_size, sigma)
            self.lg1 = Param(npz['arr_24'], population_size, sigma)
            self.lb1 = Param(npz['arr_25'], population_size, sigma)
            self.lg2 = Param(npz['arr_26'], population_size, sigma)
            self.lb2 = Param(npz['arr_27'], population_size, sigma)
        else:
            # conv filter weights have shape [w, h, d, o]:
            # w = filter width
            # h = filter height
            # d = depth (input channels)
            # o = output depth (output channels)
            self.F1 = Param(
                tf.random.normal([F_size, F_size, 3, NF1_out],
                                 stddev=m.sqrt(2 / F_size)), population_size,
                sigma)
            self.F2 = Param(
                tf.random.normal([F_size, F_size, NF1_out, NF2_out],
                                 stddev=m.sqrt(2 / F_size)), population_size,
                sigma)
            self.F3 = Param(
                tf.random.normal([F_size, F_size, NF2_out, NF3_out],
                                 stddev=m.sqrt(2 / F_size)), population_size,
                sigma)
            self.g3 = Param(tf.ones((NF3_out, 1)), population_size, sigma)
            self.b3 = Param(tf.zeros((NF3_out, 1)), population_size, sigma)
            self.F4 = Param(
                tf.random.normal([F_size, F_size, NF3_out, NF4_out],
                                 stddev=m.sqrt(2 / F_size)), population_size,
                sigma)
            self.g4 = Param(tf.ones((NF4_out, 1)), population_size, sigma)
            self.b4 = Param(tf.zeros((NF4_out, 1)), population_size, sigma)
            self.F5 = Param(
                tf.random.normal([F_size, F_size, NF4_out, NF5_out],
                                 stddev=m.sqrt(2 / F_size)), population_size,
                sigma)
            self.g5 = Param(tf.ones((NF5_out, 1)), population_size, sigma)
            self.b5 = Param(tf.zeros((NF5_out, 1)), population_size, sigma)
            self.F6 = Param(
                tf.random.normal([F_size, F_size, NF5_out, NF6_out],
                                 stddev=m.sqrt(2 / F_size)), population_size,
                sigma)
            self.g6 = Param(tf.ones((NF6_out, 1)), population_size, sigma)
            self.b6 = Param(tf.zeros((NF6_out, 1)), population_size, sigma)

            self.lg0 = Param(tf.ones((H_size, 1)), population_size, sigma)
            self.lb0 = Param(tf.zeros((H_size, 1)), population_size, sigma)
            self.lg1 = Param(tf.ones((H_size, 1)), population_size, sigma)
            self.lb1 = Param(tf.zeros((H_size, 1)), population_size, sigma)
            self.lg2 = Param(tf.ones((H_size, 1)), population_size, sigma)
            self.lb2 = Param(tf.zeros((H_size, 1)), population_size, sigma)

            self.Wx0 = Param(tf.random.normal([H_size * 4, z_size]),
                             population_size, sigma)
            self.bx0 = Param(tf.zeros([H_size * 4, 1]), population_size, sigma)
            self.Wx1 = Param(tf.random.normal([H_size * 4, H_size * 2]),
                             population_size, sigma)
            self.bx1 = Param(tf.zeros([H_size * 4, 1]), population_size, sigma)
            self.Wx2 = Param(tf.random.normal([H_size * 4, H_size * 2]),
                             population_size, sigma)
            self.bx2 = Param(tf.zeros([H_size * 4, 1]), population_size, sigma)
            self.Wv = Param(tf.random.normal([Y_size, H_size]),
                            population_size, sigma)
            self.bv = Param(tf.zeros([Y_size, 1]), population_size, sigma)

    def all(self):
        return [self.F1, self.F2, self.F3, self.F4, self.F5, self.F6,
                self.g3, self.b3, self.g4, self.b4, self.g5, self.b5, self.g6, self.b6,
                self.Wx0, self.bx0, self.Wx1, self.bx1, self.Wx2, self.bx2,
                self.Wv, self.bv,
                self.lg0, self.lb0, self.lg1, self.lb1, self.lg2, self.lb2]

    # return reference to current tensors
    def current(self):
        return [param.current for param in self.all()]

    def set_current_population_member(self, i):
        for param in self.all():
            param.set_current_population_member(i)

    def update_nes(self, reward, reward_mean, reward_std):
        reward = (reward - reward_mean) / (reward_std + .00001)
        grads = []
        means = []
        for param in self.all():
            grads += [
                param.get_grad(reward) * (self.alpha /
                                          (self.population_size * self.sigma))
            ]
            means += [param.mean]
        self.optimizer.update(means, grads)
        for param in self.all():
            param.gen_pop_about_mean(self.sigma)

    def mutate(self, param, i):
        x = param.population[i]
        if random.randint(1, 4) == 1:
            jitter = tf.random.normal(x.shape, stddev=self.sigma)
            return x + jitter
        else:
            return x

    def mate(self, param, i, j):
        if random.randint(1, 4) == 1:
            return self.mutate(param, i)
        else:
            return self.mutate(param, j)

    def update_ga(self, rewards):
        # sort parameters by rewards
        top_reward_indices = rewards.argsort()[-PASS_THROUGH:]
        top_reward_indices = top_reward_indices[::-1]
        for param in self.all():
            # sort population
            for i, j in enumerate(top_reward_indices):
                param.population[i] = param.population[j]
            # generate new population
            for k in range(PASS_THROUGH, self.population_size):
                param.population[k] = self.mate(param, random.randint(0, 9),
                                                random.randint(0, 9))
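Example #3 relies on a Param wrapper and on globals (F_size, NF*_out, H_size, z_size, Y_size, PASS_THROUGH) defined elsewhere in the project. The following is a rough, hypothetical sketch of the Param interface, inferred only from how it is used above; the real class in bradleybauer/mario may differ.

import tensorflow as tf


class Param:
    """Hypothetical sketch: holds a mean tensor plus a population of
    perturbed copies (mean + sigma * epsilon) for NES-style updates."""

    def __init__(self, mean, population_size, sigma):
        self.mean = tf.Variable(mean, dtype=tf.float32)
        self.population_size = population_size
        self.current = self.mean  # tensor the forward pass reads
        self.gen_pop_about_mean(sigma)

    def gen_pop_about_mean(self, sigma):
        # resample the perturbations and rebuild the population around the mean
        self.epsilon = [tf.random.normal(self.mean.shape)
                        for _ in range(self.population_size)]
        self.population = [self.mean + sigma * eps for eps in self.epsilon]

    def set_current_population_member(self, i):
        self.current = self.population[i]

    def get_grad(self, reward):
        # NES gradient estimate: sum_i reward_i * epsilon_i
        grad = tf.zeros_like(self.mean)
        for r, eps in zip(reward, self.epsilon):
            grad = grad + float(r) * eps
        return grad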