Beispiel #1
0
    def __init__(self,
                 num_params,
                 mu_init=None,
                 sigma_init=0.1,
                 lr=10**-2,
                 pop_size=256,
                 antithetic=True,
                 weight_decay=0,
                 rank_fitness=True):
        """
        Store the search-distribution state and the hyper-parameters.

        Parameters:
        - num_params: length of the zero mean vector created when mu_init
            is not given
        - mu_init: optional initial mean; copied into an array with np.array
        - sigma_init: initial value stored as self.sigma
        - lr: learning rate, stored and handed to the Adam optimizer
        - pop_size: population size; has to be even when antithetic is set
        - antithetic: flag requesting antithetic sampling
        - weight_decay: weight decay coefficient (only stored here)
        - rank_fitness: rank-fitness flag (only stored here)

        Raises:
        - AssertionError when antithetic is set and pop_size is odd
        """
        # bookkeeping
        self.first_interation = True
        self.num_params = num_params

        # search distribution: mean and spread
        self.mu = (np.zeros(num_params)
                   if mu_init is None else np.array(mu_init))
        self.sigma = sigma_init

        # optimizer
        self.learning_rate = lr
        self.optimizer = Adam(lr)

        # sampling configuration
        self.pop_size = pop_size
        self.antithetic = antithetic
        if antithetic:
            assert (pop_size % 2 == 0), "Population size must be even"
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
Beispiel #2
0
    def __init__(
        self,
        num_params,
        sigma_init=0.10,
        sigma_alpha=0.20,
        sigma_decay=0.999,
        sigma_limit=0.01,
        sigma_max_change=0.2,
        learning_rate=0.01,
        learning_rate_decay=0.9999,
        learning_rate_limit=0.01,
        elite_ratio=0,
        pop_size=256,
        average_baseline=True,
        weight_decay=0.01,
        rank_fitness=True,
        forget_best=True):
        """
        Store the hyper-parameters and allocate the solver state.

        Parameters:
        - num_params: number of model parameters (length of mu and sigma)
        - sigma_init: initial standard deviation, used for every parameter
        - sigma_alpha: learning rate for the standard deviation
        - sigma_decay: annealing factor for the standard deviation
        - sigma_limit: annealing of the standard deviation stops below this
        - sigma_max_change: clip for the adaptive sigma change (0.2 = 20%)
        - learning_rate: learning rate, stored and passed to Adam
        - learning_rate_decay: annealing factor for the learning rate
        - learning_rate_limit: annealing of the learning rate stops here
        - elite_ratio: fraction of the (halved) population used as elite
            set; a non-empty elite set switches use_elite on
        - pop_size: population size; must be even with average_baseline
            and odd without it
        - average_baseline: whether the baseline is the batch average
        - weight_decay: weight decay coefficient
        - rank_fitness: use ranks rather than raw fitness; forces
            forget_best to True
        - forget_best: do not keep the historical best solution

        Raises:
        - AssertionError when pop_size has the wrong parity
        """
        # hyper-parameters that are stored as given
        self.num_params = num_params
        self.sigma_init = sigma_init
        self.sigma_alpha = sigma_alpha
        self.sigma_decay = sigma_decay
        self.sigma_limit = sigma_limit
        self.sigma_max_change = sigma_max_change
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit

        # from here on self.pop_size holds half of the requested population
        # (the odd case drops one member before halving)
        self.pop_size = pop_size
        self.average_baseline = average_baseline
        if not average_baseline:
            assert (self.pop_size & 1), "Population size must be odd"
            self.pop_size = int((self.pop_size - 1) / 2)
        else:
            assert (self.pop_size % 2 == 0), "Population size must be even"
            self.pop_size = int(self.pop_size / 2)

        # greedy selection of the next mu (instead of the drift parameter)
        # is enabled only when the elite set is non-empty
        self.elite_ratio = elite_ratio
        self.elite_pop_size = int(self.pop_size * elite_ratio)
        self.use_elite = self.elite_pop_size > 0

        # solver state
        self.batch_reward = np.zeros(2 * self.pop_size)
        self.mu = np.zeros(num_params)
        self.sigma = sigma_init * np.ones(num_params)
        self.curr_best_mu = np.zeros(num_params)
        self.best_mu = np.zeros(num_params)
        self.best_reward = 0
        self.first_interation = True
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        # ranking always forgets the best solution
        self.forget_best = True if rank_fitness else forget_best

        # the optimizer receives this solver, so it is created last
        self.optimizer = Adam(self, learning_rate)
Beispiel #3
0
    def __init__(
        self,
        num_params,
        mu_init=None,
        sigma_init=1,
        sigma_decay=0.999,
        sigma_limit=0.01,
        learning_rate=0.01,
        learning_rate_decay=0.9999,
        learning_rate_limit=0.001,
        pop_size=256,
        antithetic=False,
        weight_decay=0.01,
        rank_fitness=True,
        forget_best=True):
        """
        Store the hyper-parameters and set up the search distribution.

        Parameters:
        - num_params: number of model parameters
        - mu_init: optional initial mean; zeros are used when omitted
        - sigma_init: initial standard deviation
        - sigma_decay: annealing factor for the standard deviation
        - sigma_limit: annealing of the standard deviation stops below this
        - learning_rate: learning rate, stored and passed to Adam
        - learning_rate_decay: annealing factor for the learning rate
        - learning_rate_limit: annealing of the learning rate stops here
        - pop_size: population size; must be even when antithetic is set
        - antithetic: whether to use antithetic sampling
        - weight_decay: weight decay coefficient
        - rank_fitness: use ranks rather than raw fitness; forces
            forget_best to True
        - forget_best: forget the historical best solution

        Raises:
        - AssertionError when antithetic is set and pop_size is odd
        """
        # bookkeeping
        self.first_interation = True
        self.num_params = num_params

        # search distribution
        self.mu = (np.zeros(num_params)
                   if mu_init is None else np.array(mu_init))
        self.sigma = self.sigma_init = sigma_init
        self.sigma_decay = sigma_decay
        self.sigma_limit = sigma_limit

        # optimization settings; the optimizer is handed this solver
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.optimizer = Adam(self, learning_rate)

        # sampling configuration
        self.pop_size = pop_size
        self.antithetic = antithetic
        if antithetic:
            assert (pop_size % 2 == 0), "Population size must be even"
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        # ranking always forgets the best solution
        self.forget_best = True if rank_fitness else forget_best
    def update_thetas(self, learning_rate=0.01, optimizer=''):
        """
        Apply one update step to the parameters of the layer.

        W and b are replaced by the values the layer's optimizer returns
        (newWeight / newBias) for the current gradW / gradb. When batch
        normalization is active on a non-output layer, gamma and beta take a
        plain gradient-descent step. The stored gradients are not modified.

        Parameters:
        - learning_rate: float
            Learning rate to use to update thetas
        - optimizer: str
            Which optimizer to use. 'Adam' switches the layer to an Adam
            optimizer; any other value keeps the optimizer already installed
        """
        # Install Adam only once. A fresh instance on every call would discard
        # whatever state the optimizer keeps between update steps.
        if optimizer == 'Adam' and not isinstance(self.Optimizer, Adam):
            self.Optimizer = Adam()
        self.W = self.Optimizer.newWeight(self.W, self.gradW, learning_rate)
        self.b = self.Optimizer.newBias(self.b, self.gradb, learning_rate)
        if (not self.last_layer) and self.batchnorm:
            self.gamma += -learning_rate * self.gradGamma
            self.beta += -learning_rate * self.gradBeta
 def __init__(self,
              input_size,
              num_neurons,
              Activation=ActivationType.SIGMOID,
              last_layer=False,
              drop_out=False,
              drop_percent=0.2):
     """
     Build a layer with randomly initialised parameters.

     Parameters:
     - input_size: int
         Number of inputs of the layer (rows of W)
     - num_neurons: int
         Number of neurons in the layer (columns of W)
     - Activation: ActivationType
         Activation applied by every neuron of the layer
     - last_layer: bool
         Marks the output layer; only an output layer can compute a loss
     - drop_out: bool
         Whether the layer supports dropout
     - drop_percent: float
         Share of neurons randomly shut down during training
     """
     # configuration
     self.input_size = input_size
     self.num_neurons = num_neurons
     self.Activation = Activation
     self.last_layer = last_layer
     self.drop_out = drop_out
     self.drop_percent = drop_percent
     self.mode = 'Train'
     self.batchnorm = False

     # trainable parameters, drawn uniformly from [-0.1, 0.1)
     bias_shape = (1, num_neurons)
     self.W = np.random.uniform(-0.1, 0.1, (input_size, num_neurons))
     self.b = np.random.uniform(-0.1, 0.1, bias_shape)
     self.gradW = np.zeros_like(self.W)
     self.gradb = np.zeros_like(self.b)

     # values cached by the forward / backward passes
     self.a = None  # output of the layer
     self.delta = None
     self.input_data = None
     self.dropM = None

     # the optimizer computes the new parameter values
     self.Optimizer = DefaultOptimizer()

     #------------------------------------ for Batch Normalization --------------------------------#
     if last_layer:
         return
     self.gamma = np.random.uniform(-0.1, 0.1, bias_shape)
     self.beta = np.random.uniform(-0.1, 0.1, bias_shape)
     self.gradGamma = np.zeros_like(self.gamma)
     self.gradBeta = np.zeros_like(self.beta)
     self.sihat = None
     self.sb = None
     self.variance = None
     self.mean = None
     self.deltaBN = None
     self.cumulative_mean = 0
     self.cumulative_variance = 0
     self.EPSILON = 10**(-6)
class Layer(object):
    """
    Fully connected layer with optional batch normalization and dropout.

    The layer caches the input, activations and deltas of the most recent
    forward / backward pass as attributes; update_gradients and
    update_thetas work on those cached values.
    """
    def __init__(self,
                 input_size,
                 num_neurons,
                 Activation=ActivationType.SIGMOID,
                 last_layer=False,
                 drop_out=False,
                 drop_percent=0.2):
        """
        Constructs the layer
        Parameters:
        - input_size: int
            input size of the layer
        - num_neurons: int
            number of neurons in layer
        - Activation: ActivationType
            Activation type of all neurons in layer
        - last_layer: bool
            Specifies if layer is last layer. Only if last_layer = True, we can use calculate_loss method
        - drop_out: bool
            If layer supports dropout
        - drop_percent: float
            Percentage of neurons to be randomly shut down during training
        """
        # Trainable parameters, drawn uniformly from [-0.1, 0.1)
        self.W = np.random.uniform(low=-0.1,
                                   high=0.1,
                                   size=(input_size, num_neurons))
        self.b = np.random.uniform(low=-0.1, high=0.1, size=(1, num_neurons))
        self.num_neurons = num_neurons
        self.input_size = input_size
        self.Activation = Activation
        self.a = None  #The output of the layer
        self.gradW = np.zeros(shape=self.W.shape)
        self.gradb = np.zeros(shape=self.b.shape)
        self.last_layer = last_layer
        self.delta = None
        self.input_data = None
        self.Optimizer = DefaultOptimizer(
        )  #We use the optimizer to calculate new parameters
        self.batchnorm = False
        # NOTE(review): dropM is never assigned inside this class; presumably
        # the owner of the layer sets the dropout mask - confirm.
        self.dropM = None
        self.drop_percent = drop_percent
        self.drop_out = drop_out
        self.mode = 'Train'
        #------------------------------------ for Batch Normalization --------------------------------#
        if not last_layer:
            self.gamma = np.random.uniform(low=-0.1,
                                           high=0.1,
                                           size=self.b.shape)
            self.beta = np.random.uniform(low=-0.1,
                                          high=0.1,
                                          size=self.b.shape)
            self.sihat = None
            self.sb = None
            self.variance = None
            self.mean = None
            self.deltaBN = None
            self.gradGamma = np.zeros(shape=self.gamma.shape)
            self.gradBeta = np.zeros(shape=self.beta.shape)
            self.cumulative_mean = 0
            self.cumulative_variance = 0
            self.EPSILON = 10**(-6)

    def forward_pass(self, input_data, mode='Train', batchnorm=False):
        """
        Forward passing data.
        Parameters:
        - input_data: np.ndarray
            Represents data being passed to the layer. Once we forward pass, the layer stores the value as a property
        - mode: str
            'Train' if in training stage. Test otherwise
        - batchnorm: bool
            Whether to perform batch normalization

        Returns:
        - self.a: np.ndarray
            Input for next layer. If self is last layer, this is the actual output

        Raises
         - Exception: Exception
            Raised when the activation type is unknown, or when SOFTMAX is requested on a layer that is not the last layer
        """
        self.input_data = input_data
        s1 = input_data @ self.W + self.b
        self.batchnorm = batchnorm

        #-------------------------- for batch normalization --------------------------------#
        if (not self.last_layer) and self.batchnorm:
            if mode == 'Train':
                # Batch statistics, also folded into the running averages
                # that are used outside of training
                self.variance = np.var(s1,
                                       axis=0,
                                       keepdims=True,
                                       dtype=np.float64)
                self.mean = np.mean(s1, axis=0, keepdims=True)
                self.cumulative_mean = 0.9 * self.cumulative_mean + (
                    1 - 0.9) * self.mean
                self.cumulative_variance = 0.9 * self.cumulative_variance + (
                    1 - 0.9) * self.variance
            else:
                self.mean = self.cumulative_mean
                self.variance = self.cumulative_variance
            self.sihat = (s1 - self.mean) / np.sqrt(self.variance +
                                                    self.EPSILON)
            self.sb = self.gamma * self.sihat + self.beta
            s1 = self.sb

        #------------------------- Calculating activations ------------------------------#
        if (self.Activation == ActivationType.SIGMOID):
            self.a = Functions.sigmoid(s1)
        elif (self.Activation == ActivationType.RELU):
            self.a = Functions.relu(s1)
        elif (self.Activation == ActivationType.TANH):
            self.a = Functions.tanh(s1)
        elif (self.Activation == ActivationType.SOFTMAX and self.last_layer):
            self.a = Functions.softmax(s1)
        else:
            raise Exception(
                "Please change boolean parameter or activation type")

        #Zeroing out some activations with the dropout mask
        if (not self.last_layer) and self.drop_out and (mode == 'Train'):
            self.a = self.a * self.dropM

        return self.a

    def back_propagate(self, weighted_deltas):
        """
        Back propagates weighted deltas back to network
        Parameters:
        - weighted_deltas: ndarray
            Deltas of the next layer weighted by next layer's W matrix.
            For a SOFTMAX output layer this is the expected result Y.

        Returns
            weighted deltas of self as np.ndarray

        Raises
         - Exception: Exception
            Raised when the activation type is unknown, or when SOFTMAX is used on a layer that is not the last layer
        """

        #------------------------ Back propagating --------------------------#
        if (self.Activation == ActivationType.SIGMOID):
            self.delta = weighted_deltas * self.a * (1 - self.a)
        elif (self.Activation == ActivationType.RELU):
            if not self.batchnorm:
                # The comparison has to be strict: assuming Functions.relu is
                # the usual max(0, x), self.a is never negative, so ">= 0"
                # would treat every unit as active and pass gradients
                # through units that were clamped to zero.
                derivative = np.where(self.a > 0, 1, 0)
            else:
                derivative = np.where(
                    self.a > self.EPSILON, 1,
                    self.EPSILON)  #Just for batch normalization
            self.delta = weighted_deltas * derivative
        elif (self.Activation == ActivationType.TANH):
            self.delta = weighted_deltas * (1 - self.a**2)
        elif (self.Activation == ActivationType.SOFTMAX and self.last_layer):
            #the weighted deltas for this case are the expected result Y
            self.delta = (self.a - weighted_deltas)
        else:
            raise Exception("Please change last_layer or activation type")
        if (not self.last_layer) and self.drop_out:
            self.delta = self.delta * self.dropM
        if self.last_layer or (not self.batchnorm):
            return (self.W @ self.delta.T).T

        #------------------------- Calculating deltabn and back propagate ---------------------#
        denominator = self.input_data.shape[0] * np.sqrt(self.variance +
                                                         10**(-8))
        self.deltaBN = self.delta * self.gamma * (
            self.input_data.shape[0] - 1 - self.sihat**2) / denominator
        return (self.W @ self.deltaBN.T).T

    def update_gradients(self, regularized=True, reg_val=0.01):
        """
        Compute the batch-averaged gradients from the cached input and deltas,
        with regularization by default. The gradients of the previous call are
        overwritten.

        Parameters:
        - regularized: bool
            If regularization is implemented in layer
        - reg_val: float
            Regularization value
        """

        if self.last_layer or (not self.batchnorm):
            self.gradW = (self.input_data.T @ self.delta
                          ) / self.input_data.shape[0]  #Average gradient
            self.gradb = np.mean(self.delta, axis=0,
                                 keepdims=True)  #average gradient
        else:
            #-------------------------- for Batch Norm --------------------------#
            self.gradW = (
                self.input_data.T @ self.deltaBN) / self.input_data.shape[0]
            self.gradb = np.mean(self.deltaBN, axis=0, keepdims=True)
            self.gradBeta = np.mean(self.delta, axis=0, keepdims=True)
            self.gradGamma = np.mean(self.sihat * self.delta,
                                     axis=0,
                                     keepdims=True)
        if regularized:
            self.gradW += reg_val * self.W

    def update_thetas(self, learning_rate=0.01, optimizer=''):
        """
        Apply one update step to the parameters of the layer.

        W and b are replaced by the values the layer's optimizer returns
        (newWeight / newBias) for the current gradW / gradb. When batch
        normalization is active on a non-output layer, gamma and beta take a
        plain gradient-descent step. The stored gradients are not modified.

        Parameters:
        - learning_rate: float
            Learning rate to use to update thetas
        - optimizer: str
            Which optimizer to use. 'Adam' switches the layer to an Adam
            optimizer; any other value keeps the optimizer already installed
        """
        # Install Adam only once. A fresh instance on every call would discard
        # whatever state the optimizer keeps between update steps.
        if optimizer == 'Adam' and not isinstance(self.Optimizer, Adam):
            self.Optimizer = Adam()
        self.W = self.Optimizer.newWeight(self.W, self.gradW, learning_rate)
        self.b = self.Optimizer.newBias(self.b, self.gradb, learning_rate)
        if (not self.last_layer) and self.batchnorm:
            self.gamma += -learning_rate * self.gradGamma
            self.beta += -learning_rate * self.gradBeta

    def calculate_loss(self, y_expected):
        """
        Calculates loss only at last layer

        Parameters:
        - y_expected: ndarray
            Expected value of the last layer

        Returns:
        - float
            Loss value

        Raises:
        - Exception: Exception
            When layer is not last layer and client wants to calculate loss
        """
        if not self.last_layer:
            raise Exception("Loss can only be calculated at last layer")
        return self.loss(y_expected)

    def loss(self, y_expected):
        """
        Cross-entropy between y_expected and the cached activations, summed
        over all entries.
        """
        # Floor the activations so an exact zero cannot produce log(0),
        # which would turn the loss into inf or NaN.
        safe_a = np.maximum(self.a, 1e-12)
        return np.sum(-y_expected * np.log(safe_a))

    def __str__(self):
        """
        String representation of layer object
        """
        layer_type = "Output" if self.last_layer else "Hidden"
        funcType = ''
        if self.Activation == ActivationType.SIGMOID:
            funcType = "SIGMOID"
        elif self.Activation == ActivationType.RELU:
            funcType = "RELU"
        elif self.Activation == ActivationType.SOFTMAX:
            funcType = "SOFTMAX"
        else:
            funcType = "TANH"
        return "(Type: %s, Activation: %s, Size: %d)" % (layer_type, funcType,
                                                         self.num_neurons)
Beispiel #7
0
def _build_mlp(activation_factory, hidden_nb, loss_factory, dropout=None):
    """Assemble the 2 -> hidden -> hidden -> hidden -> 2 network used by every experiment."""
    # `dropout` is forwarded to the two inner Linear layers only when it was
    # requested, so the plain Linear(in, out) call is used otherwise.
    inner_kwargs = {} if dropout is None else {'dropout': dropout}
    # Arguments are evaluated left to right: layers are created in network
    # order and the loss object last.
    return Sequential(Linear(2, hidden_nb),
                      activation_factory(),
                      Linear(hidden_nb, hidden_nb, **inner_kwargs),
                      activation_factory(),
                      Linear(hidden_nb, hidden_nb, **inner_kwargs),
                      activation_factory(),
                      Linear(hidden_nb, 2),
                      loss=loss_factory())


def _train_and_evaluate(model, optimizer, epochs, datasets, Sample_number,
                        batch_size, save_plot, mname):
    """Train `model`, evaluate it, and return (training losses, performance)."""
    train_input, train_target, test_input, test_target = datasets
    losses = train_model(model, train_input, train_target, optimizer, epochs,
                         Sample_number, batch_size)
    perf = evaluate_model(model,
                          train_input,
                          train_target,
                          test_input,
                          test_target,
                          losses,
                          save_plot,
                          mname=mname)
    return losses, perf


def _save_loss_comparison(curves, labels, path):
    """Plot several loss curves into one figure and save it to `path`.

    `curves` is a sequence of (epochs, losses, plot_kwargs) triples.
    """
    fig = plt.figure(figsize=(10, 4))
    for epochs, losses, plot_kwargs in curves:
        plt.plot(range(0, epochs), losses, **plot_kwargs)
    plt.legend(labels)
    plt.title("Loss")
    plt.xlabel("Epochs")
    plt.savefig(path, bbox_inches='tight')
    plt.close(fig)


def run_all_model(train_input,
                  train_target,
                  test_input,
                  test_target,
                  Sample_number,
                  save_plot=False):
    """
    Train and evaluate eight variants of the same three-hidden-layer network.

    The variants differ in optimizer (SGD, DecreaseSGD, Adam), dropout,
    activation (ReLU, Tanh, Sigmoid), weight initialization and loss
    (CrossEntropy, MSE). When save_plot is true, three comparison plots of
    the training losses are written below 'output/'.

    Parameters:
    - train_input, train_target: training data handed to train_model and
        evaluate_model
    - test_input, test_target: test data handed to evaluate_model
    - Sample_number: forwarded unchanged to train_model
    - save_plot: whether plots are produced (also forwarded to
        evaluate_model)

    Returns:
    - (train_loss, train_error, test_loss, test_error): four torch tensors
        with one entry per model, built from elements 0..3 of each model's
        evaluate_model result
    """
    # Define constants along the test
    hidden_nb = 25
    std = 0.1
    eta = 3e-1
    batch_size = 200
    epochs_number = 1000
    datasets = (train_input, train_target, test_input, test_target)

    def run(model, optimizer, epochs, mname):
        # Shared train + evaluate step; mname labels the plots of the model
        return _train_and_evaluate(model, optimizer, epochs, datasets,
                                   Sample_number, batch_size, save_plot,
                                   mname)

    # Model 1. No dropout; constant learning rate (SGD)
    print('\nModel 1: Optimizer: SGD; No dropout; ReLU; CrossEntropy')
    model_1 = _build_mlp(Relu, hidden_nb, CrossEntropy)
    model_1.normalize_parameters(mean=0, std=std)
    my_loss_1, model_1_perf = run(model_1, Sgd(eta), epochs_number, 'Model1')

    # Model 2. No dropout; decreasing learning rate (DecreaseSGD)
    print('\nModel 2: Optimizer: DecreaseSGD; No dropout; ReLU; CrossEntropy')
    model_2 = _build_mlp(Relu, hidden_nb, CrossEntropy)
    model_2.normalize_parameters(mean=0, std=std)
    my_loss_2, model_2_perf = run(model_2, DecreaseSGD(eta), epochs_number,
                                  'Model2')

    # Model 3. No dropout; Adam Optimizer
    print('\nModel 3: Optimizer: Adam; No dropout; ReLU; CrossEntropy')
    # Custom hyperparameters
    eta_adam = 1e-3
    epochs_number_adam = 500
    model_3 = _build_mlp(Relu, hidden_nb, CrossEntropy)
    model_3.normalize_parameters(mean=0, std=std)
    my_loss_3, model_3_perf = run(model_3, Adam(eta_adam, 0.9, 0.99, 1e-8),
                                  epochs_number_adam, 'Model3')

    # PLOT TO COMPARE OPTIMIZERS
    if save_plot:
        _save_loss_comparison(
            [(epochs_number, my_loss_1, {'linewidth': 1}),
             (epochs_number, my_loss_2, {'linewidth': 1}),
             (epochs_number_adam, my_loss_3, {'linewidth': 1})],
            ["SGD", "Decreasing SGD", "Adam"],
            'output/compare_optimizers.pdf')

    # Model 4. Dropout; SGD
    print('\nModel 4: Optimizer: SGD; Dropout; ReLU; CrossEntropy')
    dropout = 0.15
    model_4 = _build_mlp(Relu, hidden_nb, CrossEntropy, dropout=dropout)
    model_4.normalize_parameters(mean=0, std=std)
    my_loss_4, model_4_perf = run(model_4, Sgd(eta), epochs_number, 'Model4')

    # PLOT TO COMPARE DROPOUT AND NO DROPOUT
    if save_plot:
        _save_loss_comparison(
            [(epochs_number, my_loss_1, {'linewidth': 1}),
             (epochs_number, my_loss_4, {'linewidth': 1})],
            ["Without Dropout", "With Dropout"],
            'output/compare_dropout.pdf')

    print('\nEvaluation of different activation functions\n')

    # Model 5. No Dropout; SGD; Tanh
    print('\nModel 5: Optimizer: SGD; No dropout; Tanh; CrossEntropy')
    model_5 = _build_mlp(Tanh, hidden_nb, CrossEntropy)
    model_5.normalize_parameters(mean=0, std=std)
    my_loss_5, model_5_perf = run(model_5, Sgd(eta), epochs_number, 'Model5')

    # Model 6. Xavier Initialization
    print(
        '\nModel 6: Optimizer: SGD; No dropout; Tanh; Xavier initialization; CrossEntropy'
    )
    model_6 = _build_mlp(Tanh, hidden_nb, CrossEntropy)
    model_6.xavier_parameters()
    # NOTE(review): this model uses Sgd's default learning rate rather than
    # `eta` - kept as in the original, confirm it is intended.
    my_loss_6, model_6_perf = run(model_6, Sgd(), epochs_number, 'Model6')

    # Model 7. Sigmoid
    print('\nModel 7: Optimizer: SGD; No dropout; Sigmoid; CrossEntropy')
    # Define parameter for sigmoid activation
    p_lambda = 0.1
    model_7 = _build_mlp(lambda: Sigmoid(p_lambda), hidden_nb, CrossEntropy)
    model_7.normalize_parameters(mean=0.5, std=1)
    my_loss_7, model_7_perf = run(model_7, Sgd(eta=0.5), epochs_number,
                                  'Model7')

    # PLOT TO COMPARE EFFECT OF DIFFERENT ACTIVATIONS
    if save_plot:
        _save_loss_comparison(
            [(epochs_number, my_loss_1, {'linewidth': 0.5}),
             (epochs_number, my_loss_5, {'linewidth': 0.5, 'alpha': 0.8}),
             (epochs_number, my_loss_6, {'linewidth': 0.5, 'alpha': 0.8}),
             (epochs_number, my_loss_7, {'linewidth': 0.5})],
            ["Relu", "Tanh", "Tanh (Xavier)", "Sigmoid"],
            'output/compare_activations.pdf')

    print('\nEvaluation of base model with MSE loss\n')

    # Model 8. MSE loss
    print('\nModel 8: Optimizer: SGD; No dropout; Relu; MSE')
    model_8 = _build_mlp(Relu, hidden_nb, LossMSE)
    model_8.normalize_parameters(mean=0, std=std)
    my_loss_8, model_8_perf = run(model_8, Sgd(eta), epochs_number, 'Model8')

    print('Evaluation done! ')

    # One tensor per metric, ordered Model 1 .. Model 8
    all_perf = [
        model_1_perf, model_2_perf, model_3_perf, model_4_perf, model_5_perf,
        model_6_perf, model_7_perf, model_8_perf
    ]
    train_loss = torch.tensor([perf[0] for perf in all_perf])
    train_error = torch.tensor([perf[1] for perf in all_perf])
    test_loss = torch.tensor([perf[2] for perf in all_perf])
    test_error = torch.tensor([perf[3] for perf in all_perf])

    return train_loss, train_error, test_loss, test_error
Beispiel #8
0
#
# for color, dim in zip(("blue", "green", "red"), range(X_train.shape[1])):
#     plt.scatter(X_train[:, dim], y_train, marker="^", color=color)
# plt.show()


# from Models.NeuralNetworks import Layers
# from Models.NeuralNetworks import Sequential
#
# model = Sequential()
# model.add(Layers.Dense(5, "Relu"))
# model.add(Layers.Dense(10, "Relu"))
# model.build("MeanSquaredError", "Adam")
# model.call(X_train, y_train)
# print(model.layers[1].inputs.shape)
# print(model.layers[0].inputs.shape)
# print(model.layers[0].outputs.shape)


from Models.LinearModels import LogisticRegression
from Optimizers import Adam
from Losses import MAE

# Build a LogisticRegression from the project's own Models package, handing it
# an Adam optimizer instance and an MAE loss instance.
# NOTE(review): the meaning of the first argument (2000) is not visible here --
# presumably an iteration/epoch count; confirm against LogisticRegression.
model = LogisticRegression(2000, Adam(), MAE())
# Calling the model with the training data; presumably this runs the fit --
# confirm against LogisticRegression.__call__.
# X_train / y_train / X_test / y_test must be defined earlier in the script.
model(X_train, y_train)
predictions = model.inference(X_test)
# Report the MAE between the test targets and the predictions.
print(MAE()(y_test, predictions))
#
# predictions = model.inference(X_test)
# for index in range(len(y_test)):
#     print(y_test[index], predictions[index])
Beispiel #9
0
class VES:
    """
    Basic version of OpenAI Evolution Strategies.

    Keeps a Gaussian search distribution described by a mean vector ``mu``
    and a scalar standard deviation ``sigma``. ``ask`` samples candidates
    from it and ``tell`` moves the mean using the step returned by the
    optimizer.
    """
    def __init__(self,
                 num_params,
                 mu_init=None,
                 sigma_init=0.1,
                 lr=10**-2,
                 pop_size=256,
                 antithetic=True,
                 weight_decay=0,
                 rank_fitness=True):
        """
        Args:
            num_params: number of parameters being optimised.
            mu_init: optional initial mean; zeros are used when ``None``.
            sigma_init: standard deviation of the sampling noise.
            lr: learning rate handed to the Adam optimizer.
            pop_size: number of candidates per generation.
            antithetic: sample noise in mirrored (+eps, -eps) pairs.
            weight_decay: coefficient passed to ``compute_weight_decay``;
                disabled when not strictly positive.
            rank_fitness: replace raw scores by centered ranks.
        """
        # misc
        self.num_params = num_params
        self.first_interation = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            # Force a float copy: tell() updates the mean in place, which
            # raises a casting error on an integer array.
            self.mu = np.array(mu_init, dtype=float)
        self.sigma = sigma_init

        # optimization stuff
        self.learning_rate = lr
        self.optimizer = Adam(self.learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness

    def ask(self):
        """
        Returns an array of candidate parameters, one row per candidate
        (shape ``(pop_size, num_params)``).
        """
        if self.antithetic:
            # Second half of the population mirrors the first half.
            epsilon_half = np.random.randn(self.pop_size // 2, self.num_params)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])

        else:
            epsilon = np.random.randn(self.pop_size, self.num_params)

        return self.mu + epsilon * self.sigma

    def tell(self, scores, solutions):
        """
        Updates the mean of the distribution.

        Args:
            scores: one score per candidate; must have ``pop_size`` entries.
            solutions: the candidates the scores belong to, as returned by
                ``ask``.

        Raises:
            AssertionError: if ``len(scores)`` differs from ``pop_size``.
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        # Float copy so the in-place weight-decay addition below also works
        # when the scores are integers.
        reward = np.array(scores, dtype=float)
        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # Recover the unit-scale noise that produced each candidate.
        epsilon = (solutions - self.mu) / self.sigma
        grad = -1 / (self.sigma * self.pop_size) * np.dot(reward, epsilon)

        # optimization step
        step = self.optimizer.step(grad)
        self.mu += step

    def get_distrib_params(self):
        """
        Returns the parameters of the distribution: a copy of the mean and
        the variance (``sigma ** 2``, not the standard deviation).
        """
        return np.copy(self.mu), np.copy(self.sigma**2)
Beispiel #10
0
class GES:
    """
    Guided Evolution Strategies.

    Like a plain evolution strategy, but part of the sampling noise is
    drawn inside the subspace spanned by the columns of ``U`` (shape
    ``(num_params, k)``); ``alpha`` sets the split between full-space and
    subspace noise.
    """
    def __init__(self,
                 num_params,
                 mu_init=None,
                 sigma_init=0.1,
                 lr=10**-2,
                 alpha=0.5,
                 beta=2,
                 k=1,
                 pop_size=256,
                 antithetic=True,
                 weight_decay=0,
                 rank_fitness=False):
        """
        Args:
            num_params: number of parameters being optimised.
            mu_init: optional initial mean; zeros are used when ``None``.
            sigma_init: standard deviation of the sampling noise.
            lr: learning rate handed to the Adam optimizer.
            alpha: weight of the full-space noise; ``1 - alpha`` goes to the
                guiding subspace.
            beta: scale factor applied to the gradient estimate.
            k: number of columns of the guiding matrix ``U``.
            pop_size: number of candidates per generation.
            antithetic: sample noise in mirrored (+eps, -eps) pairs.
            weight_decay: coefficient passed to ``compute_weight_decay``;
                disabled when not strictly positive.
            rank_fitness: replace raw scores by centered ranks.
        """
        # misc
        self.num_params = num_params
        self.first_interation = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            # Float copy: tell() updates the mean in place.
            self.mu = np.array(mu_init, dtype=float)
        self.sigma = sigma_init
        self.U = np.ones((self.num_params, k))

        # optimization stuff
        self.alpha = alpha
        self.beta = beta
        self.k = k
        self.learning_rate = lr
        self.optimizer = Adam(self.learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness

    def _sample_noise(self, count):
        """Draws ``count`` rows of full-space plus subspace noise."""
        full_space = np.sqrt(self.alpha / self.num_params) * \
            np.random.randn(count, self.num_params)
        # The subspace draw has k columns so that (count, k) @ (k, num_params)
        # lands back in parameter space.
        subspace = np.sqrt((1 - self.alpha) / self.k) * \
            np.random.randn(count, self.k) @ self.U.T
        return full_space + subspace

    def ask(self):
        """
        Returns an array of candidate parameters, one row per candidate
        (shape ``(pop_size, num_params)``).
        """
        if self.antithetic:
            epsilon_half = self._sample_noise(self.pop_size // 2)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])

        else:
            # Same sampling as the antithetic branch, just without mirroring.
            epsilon = self._sample_noise(self.pop_size)

        return self.mu + epsilon * self.sigma

    def tell(self, scores, solutions):
        """
        Updates the mean of the distribution.

        Args:
            scores: one score per candidate; must have ``pop_size`` entries.
            solutions: the candidates the scores belong to, as returned by
                ``ask``.

        Raises:
            AssertionError: if ``len(scores)`` differs from ``pop_size``.
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        # Float copy so the in-place weight-decay addition below also works
        # when the scores are integers.
        reward = np.array(scores, dtype=float)
        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        epsilon = (solutions - self.mu) / self.sigma
        grad = -self.beta/(self.sigma * self.pop_size) * \
            np.dot(reward, epsilon)

        # optimization step
        step = self.optimizer.step(grad)
        self.mu += step

    def add(self, params, grads, fitness):
        """
        Stores a new (normalised) "gradient" in the last column of U.

        Args:
            params: optional new mean; ignored when ``None``.
            grads: gradient vector of length ``num_params``.
            fitness: unused; kept for interface compatibility.
        """
        if params is not None:
            # Copy instead of aliasing: tell() modifies the mean in place and
            # must not write into the caller's array.
            self.mu = np.array(params, dtype=float)
        # NOTE(review): a zero gradient makes this a division by zero.
        grads = grads / np.linalg.norm(grads)
        # NOTE(review): only the last column is ever replaced, so for k > 1
        # the remaining columns keep their initial value -- confirm intent.
        self.U[:, -1] = grads

    def get_distrib_params(self):
        """
        Returns the parameters of the distribution: a copy of the mean and
        the variance (``sigma ** 2``, not the standard deviation).
        """
        return np.copy(self.mu), np.copy(self.sigma**2)
Beispiel #11
0
class OpenES:
    """
    Basic version of OpenAI Evolution Strategies with annealing of both the
    sampling standard deviation and the learning rate.
    """
    def __init__(
        self,
        num_params,  # number of model parameters
        mu_init=None,  # initial mean
        sigma_init=1,  # initial standard deviation
        sigma_decay=0.999,  # anneal standard deviation
        sigma_limit=0.01,  # stop annealing if less than
        learning_rate=0.01,  # step size handed to the optimizer
        learning_rate_decay=0.9999,  # annealing the learning rate
        learning_rate_limit=0.001,  # stop annealing learning rate
        pop_size=256,  # population size
        antithetic=False,  # whether to use anti sampling
        weight_decay=0.01,  # weight decay coefficient
        rank_fitness=True,  # use rank rather than fitness
        forget_best=True):  # forget historical best

        # misc
        self.num_params = num_params
        self.first_interation = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            self.mu = np.array(mu_init, dtype=float)
        self.sigma_decay = sigma_decay
        self.sigma = sigma_init
        self.sigma_init = sigma_init
        self.sigma_limit = sigma_limit

        # optimization stuff
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        # The optimizer receives this instance; tell() never touches self.mu
        # itself, so presumably optimizer.update() applies the step to it --
        # confirm against the Adam implementation in use.
        self.optimizer = Adam(self, learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"
        self.forget_best = forget_best
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            # ranks are not comparable across generations
            self.forget_best = True

    def ask(self, pop_size=None):
        """
        Returns an array of candidate parameters, one row per candidate.

        Args:
            pop_size: number of candidates to draw; defaults to the
                population size given at construction. tell() only accepts
                batches of exactly ``self.pop_size`` scores.
        """
        count = self.pop_size if pop_size is None else pop_size

        if self.antithetic:
            # Second half of the population mirrors the first half.
            epsilon_half = np.random.randn(count // 2, self.num_params)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])

        else:
            epsilon = np.random.randn(count, self.num_params)

        return self.mu.reshape(1, self.num_params) + epsilon * self.sigma

    def tell(self, solutions, scores):
        """
        Updates the distribution.

        Args:
            solutions: the candidates, as returned by ``ask``.
            scores: one score per candidate; must have ``pop_size`` entries.

        Raises:
            AssertionError: if ``len(scores)`` differs from ``pop_size``.
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        # Float copy so the in-place weight-decay addition below also works
        # when the scores are integers.
        reward = np.array(scores, dtype=float)

        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # Recover the unit-scale noise that produced each candidate.
        epsilon = (solutions -
                   self.mu.reshape(1, self.num_params)) / self.sigma

        # standardize the rewards to zero mean and unit variance
        centered_reward = reward - np.mean(reward)
        reward_spread = np.std(reward)
        if reward_spread == 0:
            # All rewards equal: there is no signal, and dividing would turn
            # the whole update into NaN.
            normalized_reward = np.zeros_like(centered_reward)
        else:
            normalized_reward = centered_reward / reward_spread
        change_mu = 1. / (self.pop_size * self.sigma) * \
            np.dot(epsilon.T, normalized_reward)

        # updating stuff
        idx = np.argsort(reward)[::-1]
        best_reward = reward[idx[0]]
        best_mu = solutions[idx[0]]

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_interation:
            self.first_interation = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward > self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # optimization step
        self.optimizer.stepsize = self.learning_rate
        self.optimizer.update(-change_mu)

        # anneal sigma until it reaches its lower limit
        if (self.sigma > self.sigma_limit):
            self.sigma *= self.sigma_decay

        # anneal the learning rate until it reaches its lower limit
        if (self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def get_distrib_params(self):
        """
        Returns the parameters of the distribution:
        the mean and sigma
        """
        return self.mu, self.sigma

    def result(self):
        """
        Returns best params so far, best score, current score
        and sigma. Only valid after the first call to tell().
        """
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)

    def rms_stdev(self):
        """Returns the mean absolute value of sigma."""
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))
Beispiel #12
0
class PEPG:
    '''Extension of PEPG with bells and whistles.

    Samples mirrored perturbations around a mean vector with a per-parameter
    standard deviation, then adapts both the mean and the standard deviations
    from the reported scores.
    '''
    def __init__(
        self,
        num_params,  # number of model parameters
        sigma_init=0.10,  # initial standard deviation
        sigma_alpha=0.20,  # learning rate for std
        sigma_decay=0.999,  # anneal standard deviation
        sigma_limit=0.01,  # stop annealing if less than
        sigma_max_change=0.2,  # clips adaptive sigma to 20%
        learning_rate=0.01,  # step size handed to the optimizer
        learning_rate_decay=0.9999,  # annealing the learning rate
        learning_rate_limit=0.01,  # stop annealing learning rate
        elite_ratio=0,  # if >0 then ignore learning_rate
        pop_size=256,  # population size
        average_baseline=True,  # set baseline to average
        weight_decay=0.01,  # weight decay coefficient
        rank_fitness=True,  # use rank rather than fitness
        forget_best=True):  # don't keep the hist best sol

        self.num_params = num_params
        self.sigma_init = sigma_init
        self.sigma_alpha = sigma_alpha
        self.sigma_decay = sigma_decay
        self.sigma_limit = sigma_limit
        self.sigma_max_change = sigma_max_change
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.pop_size = pop_size
        self.average_baseline = average_baseline
        # From here on self.pop_size is the number of mirrored PAIRS, i.e.
        # half of the requested population (the extra odd member, if any, is
        # the unperturbed mean used as baseline).
        if self.average_baseline:
            assert (self.pop_size % 2 == 0), "Population size must be even"
            self.pop_size = int(self.pop_size / 2)
        else:
            assert (self.pop_size & 1), "Population size must be odd"
            self.pop_size = int((self.pop_size - 1) / 2)

        # option to use greedy es method to select next mu,
        # rather than using drift param
        self.elite_ratio = elite_ratio
        self.elite_pop_size = int(self.pop_size * self.elite_ratio)
        self.use_elite = False
        if self.elite_pop_size > 0:
            self.use_elite = True

        self.forget_best = forget_best
        self.batch_reward = np.zeros(self.pop_size * 2)
        self.mu = np.zeros(self.num_params)
        self.sigma = np.ones(self.num_params) * self.sigma_init
        self.curr_best_mu = np.zeros(self.num_params)
        self.best_mu = np.zeros(self.num_params)
        self.best_reward = 0
        self.first_interation = True
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True  # always forget the best one if we rank
        # choose optimizer
        # The optimizer receives this instance; the non-elite path of tell()
        # never touches self.mu itself, so presumably optimizer.update()
        # applies the step to it -- confirm against the Adam in use.
        self.optimizer = Adam(self, learning_rate)

    def rms_stdev(self):
        '''returns the mean absolute value of sigma'''
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))

    def ask(self):
        '''returns a list of parameters

        The result has 2 * self.pop_size rows, preceded by one extra row
        holding the unperturbed mean when average_baseline is False.
        '''
        # antithetic sampling
        self.epsilon = np.random.randn(self.pop_size, self.num_params)
        self.epsilon *= self.sigma.reshape(1, self.num_params)
        self.epsilon_full = np.concatenate([self.epsilon, -self.epsilon])
        if self.average_baseline:
            epsilon = self.epsilon_full
        else:
            # first population is mu, then positive epsilon,
            # then negative epsilon
            epsilon = np.concatenate(
                [np.zeros((1, self.num_params)), self.epsilon_full])
        solutions = self.mu.reshape(1, self.num_params) + epsilon
        self.solutions = solutions
        return solutions

    def tell(self, scores):
        '''updates mean and sigma from the scores of the last ask() batch

        scores must hold one entry per row returned by ask(), in the same
        order. Raises AssertionError when the length does not match.
        '''
        # self.pop_size counts mirrored pairs (see __init__), so a full batch
        # holds twice that many scores, plus the baseline row if present.
        expected_scores = 2 * self.pop_size
        if not self.average_baseline:
            expected_scores += 1
        assert (len(scores) == expected_scores
                ), "Inconsistent reward_table size reported."

        # Float copy so the in-place weight-decay addition below also works
        # when the scores are integers.
        reward_table = np.array(scores, dtype=float)

        if self.rank_fitness:
            reward_table = compute_centered_ranks(reward_table)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, self.solutions)
            reward_table += l2_decay

        reward_offset = 1
        if self.average_baseline:
            b = np.mean(reward_table)
            reward_offset = 0
        else:
            b = reward_table[0]  # baseline

        # drop the baseline row so reward lines up with epsilon_full
        reward = reward_table[reward_offset:]
        if self.use_elite:
            idx = np.argsort(reward)[::-1][0:self.elite_pop_size]
        else:
            idx = np.argsort(reward)[::-1]

        best_reward = reward[idx[0]]
        if (best_reward > b or self.average_baseline):
            best_mu = self.mu + self.epsilon_full[idx[0]]
            best_reward = reward[idx[0]]
        else:
            # Copy: the elite update below changes self.mu in place and must
            # not drag the recorded best solution along with it.
            best_mu = np.copy(self.mu)
            best_reward = b

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_interation:
            self.sigma = np.ones(self.num_params) * self.sigma_init
            self.first_interation = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward > self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # short hand
        epsilon = self.epsilon
        sigma = self.sigma

        # update the mean

        # move mean to the average of the best idx means
        if self.use_elite:
            self.mu += self.epsilon_full[idx].mean(axis=0)
        else:
            # reward difference within each mirrored pair
            rT = (reward[:self.pop_size] - reward[self.pop_size:])
            change_mu = np.dot(rT, epsilon)
            self.optimizer.stepsize = self.learning_rate
            # adam, rmsprop, momentum, etc.
            self.optimizer.update(-change_mu)
            # self.mu += (change_mu * self.learning_rate) # normal SGD method

        # adaptive sigma
        # normalization
        if (self.sigma_alpha > 0):
            stdev_reward = 1.0
            if not self.rank_fitness:
                stdev_reward = reward.std()
                if stdev_reward == 0:
                    # identical rewards: avoid dividing by zero below
                    stdev_reward = 1.0
            S = epsilon * epsilon - (sigma * sigma).reshape(1, self.num_params)
            S /= sigma.reshape(1, self.num_params)
            reward_avg = (reward[:self.pop_size] +
                          reward[self.pop_size:]) / 2.0
            rS = reward_avg - b
            delta_sigma = (np.dot(rS, S)) / \
                (2 * self.pop_size * stdev_reward)

            # adjust sigma according to the adaptive sigma calculation
            # for stability, clip the change to sigma_max_change * sigma
            change_sigma = self.sigma_alpha * delta_sigma
            change_sigma = np.minimum(change_sigma,
                                      self.sigma_max_change * self.sigma)
            change_sigma = np.maximum(change_sigma,
                                      -self.sigma_max_change * self.sigma)
            self.sigma += change_sigma

        if (self.sigma_decay < 1):
            # only entries still above the limit keep shrinking
            self.sigma[self.sigma > self.sigma_limit] *= self.sigma_decay

        if (self.learning_rate_decay < 1
                and self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def current_param(self):
        '''returns the best solution of the latest generation'''
        return self.curr_best_mu

    def set_mu(self, mu):
        '''replaces the mean by a copy of mu'''
        self.mu = np.array(mu)

    def best_param(self):
        '''returns the best solution recorded so far'''
        return self.best_mu

    def result(self):
        # return best params so far, along with historically
        # best reward, curr reward, sigma
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)
Beispiel #13
0
from Layers import Dense_layer, Conv_layer, Pooling_layer, Dropout_layer
from Activation_Functions import ReLU, Softmax
from Loss_functions import Loss, Softmax, CategoricalCrossentropy, Act_Softmax_Loss_CCentropy
from Optimizers import SGD, Adam
from Model import Model, Accuracy, Accuracy_Categorical

# Example model
# Builds, configures and trains a small classifier with the project's own
# Model / layer classes. X_train, y_train, X_val and y_val must already be
# defined by the surrounding script.
hugh = Model()

# Dense layer with L2 regularisation on weights and biases.
# NOTE(review): the (2, 512) arguments are presumably input and output
# sizes -- confirm against Dense_layer.
hugh.add(
    Dense_layer(2,
                512,
                weight_regularizer_l2=0.0004,
                bias_regularizer_l2=0.0004))
hugh.add(ReLU())
hugh.add(Dropout_layer(0.2))
hugh.add(Dense_layer(512, 3))
# NOTE(review): Softmax is imported from both Activation_Functions and
# Loss_functions; the later import (Loss_functions) is the one bound here.
hugh.add(Softmax())

# Attach loss, optimizer and accuracy metric to the model.
hugh.setters(loss=CategoricalCrossentropy(),
             optimizer=Adam(learning_rate=0.05, decay=0.00005),
             accuracy=Accuracy_Categorical())

hugh.finalize()

# Train for 1000 epochs, passing the validation split along.
hugh.train(X_train,
           y_train,
           validation_data=(X_val, y_val),
           epochs=1000,
           print_every=100)
Beispiel #14
0
    def __init__(self, population_size=1, sigma=0, alpha=0, filename=''):
        """
        Creates every network parameter as a ``Param``.

        Args:
            population_size: passed to every ``Param``.
            sigma: passed to every ``Param``.
            alpha: stored on the instance.
            filename: optional ``.npz`` archive holding the arrays
                ``arr_0`` .. ``arr_27``; when empty, the parameters are
                freshly initialised instead.
        """
        self.population_size = population_size
        self.sigma = sigma
        self.alpha = alpha
        self.optimizer = Adam()

        if filename:
            # Attribute names in archive order: entry i is stored as 'arr_i'.
            stored_names = (
                'F1', 'F2', 'F3', 'F4', 'F5', 'F6',
                'g3', 'b3', 'g4', 'b4', 'g5', 'b5', 'g6', 'b6',
                'Wx0', 'bx0', 'Wx1', 'bx1', 'Wx2', 'bx2',
                'Wv', 'bv',
                'lg0', 'lb0', 'lg1', 'lb1', 'lg2', 'lb2',
            )
            # Context manager so the archive's file handle is released once
            # all arrays have been read.
            with np.load(filename) as npz:
                for index, name in enumerate(stored_names):
                    setattr(self, name,
                            Param(npz['arr_%d' % index], population_size,
                                  sigma))
        else:
            def wrap(tensor):
                # every parameter shares the same population settings
                return Param(tensor, population_size, sigma)

            def conv_filter(in_depth, out_depth):
                # filter weight is whdo
                # w = width
                # h = height
                # d = depth (in channels)
                # o = out depth (out channels)?
                return wrap(
                    tf.random.normal([F_size, F_size, in_depth, out_depth],
                                     stddev=m.sqrt(2 / F_size)))

            # The construction order below is kept exactly as before so the
            # sequence of random draws does not change.
            self.F1 = conv_filter(3, NF1_out)
            self.F2 = conv_filter(NF1_out, NF2_out)
            self.F3 = conv_filter(NF2_out, NF3_out)
            self.g3 = wrap(tf.ones((NF3_out, 1)))
            self.b3 = wrap(tf.zeros((NF3_out, 1)))
            self.F4 = conv_filter(NF3_out, NF4_out)
            self.g4 = wrap(tf.ones((NF4_out, 1)))
            self.b4 = wrap(tf.zeros((NF4_out, 1)))
            self.F5 = conv_filter(NF4_out, NF5_out)
            self.g5 = wrap(tf.ones((NF5_out, 1)))
            self.b5 = wrap(tf.zeros((NF5_out, 1)))
            self.F6 = conv_filter(NF5_out, NF6_out)
            self.g6 = wrap(tf.ones((NF6_out, 1)))
            self.b6 = wrap(tf.zeros((NF6_out, 1)))

            self.lg0 = wrap(tf.ones((H_size, 1)))
            self.lb0 = wrap(tf.zeros((H_size, 1)))
            self.lg1 = wrap(tf.ones((H_size, 1)))
            self.lb1 = wrap(tf.zeros((H_size, 1)))
            self.lg2 = wrap(tf.ones((H_size, 1)))
            self.lb2 = wrap(tf.zeros((H_size, 1)))

            self.Wx0 = wrap(tf.random.normal([H_size * 4, z_size]))
            self.bx0 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wx1 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
            self.bx1 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wx2 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
            self.bx2 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wv = wrap(tf.random.normal([Y_size, H_size]))
            self.bv = wrap(tf.zeros([Y_size, 1]))
Beispiel #15
0
class Parameters:
    """
    Container for every trainable ``Param`` of the network, together with
    the population-based update rules (NES and a simple genetic algorithm).
    """

    # Attribute names in canonical order. The same order is used for the
    # 'arr_<i>' entries of a saved archive and for the list built by all().
    _PARAM_NAMES = (
        'F1', 'F2', 'F3', 'F4', 'F5', 'F6',
        'g3', 'b3', 'g4', 'b4', 'g5', 'b5', 'g6', 'b6',
        'Wx0', 'bx0', 'Wx1', 'bx1', 'Wx2', 'bx2',
        'Wv', 'bv',
        'lg0', 'lb0', 'lg1', 'lb1', 'lg2', 'lb2',
    )

    def __init__(self, population_size=1, sigma=0, alpha=0, filename=''):
        """
        Creates every network parameter as a ``Param``.

        Args:
            population_size: passed to every ``Param``.
            sigma: passed to every ``Param``; also the noise scale used by
                ``mutate`` and ``update_nes``.
            alpha: scale factor applied to the gradients in ``update_nes``.
            filename: optional ``.npz`` archive holding the arrays
                ``arr_0`` .. ``arr_27``; when empty, the parameters are
                freshly initialised instead.
        """
        self.population_size = population_size
        self.sigma = sigma
        self.alpha = alpha
        self.optimizer = Adam()

        if filename:
            # Context manager so the archive's file handle is released once
            # all arrays have been read.
            with np.load(filename) as npz:
                for index, name in enumerate(self._PARAM_NAMES):
                    setattr(self, name,
                            Param(npz['arr_%d' % index], population_size,
                                  sigma))
        else:
            def wrap(tensor):
                # every parameter shares the same population settings
                return Param(tensor, population_size, sigma)

            def conv_filter(in_depth, out_depth):
                # filter weight is whdo
                # w = width
                # h = height
                # d = depth (in channels)
                # o = out depth (out channels)?
                return wrap(
                    tf.random.normal([F_size, F_size, in_depth, out_depth],
                                     stddev=m.sqrt(2 / F_size)))

            # The construction order below is kept exactly as before so the
            # sequence of random draws does not change.
            self.F1 = conv_filter(3, NF1_out)
            self.F2 = conv_filter(NF1_out, NF2_out)
            self.F3 = conv_filter(NF2_out, NF3_out)
            self.g3 = wrap(tf.ones((NF3_out, 1)))
            self.b3 = wrap(tf.zeros((NF3_out, 1)))
            self.F4 = conv_filter(NF3_out, NF4_out)
            self.g4 = wrap(tf.ones((NF4_out, 1)))
            self.b4 = wrap(tf.zeros((NF4_out, 1)))
            self.F5 = conv_filter(NF4_out, NF5_out)
            self.g5 = wrap(tf.ones((NF5_out, 1)))
            self.b5 = wrap(tf.zeros((NF5_out, 1)))
            self.F6 = conv_filter(NF5_out, NF6_out)
            self.g6 = wrap(tf.ones((NF6_out, 1)))
            self.b6 = wrap(tf.zeros((NF6_out, 1)))

            self.lg0 = wrap(tf.ones((H_size, 1)))
            self.lb0 = wrap(tf.zeros((H_size, 1)))
            self.lg1 = wrap(tf.ones((H_size, 1)))
            self.lb1 = wrap(tf.zeros((H_size, 1)))
            self.lg2 = wrap(tf.ones((H_size, 1)))
            self.lb2 = wrap(tf.zeros((H_size, 1)))

            self.Wx0 = wrap(tf.random.normal([H_size * 4, z_size]))
            self.bx0 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wx1 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
            self.bx1 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wx2 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
            self.bx2 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wv = wrap(tf.random.normal([Y_size, H_size]))
            self.bv = wrap(tf.zeros([Y_size, 1]))

    def all(self):
        """Returns every Param in canonical order."""
        return [getattr(self, name) for name in self._PARAM_NAMES]

    # return reference to current tensors
    def current(self):
        return [param.current for param in self.all()]

    def set_current_population_member(self, i):
        """Selects population member ``i`` on every Param."""
        for param in self.all():
            param.set_current_population_member(i)

    def update_nes(self, reward, reward_mean, reward_std):
        """
        Performs one NES step: standardises the reward, hands the scaled
        gradients and the means to the optimizer, then regenerates each
        Param's population about its mean.
        """
        # small constant guards against a zero standard deviation
        reward = (reward - reward_mean) / (reward_std + .00001)
        scale = self.alpha / (self.population_size * self.sigma)
        grads = []
        means = []
        for param in self.all():
            grads.append(param.get_grad(reward) * scale)
            means.append(param.mean)
        self.optimizer.update(means, grads)
        for param in self.all():
            param.gen_pop_about_mean(self.sigma)

    def mutate(self, param, i):
        """
        Returns population member ``i`` of ``param``, with normal noise of
        scale ``sigma`` added in one out of four calls.
        """
        x = param.population[i]
        if random.randint(1, 4) == 1:
            jitter = tf.random.normal(x.shape, stddev=self.sigma)
            return x + jitter
        else:
            return x

    def mate(self, param, i, j):
        """
        Picks parent ``i`` with probability 1/4 and parent ``j`` otherwise,
        and returns its (possibly mutated) value.
        """
        if random.randint(1, 4) == 1:
            return self.mutate(param, i)
        else:
            return self.mutate(param, j)

    def update_ga(self, rewards):
        """
        Genetic-algorithm step: moves the PASS_THROUGH best members to the
        front of every population and refills the rest with offspring.
        """
        # sort parameters by rewards
        top_reward_indices = rewards.argsort()[-PASS_THROUGH:]
        top_reward_indices = top_reward_indices[::-1]
        for param in self.all():
            # sort population
            # Collect the survivors before writing anything: copying slot by
            # slot in place can overwrite a member that is still needed for
            # a later slot.
            survivors = [param.population[j] for j in top_reward_indices]
            for i, survivor in enumerate(survivors):
                param.population[i] = survivor
            # generate new population
            # NOTE(review): parents are drawn from the hard-coded range 0..9,
            # which presumably assumes PASS_THROUGH == 10 -- confirm.
            for k in range(PASS_THROUGH, self.population_size):
                param.population[k] = self.mate(param, random.randint(0, 9),
                                                random.randint(0, 9))
Beispiel #16
0
def _r2_percent(y_true, y_pred):
    """Return ``r2_score`` as a percentage rounded to 2 places, or None.

    The metric is best-effort: if scoring raises (for example because the
    predictions are not finite after a diverged fit) ``None`` is returned
    instead of aborting the whole comparison.
    """
    try:
        return round(r2_score(y_true, y_pred) * 100, 2)
    except Exception:
        return None


def run_models(X,
               y,
               lrdict,
               batch_size=50,
               dont_run=None,
               epochs=500,
               epoch_loss=True):
    """Fit a ``LinearRegression`` with several optimizers and compare them.

    The data is split 80/20 (``random_state=42``). For each optimizer that
    is not excluded, a model is fitted on the training part and its final
    loss, coefficients and train/test R^2 are recorded.

    Args:
        X: feature data passed to ``train_test_split``.
        y: target data passed to ``train_test_split``.
        lrdict: mapping of optimizer name -> learning rate, assigned to the
            optimizer's ``lr`` attribute. Entries for optimizers listed in
            ``dont_run`` are ignored; unknown names raise ``KeyError``.
        batch_size: batch size used for every optimizer except 'GD' (which
            uses ``None``) and 'SGD' (which uses 1).
        dont_run: optional collection of optimizer names to skip.
        epochs: number of epochs passed to ``LinearRegression``.
        epoch_loss: forwarded to ``reg.fit``.

    Returns:
        tuple: ``(res, hist)`` where ``res`` is a DataFrame with one row per
        optimizer (columns ``opt``, ``loss``, ``train_r2``, ``test_r2`` and
        ``c1..cN`` for the coefficients) and ``hist`` is a DataFrame of the
        per-epoch loss with columns ``epoch``, ``loss`` and ``opt``. Both
        are empty DataFrames when every optimizer is excluded.
    """
    if dont_run is None:
        dont_run = ()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    opts = OrderedDict({
        'GD': GradientDescent(lr=0.01),
        'SGD': GradientDescent(lr=0.01),
        'SGDM': SGDM(lr=0.01, gamma=0.9),
        'Adam': Adam(lr=0.01),
        'Adagrad': Adagrad(lr=0.01),
        'Adadelta': Adadelta(),
        'RMSProp': RMSProp(lr=0.01),
    })

    opts = {k: v for k, v in opts.items() if k not in dont_run}

    for opt_name, lr in lrdict.items():
        # a learning rate for an optimizer that will not run is irrelevant
        if opt_name in dont_run:
            continue
        opts[opt_name].lr = lr

    result_rows = []
    history_frames = []
    for opt_name, opt in opts.items():
        print("Running Optimizer: ", opt_name)

        # Use a per-optimizer value: overwriting ``batch_size`` itself would
        # leak the GD/SGD override into every optimizer that runs later.
        if opt_name == 'GD':
            opt_batch_size = None
        elif opt_name == 'SGD':
            opt_batch_size = 1
        else:
            opt_batch_size = batch_size

        reg = LinearRegression(batch_size=opt_batch_size,
                               opt=opt,
                               epochs=epochs)
        reg.fit(X_train, y_train, epoch_loss=epoch_loss)
        losses = reg.history['loss']
        final_loss = losses[-1]
        final_betas = [i[0] for i in reg.betas]

        train_r2 = _r2_percent(y_train, reg.predict(X_train))
        test_r2 = _r2_percent(y_test, reg.predict(X_test))

        cols = ['opt', 'loss', 'train_r2', 'test_r2'
                ] + ['c' + str(i + 1) for i in range(len(final_betas))]
        vals = [opt_name, final_loss, train_r2, test_r2] + final_betas
        result_rows.append(OrderedDict(zip(cols, vals)))

        history_frames.append(
            pd.DataFrame({
                'epoch': np.arange(len(losses)),
                'loss': losses,
                'opt': [opt_name] * len(losses)
            }))

    # DataFrame.append was removed in pandas 2.0; build the frames in one go.
    res = pd.DataFrame(result_rows) if result_rows else pd.DataFrame()
    if history_frames:
        hist = pd.concat(history_frames, ignore_index=True)
    else:
        hist = pd.DataFrame()

    return res, hist