def __init__(self,
             num_params,
             mu_init=None,
             sigma_init=0.1,
             lr=10**-2,
             pop_size=256,
             antithetic=True,
             weight_decay=0,
             rank_fitness=True):
    """
    Set up the solver state.

    Parameters:
        - num_params: number of entries in the mean vector
        - mu_init: initial mean; a zero vector is used when None
        - sigma_init: value stored as `self.sigma`
        - lr: value stored as `self.learning_rate` and handed to `Adam`
        - pop_size: number of candidates per generation
        - antithetic: when truthy, `pop_size` must be even
        - weight_decay: stored as-is
        - rank_fitness: stored as-is
    """
    # bookkeeping
    self.num_params = num_params
    self.first_interation = True

    # search distribution
    self.mu = (np.zeros(self.num_params)
               if mu_init is None else np.array(mu_init))
    self.sigma = sigma_init

    # optimizer
    self.learning_rate = lr
    self.optimizer = Adam(self.learning_rate)

    # sampling configuration
    self.pop_size = pop_size
    self.antithetic = antithetic
    if self.antithetic:
        assert (self.pop_size % 2 == 0), "Population size must be even"

    self.weight_decay = weight_decay
    self.rank_fitness = rank_fitness
def __init__(
        self,
        num_params,
        sigma_init=0.10,
        sigma_alpha=0.20,
        sigma_decay=0.999,
        sigma_limit=0.01,
        sigma_max_change=0.2,
        learning_rate=0.01,
        learning_rate_decay=0.9999,
        learning_rate_limit=0.01,
        elite_ratio=0,
        pop_size=256,
        average_baseline=True,
        weight_decay=0.01,
        rank_fitness=True,
        forget_best=True):
    """
    Set up the solver state.

    Parameters:
        - num_params: length of the mean / sigma vectors
        - sigma_init: initial value of every entry of `self.sigma`
        - sigma_alpha, sigma_decay, sigma_limit, sigma_max_change:
            stored as-is (not used inside this constructor)
        - learning_rate: stored, and also handed to `Adam`
        - learning_rate_decay, learning_rate_limit: stored as-is
        - elite_ratio: fraction of the (halved) population kept as
            elite; `use_elite` is switched on when that count is > 0
        - pop_size: requested population size; must be even when
            `average_baseline` is truthy and odd otherwise
        - average_baseline: selects the parity rule above
        - weight_decay: stored as-is
        - rank_fitness: when truthy, `forget_best` is forced to True
        - forget_best: stored unless overridden by `rank_fitness`

    Note: after construction `self.pop_size` holds HALF of the
    requested population (the odd extra member is dropped first),
    while `self.batch_reward` has `2 * self.pop_size` entries.
    """
    self.num_params = num_params

    # sigma schedule
    self.sigma_init = sigma_init
    self.sigma_alpha = sigma_alpha
    self.sigma_decay = sigma_decay
    self.sigma_limit = sigma_limit
    self.sigma_max_change = sigma_max_change

    # learning-rate schedule
    self.learning_rate = learning_rate
    self.learning_rate_decay = learning_rate_decay
    self.learning_rate_limit = learning_rate_limit

    # population bookkeeping: keep only the number of mirrored pairs
    self.pop_size = pop_size
    self.average_baseline = average_baseline
    if not self.average_baseline:
        assert (self.pop_size & 1), "Population size must be odd"
        self.pop_size = int((self.pop_size - 1) / 2)
    else:
        assert (self.pop_size % 2 == 0), "Population size must be even"
        self.pop_size = int(self.pop_size / 2)

    # optional greedy selection of the next mean
    self.elite_ratio = elite_ratio
    self.elite_pop_size = int(self.pop_size * self.elite_ratio)
    self.use_elite = self.elite_pop_size > 0

    self.batch_reward = np.zeros(self.pop_size * 2)
    self.mu = np.zeros(self.num_params)
    self.sigma = np.ones(self.num_params) * self.sigma_init
    self.curr_best_mu = np.zeros(self.num_params)
    self.best_mu = np.zeros(self.num_params)
    self.best_reward = 0
    self.first_interation = True

    self.weight_decay = weight_decay
    self.rank_fitness = rank_fitness
    # ranking makes rewards incomparable across generations,
    # so the historical best is always dropped in that mode
    self.forget_best = True if self.rank_fitness else forget_best

    self.optimizer = Adam(self, learning_rate)
def __init__(
        self,
        num_params,
        mu_init=None,
        sigma_init=1,
        sigma_decay=0.999,
        sigma_limit=0.01,
        learning_rate=0.01,
        learning_rate_decay=0.9999,
        learning_rate_limit=0.001,
        pop_size=256,
        antithetic=False,
        weight_decay=0.01,
        rank_fitness=True,
        forget_best=True):
    """
    Set up the solver state.

    Parameters:
        - num_params: number of entries in the mean vector
        - mu_init: initial mean; a zero vector is used when None
        - sigma_init: stored both as `sigma` and `sigma_init`
        - sigma_decay, sigma_limit: stored as-is
        - learning_rate: stored, and also handed to `Adam`
        - learning_rate_decay, learning_rate_limit: stored as-is
        - pop_size: population size; must be even when `antithetic`
        - antithetic: stored as-is, triggers the parity check
        - weight_decay: stored as-is
        - rank_fitness: when truthy, `forget_best` is forced to True
        - forget_best: stored unless overridden by `rank_fitness`
    """
    # bookkeeping
    self.num_params = num_params
    self.first_interation = True

    # search distribution
    self.mu = (np.zeros(self.num_params)
               if mu_init is None else np.array(mu_init))
    self.sigma = sigma_init
    self.sigma_init = sigma_init
    self.sigma_decay = sigma_decay
    self.sigma_limit = sigma_limit

    # optimizer and its schedule
    self.learning_rate = learning_rate
    self.learning_rate_decay = learning_rate_decay
    self.learning_rate_limit = learning_rate_limit
    self.optimizer = Adam(self, learning_rate)

    # sampling configuration
    self.pop_size = pop_size
    self.antithetic = antithetic
    if self.antithetic:
        assert (self.pop_size % 2 == 0), "Population size must be even"

    self.weight_decay = weight_decay
    self.rank_fitness = rank_fitness
    self.forget_best = True if self.rank_fitness else forget_best
def update_thetas(self, learning_rate=0.01, optimizer=''):
    """
    Update the W matrix and biases of the layer from the stored gradients.

    The current `self.Optimizer` computes the new `W` and `b`. When batch
    normalization is active on a non-last layer, `gamma` and `beta` are
    updated in place with a plain gradient step. Gradients are NOT reset
    here.

    Parameters:
        - learning_rate: float
            Learning rate to use to update thetas
        - optimizer: str
            Which optimizer to use. Empty string keeps the optimizer the
            layer already has; 'Adam' switches the layer to Adam.
    """
    # Swap in Adam only once. Re-creating it on every call (as the old
    # local `createInstance` flag did) would discard whatever state the
    # optimizer keeps between updates.
    if optimizer == 'Adam' and not isinstance(self.Optimizer, Adam):
        self.Optimizer = Adam()

    self.W = self.Optimizer.newWeight(self.W, self.gradW, learning_rate)
    self.b = self.Optimizer.newBias(self.b, self.gradb, learning_rate)

    if (not self.last_layer) and self.batchnorm:
        self.gamma += -learning_rate * self.gradGamma
        self.beta += -learning_rate * self.gradBeta
def __init__(self,
             input_size,
             num_neurons,
             Activation=ActivationType.SIGMOID,
             last_layer=False,
             drop_out=False,
             drop_percent=0.2):
    """
    Build a layer with randomly initialised parameters.

    Parameters:
        - input_size: int
            Number of inputs feeding the layer
        - num_neurons: int
            Number of neurons in the layer
        - Activation: ActivationType
            Activation shared by every neuron of the layer
        - last_layer: bool
            Marks the layer as the output layer; no batch-normalization
            state is created for it
        - drop_out: bool
            Whether the layer supports dropout
        - drop_percent: float
            Percentage of neurons to shut down randomly during training
    """
    # configuration
    self.input_size = input_size
    self.num_neurons = num_neurons
    self.Activation = Activation
    self.last_layer = last_layer
    self.drop_out = drop_out
    self.drop_percent = drop_percent
    self.mode = 'Train'
    self.batchnorm = False

    # trainable parameters, drawn uniformly from [-0.1, 0.1)
    self.W = np.random.uniform(low=-0.1, high=0.1,
                               size=(input_size, num_neurons))
    self.b = np.random.uniform(low=-0.1, high=0.1, size=(1, num_neurons))
    self.gradW = np.zeros_like(self.W)
    self.gradb = np.zeros_like(self.b)

    # values cached between the forward and backward passes
    self.a = None  # output of the layer
    self.delta = None
    self.input_data = None
    self.dropM = None

    # the optimizer computes the new parameter values
    self.Optimizer = DefaultOptimizer()

    if last_layer:
        return

    # ---------------- batch-normalization state ---------------- #
    bn_shape = self.b.shape
    self.gamma = np.random.uniform(low=-0.1, high=0.1, size=bn_shape)
    self.beta = np.random.uniform(low=-0.1, high=0.1, size=bn_shape)
    self.gradGamma = np.zeros_like(self.gamma)
    self.gradBeta = np.zeros_like(self.beta)
    self.sihat = None
    self.sb = None
    self.variance = None
    self.mean = None
    self.deltaBN = None
    self.cumulative_mean = 0
    self.cumulative_variance = 0
    self.EPSILON = 10**(-6)
class Layer(object):
    """Fully connected layer with optional batch normalization and dropout."""

    def __init__(self,
                 input_size,
                 num_neurons,
                 Activation=ActivationType.SIGMOID,
                 last_layer=False,
                 drop_out=False,
                 drop_percent=0.2):
        """
        Constructs the layer

        Parameters:
            - input_size: int
                Input size of the layer
            - num_neurons: int
                Number of neurons in layer
            - Activation: ActivationType
                Activation type of all neurons in layer
            - last_layer: bool
                Specifies if layer is last layer. Only if last_layer = True
                can the calculate_loss method be used
            - drop_out: bool
                Whether the layer supports dropout
            - drop_percent: float
                Percentage of neurons to be randomly shut down during
                training
        """
        self.W = np.random.uniform(low=-0.1,
                                   high=0.1,
                                   size=(input_size, num_neurons))
        self.b = np.random.uniform(low=-0.1, high=0.1, size=(1, num_neurons))
        self.num_neurons = num_neurons
        self.input_size = input_size
        self.Activation = Activation
        self.a = None  # The output of the layer
        self.gradW = np.zeros(shape=self.W.shape)
        self.gradb = np.zeros(shape=self.b.shape)
        self.last_layer = last_layer
        self.delta = None
        self.input_data = None
        # The optimizer computes the new parameter values
        self.Optimizer = DefaultOptimizer()
        self.batchnorm = False
        # NOTE(review): dropM is never assigned inside this class;
        # presumably the owning network sets the mask before each
        # training forward pass - confirm against the caller.
        self.dropM = None
        self.drop_percent = drop_percent
        self.drop_out = drop_out
        self.mode = 'Train'

        # ------------------- for Batch Normalization ------------------- #
        if not last_layer:
            self.gamma = np.random.uniform(low=-0.1,
                                           high=0.1,
                                           size=self.b.shape)
            self.beta = np.random.uniform(low=-0.1,
                                          high=0.1,
                                          size=self.b.shape)
            self.sihat = None
            self.sb = None
            self.variance = None
            self.mean = None
            self.deltaBN = None
            self.gradGamma = np.zeros(shape=self.gamma.shape)
            self.gradBeta = np.zeros(shape=self.beta.shape)
            self.cumulative_mean = 0
            self.cumulative_variance = 0
            self.EPSILON = 10**(-6)

    def forward_pass(self, input_data, mode='Train', batchnorm=False):
        """
        Forward passing data.

        Parameters:
            - input_data: np.ndarray
                Data being passed to the layer. It is stored on the layer
                for use by the backward pass
            - mode: str
                'Train' if in training stage. Anything else is treated as
                test mode
            - batchnorm: bool
                Whether to perform batch normalization

        Returns:
            - self.a: np.ndarray
                Input for next layer. If self is last layer, this is the
                actual output

        Raises:
            - Exception
                When the activation type is not supported, or SOFTMAX is
                requested on a layer that is not the last layer
        """
        self.input_data = input_data
        s1 = input_data @ self.W + self.b
        self.batchnorm = batchnorm

        # ------------------- for batch normalization ------------------- #
        if (not self.last_layer) and self.batchnorm:
            if mode == 'Train':
                self.variance = np.var(s1,
                                       axis=0,
                                       keepdims=True,
                                       dtype=np.float64)
                self.mean = np.mean(s1, axis=0, keepdims=True)
                # exponential moving averages used at test time
                self.cumulative_mean = 0.9 * self.cumulative_mean + (
                    1 - 0.9) * self.mean
                self.cumulative_variance = 0.9 * self.cumulative_variance + (
                    1 - 0.9) * self.variance
            else:
                self.mean = self.cumulative_mean
                self.variance = self.cumulative_variance
            self.sihat = (s1 - self.mean) / np.sqrt(self.variance +
                                                    self.EPSILON)
            self.sb = self.gamma * self.sihat + self.beta
            s1 = self.sb

        # -------------------- Calculating activations ------------------ #
        if (self.Activation == ActivationType.SIGMOID):
            self.a = Functions.sigmoid(s1)
        elif (self.Activation == ActivationType.RELU):
            self.a = Functions.relu(s1)
        elif (self.Activation == ActivationType.TANH):
            self.a = Functions.tanh(s1)
        elif (self.Activation == ActivationType.SOFTMAX and self.last_layer):
            self.a = Functions.softmax(s1)
        else:
            raise Exception(
                "Please change boolean parameter or activation type")

        # Zeroing out some activations (dropout)
        if (not self.last_layer) and self.drop_out and (mode == 'Train'):
            self.a = self.a * self.dropM

        return self.a

    def back_propagate(self, weighted_deltas):
        """
        Back propagates weighted deltas back to network

        Parameters:
            - weighted_deltas: ndarray
                Deltas of the next layer weighted by next layer's W matrix.
                For a SOFTMAX last layer this is the expected result Y.

        Returns:
            Weighted deltas of self as np.ndarray

        Raises:
            - Exception
                When the activation type is not supported, or SOFTMAX is
                used on a layer that is not the last layer
        """
        # ----------------------- Back propagating ---------------------- #
        if (self.Activation == ActivationType.SIGMOID):
            self.delta = weighted_deltas * self.a * (1 - self.a)
        elif (self.Activation == ActivationType.RELU):
            if not self.batchnorm:
                # self.a is a ReLU output and therefore never negative, so
                # the test must be strict: `>= 0` would mark every unit as
                # active and the gradient would never be cut.
                derivative = np.where(self.a > 0, 1, 0)
            else:
                # Just for batch normalization: keep a tiny slope for
                # inactive units instead of an exact zero
                derivative = np.where(self.a > self.EPSILON, 1,
                                      self.EPSILON)
            self.delta = weighted_deltas * derivative
        elif (self.Activation == ActivationType.TANH):
            self.delta = weighted_deltas * (1 - self.a**2)
        elif (self.Activation == ActivationType.SOFTMAX and self.last_layer):
            # the weighted deltas for this case are the expected result Y
            self.delta = (self.a - weighted_deltas)
        else:
            raise Exception("Please change last_layer or activation type")

        if (not self.last_layer) and self.drop_out:
            self.delta = self.delta * self.dropM

        if self.last_layer or (not self.batchnorm):
            return (self.W @ self.delta.T).T

        # ------------ Calculating deltabn and back propagate ----------- #
        batch = self.input_data.shape[0]
        # same epsilon as the forward normalization, so both passes use
        # the same standard deviation
        denominator = batch * np.sqrt(self.variance + self.EPSILON)
        self.deltaBN = self.delta * self.gamma * (
            batch - 1 - self.sihat**2) / denominator
        return (self.W @ self.deltaBN.T).T

    def update_gradients(self, regularized=True, reg_val=0.01):
        """
        Compute the batch-averaged gradients, with L2 regularization on W
        by default. Previously stored gradients are overwritten.

        Parameters:
            - regularized: bool
                If regularization is implemented in layer
            - reg_val: float
                Regularization value
        """
        batch = self.input_data.shape[0]
        if self.last_layer or (not self.batchnorm):
            self.gradW = (self.input_data.T @ self.delta) / batch
            self.gradb = np.mean(self.delta, axis=0, keepdims=True)
        else:
            # ----------------------- for Batch Norm -------------------- #
            self.gradW = (self.input_data.T @ self.deltaBN) / batch
            self.gradb = np.mean(self.deltaBN, axis=0, keepdims=True)
            self.gradBeta = np.mean(self.delta, axis=0, keepdims=True)
            self.gradGamma = np.mean(self.sihat * self.delta,
                                     axis=0,
                                     keepdims=True)

        if regularized:
            self.gradW += reg_val * self.W

    def update_thetas(self, learning_rate=0.01, optimizer=''):
        """
        Update the W matrix and biases of the layer from the stored
        gradients. Gradients are NOT reset here.

        Parameters:
            - learning_rate: float
                Learning rate to use to update thetas
            - optimizer: str
                Which optimizer to use. Empty string keeps the optimizer
                the layer already has; 'Adam' switches the layer to Adam.
        """
        # Swap in Adam only once. Re-creating it on every call (as the old
        # local `createInstance` flag did) would discard whatever state
        # the optimizer keeps between updates.
        if optimizer == 'Adam' and not isinstance(self.Optimizer, Adam):
            self.Optimizer = Adam()

        self.W = self.Optimizer.newWeight(self.W, self.gradW, learning_rate)
        self.b = self.Optimizer.newBias(self.b, self.gradb, learning_rate)

        if (not self.last_layer) and self.batchnorm:
            self.gamma += -learning_rate * self.gradGamma
            self.beta += -learning_rate * self.gradBeta

    def calculate_loss(self, y_expected):
        """
        Calculates loss only at last layer

        Parameters:
            - y_expected: ndarray
                Expected value of the last layer

        Returns:
            - float
                Loss value

        Raises:
            - Exception
                When layer is not last layer and client wants to calculate
                loss
        """
        if not self.last_layer:
            raise Exception("Loss can only be calculated at last layer")

        return self.loss(y_expected)

    def loss(self, y_expected):
        """Sum of -y_expected * log(output) over all entries."""
        return np.sum(-y_expected * np.log(self.a))

    def __str__(self):
        """
        String representation of layer object
        """
        layer_type = "Output" if self.last_layer else "Hidden"
        funcType = ''
        if self.Activation == ActivationType.SIGMOID:
            funcType = "SIGMOID"
        elif self.Activation == ActivationType.RELU:
            funcType = "RELU"
        elif self.Activation == ActivationType.SOFTMAX:
            funcType = "SOFTMAX"
        else:
            funcType = "TANH"

        return "(Type: %s, Activation: %s, Size: %d)" % (layer_type, funcType,
                                                         self.num_neurons)
def _build_mlp(activation, hidden_nb, loss_cls, dropout=None):
    """Build the 2 -> hidden -> hidden -> hidden -> 2 network used by every experiment.

    `activation` and `loss_cls` are zero-argument callables; they are
    invoked in the same order the layers appear in the network. When
    `dropout` is given it is passed to the two hidden-to-hidden Linear
    layers only.
    """
    hidden_kwargs = {} if dropout is None else {'dropout': dropout}
    layers = [
        Linear(2, hidden_nb),
        activation(),
        Linear(hidden_nb, hidden_nb, **hidden_kwargs),
        activation(),
        Linear(hidden_nb, hidden_nb, **hidden_kwargs),
        activation(),
        Linear(hidden_nb, 2),
    ]
    return Sequential(*layers, loss=loss_cls())


def _save_loss_comparison(curves, legend, path):
    """Plot several loss curves on one figure and save it to `path`.

    `curves` is a list of (number_of_epochs, losses, plot_kwargs) tuples.
    """
    fig = plt.figure(figsize=(10, 4))
    for epochs, losses, plot_kwargs in curves:
        plt.plot(range(0, epochs), losses, **plot_kwargs)
    plt.legend(legend)
    plt.title("Loss")
    plt.xlabel("Epochs")
    plt.savefig(path, bbox_inches='tight')
    plt.close(fig)


def run_all_model(train_input,
                  train_target,
                  test_input,
                  test_target,
                  Sample_number,
                  save_plot=False):
    """Train and evaluate the eight reference models and collect their scores.

    Models 1-4 compare optimizers and dropout, models 5-7 compare
    activation functions (and Xavier initialization), model 8 swaps the
    loss for MSE. When `save_plot` is truthy, comparison figures are
    written under `output/`.

    Returns four tensors (train_loss, train_error, test_loss, test_error),
    each built from entries 0..3 of the eight `evaluate_model` results,
    ordered model 1 to model 8.
    """
    # Constants shared by the experiments
    hidden_nb = 25
    std = 0.1
    eta = 3e-1
    batch_size = 200
    epochs_number = 1000

    def fit_and_score(model, optimizer, epochs, mname):
        # Train, then evaluate (evaluate_model also produces the plots)
        losses = train_model(model, train_input, train_target, optimizer,
                             epochs, Sample_number, batch_size)
        perf = evaluate_model(model,
                              train_input,
                              train_target,
                              test_input,
                              test_target,
                              losses,
                              save_plot,
                              mname=mname)
        return losses, perf

    # Model 1. No dropout; constant learning rate (SGD)
    print('\nModel 1: Optimizer: SGD; No dropout; ReLU; CrossEntropy')
    model_1 = _build_mlp(Relu, hidden_nb, CrossEntropy)
    model_1.normalize_parameters(mean=0, std=std)
    my_loss_1, model_1_perf = fit_and_score(model_1, Sgd(eta), epochs_number,
                                            'Model1')

    # Model 2. No dropout; decreasing learning rate (DecreaseSGD)
    print('\nModel 2: Optimizer: DecreaseSGD; No dropout; ReLU; CrossEntropy')
    model_2 = _build_mlp(Relu, hidden_nb, CrossEntropy)
    model_2.normalize_parameters(mean=0, std=std)
    my_loss_2, model_2_perf = fit_and_score(model_2, DecreaseSGD(eta),
                                            epochs_number, 'Model2')

    # Model 3. No dropout; Adam Optimizer (custom hyperparameters)
    print('\nModel 3: Optimizer: Adam; No dropout; ReLU; CrossEntropy')
    eta_adam = 1e-3
    epochs_number_adam = 500
    model_3 = _build_mlp(Relu, hidden_nb, CrossEntropy)
    model_3.normalize_parameters(mean=0, std=std)
    my_loss_3, model_3_perf = fit_and_score(
        model_3, Adam(eta_adam, 0.9, 0.99, 1e-8), epochs_number_adam,
        'Model3')

    # PLOT TO COMPARE OPTIMIZERS
    if save_plot:
        _save_loss_comparison(
            [(epochs_number, my_loss_1, {'linewidth': 1}),
             (epochs_number, my_loss_2, {'linewidth': 1}),
             (epochs_number_adam, my_loss_3, {'linewidth': 1})],
            ["SGD", "Decreasing SGD", "Adam"],
            'output/compare_optimizers.pdf')

    # Model 4. Dropout; SGD
    print('\nModel 4: Optimizer: SGD; Dropout; ReLU; CrossEntropy')
    dropout = 0.15
    model_4 = _build_mlp(Relu, hidden_nb, CrossEntropy, dropout=dropout)
    model_4.normalize_parameters(mean=0, std=std)
    my_loss_4, model_4_perf = fit_and_score(model_4, Sgd(eta), epochs_number,
                                            'Model4')

    # PLOT TO COMPARE DROPOUT AND NO DROPOUT
    if save_plot:
        _save_loss_comparison(
            [(epochs_number, my_loss_1, {'linewidth': 1}),
             (epochs_number, my_loss_4, {'linewidth': 1})],
            ["Without Dropout", "With Dropout"],
            'output/compare_dropout.pdf')

    print('\nEvaluation of different activation functions\n')

    # Model 5. No Dropout; SGD; Tanh
    print('\nModel 5: Optimizer: SGD; No dropout; Tanh; CrossEntropy')
    model_5 = _build_mlp(Tanh, hidden_nb, CrossEntropy)
    model_5.normalize_parameters(mean=0, std=std)
    my_loss_5, model_5_perf = fit_and_score(model_5, Sgd(eta), epochs_number,
                                            'Model5')

    # Model 6. Xavier Initialization
    print(
        '\nModel 6: Optimizer: SGD; No dropout; Tanh; Xavier initialization; CrossEntropy'
    )
    model_6 = _build_mlp(Tanh, hidden_nb, CrossEntropy)
    model_6.xavier_parameters()
    # NOTE(review): Sgd() is built without `eta` here, so this model runs
    # with the optimizer's own default step size - confirm this is
    # intended.
    my_loss_6, model_6_perf = fit_and_score(model_6, Sgd(), epochs_number,
                                            'Model6')

    # Model 7. Sigmoid
    print('\nModel 7: Optimizer: SGD; No dropout; Sigmoid; CrossEntropy')
    p_lambda = 0.1  # parameter of the sigmoid activation
    model_7 = _build_mlp(lambda: Sigmoid(p_lambda), hidden_nb, CrossEntropy)
    model_7.normalize_parameters(mean=0.5, std=1)
    my_loss_7, model_7_perf = fit_and_score(model_7, Sgd(eta=0.5),
                                            epochs_number, 'Model7')

    # PLOT TO COMPARE EFFECT OF DIFFERENT ACTIVATIONS
    if save_plot:
        _save_loss_comparison(
            [(epochs_number, my_loss_1, {'linewidth': 0.5}),
             (epochs_number, my_loss_5, {'linewidth': 0.5, 'alpha': 0.8}),
             (epochs_number, my_loss_6, {'linewidth': 0.5, 'alpha': 0.8}),
             (epochs_number, my_loss_7, {'linewidth': 0.5})],
            ["Relu", "Tanh", "Tanh (Xavier)", "Sigmoid"],
            'output/compare_activations.pdf')

    print('\nEvaluation of base model with MSE loss\n')

    # Model 8. MSE loss
    print('\nModel 8: Optimizer: SGD; No dropout; Relu; MSE')
    model_8 = _build_mlp(Relu, hidden_nb, LossMSE)
    model_8.normalize_parameters(mean=0, std=std)
    my_loss_8, model_8_perf = fit_and_score(model_8, Sgd(eta), epochs_number,
                                            'Model8')

    print('Evaluation done! ')

    all_perf = [
        model_1_perf, model_2_perf, model_3_perf, model_4_perf, model_5_perf,
        model_6_perf, model_7_perf, model_8_perf
    ]
    train_loss = torch.tensor([perf[0] for perf in all_perf])
    train_error = torch.tensor([perf[1] for perf in all_perf])
    test_loss = torch.tensor([perf[2] for perf in all_perf])
    test_error = torch.tensor([perf[3] for perf in all_perf])

    return train_loss, train_error, test_loss, test_error
# # for color, dim in zip(("blue", "green", "red"), range(X_train.shape[1])): # plt.scatter(X_train[:, dim], y_train, marker="^", color=color) # plt.show() # from Models.NeuralNetworks import Layers # from Models.NeuralNetworks import Sequential # # model = Sequential() # model.add(Layers.Dense(5, "Relu")) # model.add(Layers.Dense(10, "Relu")) # model.build("MeanSquaredError", "Adam") # model.call(X_train, y_train) # print(model.layers[1].inputs.shape) # print(model.layers[0].inputs.shape) # print(model.layers[0].outputs.shape) from Models.LinearModels import LogisticRegression from Optimizers import Adam from Losses import MAE model = LogisticRegression(2000, Adam(), MAE()) model(X_train, y_train) predictions = model.inference(X_test) print(MAE()(y_test, predictions)) # # predictions = model.inference(X_test) # for index in range(len(y_test)): # print(y_test[index], predictions[index])
class VES:
    """
    Basic Version of OpenAI Evolution Strategies
    """

    def __init__(self,
                 num_params,
                 mu_init=None,
                 sigma_init=0.1,
                 lr=10**-2,
                 pop_size=256,
                 antithetic=True,
                 weight_decay=0,
                 rank_fitness=True):
        """
        Parameters:
            - num_params: number of entries in the mean vector
            - mu_init: initial mean; a zero vector is used when None
            - sigma_init: standard deviation of the sampling noise
            - lr: learning rate handed to `Adam`
            - pop_size: number of candidates per generation; must be
                even when `antithetic` is truthy
            - antithetic: sample mirrored noise pairs
            - weight_decay: coefficient passed to `compute_weight_decay`;
                the penalty is applied only when it is > 0
            - rank_fitness: replace scores by centered ranks in `tell`
        """
        # misc
        self.num_params = num_params
        self.first_interation = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            # force a float copy: `tell` updates the mean in place, which
            # raises a casting error on an integer array
            self.mu = np.array(mu_init, dtype=float)
        self.sigma = sigma_init

        # optimization stuff
        self.learning_rate = lr
        self.optimizer = Adam(self.learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"

        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness

    def ask(self):
        """
        Returns an array of candidate parameters, one row per candidate.
        With antithetic sampling the second half of the rows mirrors the
        noise of the first half.
        """
        if self.antithetic:
            epsilon_half = np.random.randn(self.pop_size // 2,
                                           self.num_params)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])
        else:
            epsilon = np.random.randn(self.pop_size, self.num_params)

        return self.mu + epsilon * self.sigma

    def tell(self, scores, solutions):
        """
        Updates the distribution mean from the scores of `solutions`.

        Parameters:
            - scores: one score per candidate (length `pop_size`)
            - solutions: the candidates the scores belong to, as
                returned by `ask`
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        # float copy so the weight-decay term can be added in place even
        # when the scores are integers
        reward = np.array(scores, dtype=float)
        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # recover the noise that generated each candidate
        epsilon = (solutions - self.mu) / self.sigma
        grad = -1 / (self.sigma * self.pop_size) * np.dot(reward, epsilon)

        # optimization step
        step = self.optimizer.step(grad)
        self.mu += step

    def get_distrib_params(self):
        """
        Returns copies of the parameters of the distribution: the mean
        and the variance (`sigma**2`).
        """
        return np.copy(self.mu), np.copy(self.sigma**2)
class GES:
    """
    Guided Evolution Strategies
    """

    def __init__(self,
                 num_params,
                 mu_init=None,
                 sigma_init=0.1,
                 lr=10**-2,
                 alpha=0.5,
                 beta=2,
                 k=1,
                 pop_size=256,
                 antithetic=True,
                 weight_decay=0,
                 rank_fitness=False):
        """
        Parameters:
            - num_params: number of entries in the mean vector
            - mu_init: initial mean; a zero vector is used when None
            - sigma_init: standard deviation of the sampling noise
            - lr: learning rate handed to `Adam`
            - alpha: weight of the isotropic noise component; the
                guiding-subspace component gets `1 - alpha`
            - beta: scale applied to the gradient estimate in `tell`
            - k: number of columns of the guiding subspace `U`
            - pop_size: number of candidates per generation; must be
                even when `antithetic` is truthy
            - antithetic: sample mirrored noise pairs
            - weight_decay: coefficient passed to `compute_weight_decay`;
                the penalty is applied only when it is > 0
            - rank_fitness: replace scores by centered ranks in `tell`
        """
        # misc
        self.num_params = num_params
        self.first_interation = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            # force a float copy: `tell` updates the mean in place, which
            # raises a casting error on an integer array
            self.mu = np.array(mu_init, dtype=float)
        self.sigma = sigma_init
        self.U = np.ones((self.num_params, k))

        # optimization stuff
        self.alpha = alpha
        self.beta = beta
        self.k = k
        self.learning_rate = lr
        self.optimizer = Adam(self.learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"

        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness

    def _sample_noise(self, count):
        """Draw `count` noise rows: isotropic part plus guiding-subspace part."""
        full_space = np.sqrt(self.alpha / self.num_params) * \
            np.random.randn(count, self.num_params)
        # the subspace coefficients live in R^k and are mapped back to
        # parameter space through U
        subspace = np.sqrt((1 - self.alpha) / self.k) * \
            np.random.randn(count, self.k) @ self.U.T
        return full_space + subspace

    def ask(self):
        """
        Returns an array of candidate parameters, one row per candidate.
        With antithetic sampling the second half of the rows mirrors the
        noise of the first half.
        """
        if self.antithetic:
            epsilon_half = self._sample_noise(self.pop_size // 2)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])
        else:
            # The previous version drew (pop_size, num_params) subspace
            # coefficients here, which cannot be multiplied by U.T unless
            # k == num_params, and dropped the 1/k scaling used by the
            # antithetic branch.
            epsilon = self._sample_noise(self.pop_size)

        return self.mu + epsilon * self.sigma

    def tell(self, scores, solutions):
        """
        Updates the distribution mean from the scores of `solutions`.

        Parameters:
            - scores: one score per candidate (length `pop_size`)
            - solutions: the candidates the scores belong to, as
                returned by `ask`
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        # float copy so the weight-decay term can be added in place even
        # when the scores are integers
        reward = np.array(scores, dtype=float)
        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # recover the noise that generated each candidate
        epsilon = (solutions - self.mu) / self.sigma
        grad = -self.beta / (self.sigma * self.pop_size) * \
            np.dot(reward, epsilon)

        # optimization step
        step = self.optimizer.step(grad)
        self.mu += step

    def add(self, params, grads, fitness):
        """
        Adds new "gradient" to U: the normalized `grads` replaces the last
        column of the guiding subspace.

        Parameters:
            - params: when not None, becomes the new mean (stored by
                reference, not copied)
            - grads: gradient vector of length `num_params`
            - fitness: unused, kept for interface compatibility
        """
        if params is not None:
            self.mu = params

        grads = np.asarray(grads, dtype=float)
        norm = np.linalg.norm(grads)
        # A zero (or NaN) gradient carries no direction; normalizing it
        # would fill U with NaN and poison every later sample, so U is
        # left untouched in that case.
        if norm > 0:
            self.U[:, -1] = grads / norm

    def get_distrib_params(self):
        """
        Returns copies of the parameters of the distribution: the mean
        and the variance (`sigma**2`).
        """
        return np.copy(self.mu), np.copy(self.sigma**2)
class OpenES:
    """
    Basic Version of OpenAI Evolution Strategies
    """

    def __init__(
            self,
            num_params,  # number of model parameters
            mu_init=None,  # initial mean
            sigma_init=1,  # initial standard deviation
            sigma_decay=0.999,  # anneal standard deviation
            sigma_limit=0.01,  # stop annealing if less than
            learning_rate=0.01,  # step size given to the optimizer
            learning_rate_decay=0.9999,  # annealing the learning rate
            learning_rate_limit=0.001,  # stop annealing learning rate
            pop_size=256,  # population size
            antithetic=False,  # whether to use anti sampling
            weight_decay=0.01,  # weight decay coefficient
            rank_fitness=True,  # use rank rather than fitness
            forget_best=True):  # forget historical best
        """
        Set up the solver state. `pop_size` must be even when
        `antithetic` is truthy; `rank_fitness` forces `forget_best`.
        """
        # misc
        self.num_params = num_params
        self.first_interation = True

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            # float copy, so in-place updates of the mean cannot hit an
            # integer-casting error
            self.mu = np.array(mu_init, dtype=float)
        self.sigma_decay = sigma_decay
        self.sigma = sigma_init
        self.sigma_init = sigma_init
        self.sigma_limit = sigma_limit

        # optimization stuff
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.optimizer = Adam(self, learning_rate)

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic
        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"

        self.forget_best = forget_best
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True

        # best-so-far bookkeeping; defined up front so `result()` works
        # before the first call to `tell`
        self.curr_best_mu = np.zeros(self.num_params)
        self.curr_best_reward = 0
        self.best_mu = np.zeros(self.num_params)
        self.best_reward = 0

    def ask(self, pop_size=None):
        """
        Returns an array of candidate parameters, one row per candidate.

        Parameters:
            - pop_size: number of candidates to draw in the
                non-antithetic case; defaults to `self.pop_size`.
                Antithetic sampling always draws `self.pop_size`
                candidates (that is the size whose parity was checked).
        """
        if pop_size is None:
            pop_size = self.pop_size

        if self.antithetic:
            epsilon_half = np.random.randn(self.pop_size // 2,
                                           self.num_params)
            epsilon = np.concatenate([epsilon_half, -epsilon_half])
        else:
            epsilon = np.random.randn(pop_size, self.num_params)

        return self.mu.reshape(1, self.num_params) + epsilon * self.sigma

    def tell(self, solutions, scores):
        """
        Updates the distribution and the best-so-far bookkeeping, then
        anneals sigma and the learning rate.

        Parameters:
            - solutions: the candidates that were evaluated
            - scores: one score per candidate (length `pop_size`)
        """
        assert (len(scores) == self.pop_size
                ), "Inconsistent reward_table size reported."

        # float copy so the weight-decay term can be added in place even
        # when the scores are integers
        reward = np.array(scores, dtype=float)
        if self.rank_fitness:
            reward = compute_centered_ranks(reward)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay, solutions)
            reward += l2_decay

        # recover the noise that generated each candidate
        epsilon = (solutions -
                   self.mu.reshape(1, self.num_params)) / self.sigma

        # standardize the rewards; identical rewards carry no signal, so
        # they map to zeros instead of dividing by a zero deviation
        centered = reward - np.mean(reward)
        spread = np.std(reward)
        if spread > 0:
            normalized_reward = centered / spread
        else:
            normalized_reward = np.zeros_like(centered)
        change_mu = 1. / (self.pop_size * self.sigma) * \
            np.dot(epsilon.T, normalized_reward)

        # updating stuff
        idx = np.argsort(reward)[::-1]
        best_reward = reward[idx[0]]
        best_mu = solutions[idx[0]]

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_interation:
            self.first_interation = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward >
                                    self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # optimization step
        self.optimizer.stepsize = self.learning_rate
        self.optimizer.update(-change_mu)

        # anneal sigma and the learning rate until they reach their limits
        if (self.sigma > self.sigma_limit):
            self.sigma *= self.sigma_decay

        if (self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def get_distrib_params(self):
        """
        Returns the parameters of the distribution: the mean and sigma
        """
        return self.mu, self.sigma

    def result(self):
        """
        Returns best params so far, best score, current score and sigma
        """
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)

    def rms_stdev(self):
        """Returns the mean of sqrt(sigma * sigma)."""
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))
class PEPG:
    """Extension of PEPG with bells and whistles.

    Samples mirrored perturbations around a mean ``mu`` using a per-parameter
    standard deviation ``sigma``. ``ask`` returns the candidates, ``tell``
    consumes their scores and updates ``mu`` (through ``self.optimizer`` or
    by averaging the elite perturbations) and ``sigma``.

    After construction ``self.pop_size`` holds the number of *mirrored
    pairs*, not the number of candidates: ``ask`` returns
    ``2 * self.pop_size`` rows when ``average_baseline`` is true and
    ``2 * self.pop_size + 1`` rows (``mu`` itself first) otherwise.
    """

    def __init__(
            self,
            num_params,  # number of model parameters
            sigma_init=0.10,  # initial standard deviation
            sigma_alpha=0.20,  # learning rate for std
            sigma_decay=0.999,  # anneal standard deviation
            sigma_limit=0.01,  # stop annealing if less than
            sigma_max_change=0.2,  # clips adaptive sigma to 20%
            learning_rate=0.01,  # step size handed to the optimizer
            learning_rate_decay=0.9999,  # annealing the learning rate
            learning_rate_limit=0.01,  # stop annealing learning rate
            elite_ratio=0,  # if >0 then ignore learning_rate
            pop_size=256,  # population size
            average_baseline=True,  # set baseline to average
            weight_decay=0.01,  # weight decay coefficient
            rank_fitness=True,  # use rank rather than fitness
            forget_best=True):  # don't keep the hist best sol
        self.num_params = num_params
        self.sigma_init = sigma_init
        self.sigma_alpha = sigma_alpha
        self.sigma_decay = sigma_decay
        self.sigma_limit = sigma_limit
        self.sigma_max_change = sigma_max_change
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.learning_rate_limit = learning_rate_limit
        self.pop_size = pop_size
        self.average_baseline = average_baseline
        # From here on self.pop_size is the number of mirrored pairs.
        if self.average_baseline:
            assert (self.pop_size % 2 == 0), "Population size must be even"
            self.pop_size = int(self.pop_size / 2)
        else:
            # One extra candidate (mu itself) serves as the baseline.
            assert (self.pop_size & 1), "Population size must be odd"
            self.pop_size = int((self.pop_size - 1) / 2)

        # option to use greedy es method to select next mu,
        # rather than using drift param
        self.elite_ratio = elite_ratio
        self.elite_pop_size = int(self.pop_size * self.elite_ratio)
        self.use_elite = False
        if self.elite_pop_size > 0:
            self.use_elite = True

        self.forget_best = forget_best
        self.batch_reward = np.zeros(self.pop_size * 2)
        self.mu = np.zeros(self.num_params)
        self.sigma = np.ones(self.num_params) * self.sigma_init
        self.curr_best_mu = np.zeros(self.num_params)
        self.best_mu = np.zeros(self.num_params)
        self.best_reward = 0
        # Attribute name (including its spelling) is kept as-is because it
        # is part of the object's visible state.
        self.first_interation = True
        self.weight_decay = weight_decay
        self.rank_fitness = rank_fitness
        if self.rank_fitness:
            self.forget_best = True  # always forget the best one if we rank
        # choose optimizer
        self.optimizer = Adam(self, learning_rate)

    def rms_stdev(self):
        """Return the mean of ``sqrt(sigma * sigma)``, i.e. mean ``|sigma|``."""
        sigma = self.sigma
        return np.mean(np.sqrt(sigma * sigma))

    def ask(self):
        """Return an array of candidate parameter vectors, one per row.

        The sampled perturbations are stored on the instance
        (``epsilon``, ``epsilon_full``, ``solutions``) because ``tell``
        needs them.
        """
        # antithetic sampling
        self.epsilon = np.random.randn(self.pop_size, self.num_params)
        self.epsilon *= self.sigma.reshape(1, self.num_params)
        self.epsilon_full = np.concatenate([self.epsilon, -self.epsilon])
        if self.average_baseline:
            epsilon = self.epsilon_full
        else:
            # first population is mu, then positive epsilon,
            # then negative epsilon
            epsilon = np.concatenate(
                [np.zeros((1, self.num_params)), self.epsilon_full])
        solutions = self.mu.reshape(1, self.num_params) + epsilon
        self.solutions = solutions
        return solutions

    def tell(self, scores):
        """Update ``mu`` and ``sigma`` from the scores of the last ``ask``.

        ``scores[i]`` must be the score of row ``i`` returned by ``ask``,
        so its length has to equal the number of rows ``ask`` produced.

        Raises:
            AssertionError: if ``scores`` has the wrong length.
        """
        # self.pop_size counts mirrored pairs, so the expected number of
        # scores is twice that, plus one for the baseline candidate when
        # the average baseline is not used.
        expected_size = 2 * self.pop_size
        if not self.average_baseline:
            expected_size += 1
        assert (len(scores) == expected_size
                ), "Inconsistent reward_table size reported."

        # Force a float array: with integer scores the in-place addition of
        # the (float) weight-decay term below would raise a casting error.
        reward_table = np.array(scores, dtype=float)

        if self.rank_fitness:
            reward_table = compute_centered_ranks(reward_table)

        if self.weight_decay > 0:
            l2_decay = compute_weight_decay(self.weight_decay,
                                            self.solutions)
            reward_table += l2_decay

        reward_offset = 1
        if self.average_baseline:
            b = np.mean(reward_table)
            reward_offset = 0
        else:
            b = reward_table[0]  # baseline

        # Rewards of the perturbed candidates only (baseline row dropped).
        reward = reward_table[reward_offset:]
        if self.use_elite:
            idx = np.argsort(reward)[::-1][0:self.elite_pop_size]
        else:
            idx = np.argsort(reward)[::-1]

        best_reward = reward[idx[0]]
        if (best_reward > b or self.average_baseline):
            best_mu = self.mu + self.epsilon_full[idx[0]]
        else:
            # The unperturbed mean beat every perturbed candidate.
            best_mu = self.mu
            best_reward = b

        self.curr_best_reward = best_reward
        self.curr_best_mu = best_mu

        if self.first_interation:
            self.sigma = np.ones(self.num_params) * self.sigma_init
            self.first_interation = False
            self.best_reward = self.curr_best_reward
            self.best_mu = best_mu
        else:
            if self.forget_best or (self.curr_best_reward >
                                    self.best_reward):
                self.best_mu = best_mu
                self.best_reward = self.curr_best_reward

        # short hand
        epsilon = self.epsilon
        sigma = self.sigma

        # update the mean
        if self.use_elite:
            # move mean to the average of the best idx means
            self.mu += self.epsilon_full[idx].mean(axis=0)
        else:
            # Reward difference of each mirrored pair weights its epsilon.
            rT = (reward[:self.pop_size] - reward[self.pop_size:])
            change_mu = np.dot(rT, epsilon)
            self.optimizer.stepsize = self.learning_rate
            # adam, rmsprop, momentum, etc.
            self.optimizer.update(-change_mu)

        # adaptive sigma
        if (self.sigma_alpha > 0):
            # normalization
            stdev_reward = 1.0
            if not self.rank_fitness:
                stdev_reward = reward.std()
                if stdev_reward == 0:
                    # All rewards identical: fall back to no normalization
                    # instead of dividing by zero.
                    stdev_reward = 1.0
            S = epsilon * epsilon - (sigma * sigma).reshape(
                1, self.num_params)
            S /= sigma.reshape(1, self.num_params)
            reward_avg = (reward[:self.pop_size] +
                          reward[self.pop_size:]) / 2.0
            rS = reward_avg - b
            delta_sigma = (np.dot(rS, S)) / \
                (2 * self.pop_size * stdev_reward)

            # adjust sigma according to the adaptive sigma calculation;
            # for stability, don't let sigma move by more than the
            # fraction sigma_max_change of its current value
            change_sigma = self.sigma_alpha * delta_sigma
            change_sigma = np.minimum(change_sigma,
                                      self.sigma_max_change * self.sigma)
            change_sigma = np.maximum(change_sigma,
                                      -self.sigma_max_change * self.sigma)
            self.sigma += change_sigma

        if (self.sigma_decay < 1):
            self.sigma[self.sigma > self.sigma_limit] *= self.sigma_decay

        if (self.learning_rate_decay < 1 and
                self.learning_rate > self.learning_rate_limit):
            self.learning_rate *= self.learning_rate_decay

    def current_param(self):
        """Return the best candidate of the most recent ``tell``."""
        return self.curr_best_mu

    def set_mu(self, mu):
        """Replace the mean of the search distribution with a copy of ``mu``."""
        self.mu = np.array(mu)

    def best_param(self):
        """Return the stored best candidate (see ``forget_best``)."""
        return self.best_mu

    def result(self):
        """Return ``(best_mu, best_reward, curr_best_reward, sigma)``."""
        return (self.best_mu, self.best_reward, self.curr_best_reward,
                self.sigma)
from Layers import Dense_layer, Conv_layer, Pooling_layer, Dropout_layer
from Activation_Functions import ReLU, Softmax
# NOTE(review): ``Softmax`` is imported from Activation_Functions above and
# again from Loss_functions here; the later import wins, so ``Softmax()``
# below refers to the Loss_functions one -- confirm that is intended.
from Loss_functions import Loss, Softmax, CategoricalCrossentropy, Act_Softmax_Loss_CCentropy
from Optimizers import SGD, Adam
from Model import Model, Accuracy, Accuracy_Categorical

# Example model: 2 inputs -> 512 hidden units (ReLU, dropout) -> 3 outputs
# (softmax), trained with categorical cross-entropy and Adam.
hugh = Model()
hugh.add(
    Dense_layer(2, 512, weight_regularizer_l2=0.0004, bias_regularizer_l2=0.0004))
hugh.add(ReLU())
hugh.add(Dropout_layer(0.2))
hugh.add(Dense_layer(512, 3))
hugh.add(Softmax())

# Attach loss, optimizer and accuracy metric, then finalize the model.
hugh.setters(loss=CategoricalCrossentropy(),
             optimizer=Adam(learning_rate=0.05, decay=0.00005),
             accuracy=Accuracy_Categorical())
hugh.finalize()

# X_train / y_train / X_val / y_val are not defined in this snippet; they
# must already exist when it runs.
hugh.train(X_train, y_train, validation_data=(X_val, y_val), epochs=1000, print_every=100)
def __init__(self, population_size=1, sigma=0, alpha=0, filename=''):
    """Create every model parameter, loaded from disk or freshly initialized.

    Args:
        population_size: forwarded to every ``Param``.
        sigma: forwarded to every ``Param`` and stored on the instance.
        alpha: stored on the instance.
        filename: when non-empty, path of a NumPy archive whose entries
            ``arr_0`` .. ``arr_27`` hold the initial parameter values;
            when empty, parameters get constant or random initial values.
    """
    self.population_size = population_size
    self.sigma = sigma
    self.alpha = alpha
    self.optimizer = Adam()

    def wrap(initial):
        # Every parameter shares the same population settings.
        return Param(initial, population_size, sigma)

    if filename:
        # Attribute names in archive order: entry ``arr_<i>`` belongs to
        # the i-th name.
        names = (
            'F1', 'F2', 'F3', 'F4', 'F5', 'F6',
            'g3', 'b3', 'g4', 'b4', 'g5', 'b5', 'g6', 'b6',
            'Wx0', 'bx0', 'Wx1', 'bx1', 'Wx2', 'bx2',
            'Wv', 'bv',
            'lg0', 'lb0', 'lg1', 'lb1', 'lg2', 'lb2',
        )
        # The archive keeps a file handle open; the context manager closes
        # it once every array has been read.
        with np.load(filename) as npz:
            for index, name in enumerate(names):
                setattr(self, name, wrap(npz['arr_%d' % index]))
    else:
        def conv_filter(in_depth, out_depth):
            # filter weight is whdo
            # w = width
            # h = height
            # d = depth (in channels)
            # o = out depth (out channels)?
            return wrap(
                tf.random.normal([F_size, F_size, in_depth, out_depth],
                                 stddev=m.sqrt(2 / F_size)))

        # The creation order below matches the original statement order so
        # random draws happen in the same sequence.
        self.F1 = conv_filter(3, NF1_out)
        self.F2 = conv_filter(NF1_out, NF2_out)
        self.F3 = conv_filter(NF2_out, NF3_out)
        self.g3 = wrap(tf.ones((NF3_out, 1)))
        self.b3 = wrap(tf.zeros((NF3_out, 1)))
        self.F4 = conv_filter(NF3_out, NF4_out)
        self.g4 = wrap(tf.ones((NF4_out, 1)))
        self.b4 = wrap(tf.zeros((NF4_out, 1)))
        self.F5 = conv_filter(NF4_out, NF5_out)
        self.g5 = wrap(tf.ones((NF5_out, 1)))
        self.b5 = wrap(tf.zeros((NF5_out, 1)))
        self.F6 = conv_filter(NF5_out, NF6_out)
        self.g6 = wrap(tf.ones((NF6_out, 1)))
        self.b6 = wrap(tf.zeros((NF6_out, 1)))

        self.lg0 = wrap(tf.ones((H_size, 1)))
        self.lb0 = wrap(tf.zeros((H_size, 1)))
        self.lg1 = wrap(tf.ones((H_size, 1)))
        self.lb1 = wrap(tf.zeros((H_size, 1)))
        self.lg2 = wrap(tf.ones((H_size, 1)))
        self.lb2 = wrap(tf.zeros((H_size, 1)))

        self.Wx0 = wrap(tf.random.normal([H_size * 4, z_size]))
        self.bx0 = wrap(tf.zeros([H_size * 4, 1]))
        self.Wx1 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
        self.bx1 = wrap(tf.zeros([H_size * 4, 1]))
        self.Wx2 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
        self.bx2 = wrap(tf.zeros([H_size * 4, 1]))

        self.Wv = wrap(tf.random.normal([Y_size, H_size]))
        self.bv = wrap(tf.zeros([Y_size, 1]))
class Parameters:
    """Container for all model parameters and their populations.

    Each parameter is a ``Param`` (defined elsewhere) built from an initial
    value, the population size and sigma. The class offers two update
    schemes over those populations: ``update_nes`` (gradient estimate
    passed to ``self.optimizer``) and ``update_ga`` (keep the top
    ``PASS_THROUGH`` members, refill the rest by ``mate``/``mutate``).
    """

    # Attribute names in the order used by ``all()``; entry ``arr_<i>`` of
    # a saved archive belongs to the i-th name.
    _PARAM_NAMES = (
        'F1', 'F2', 'F3', 'F4', 'F5', 'F6',
        'g3', 'b3', 'g4', 'b4', 'g5', 'b5', 'g6', 'b6',
        'Wx0', 'bx0', 'Wx1', 'bx1', 'Wx2', 'bx2',
        'Wv', 'bv',
        'lg0', 'lb0', 'lg1', 'lb1', 'lg2', 'lb2',
    )

    def __init__(self, population_size=1, sigma=0, alpha=0, filename=''):
        """Create every parameter, loaded from disk or freshly initialized.

        Args:
            population_size: forwarded to every ``Param``.
            sigma: forwarded to every ``Param`` and stored on the instance.
            alpha: stored on the instance; scales the gradient in
                ``update_nes``.
            filename: when non-empty, path of a NumPy archive whose entries
                ``arr_0`` .. ``arr_27`` hold the initial parameter values;
                when empty, parameters get constant or random initial
                values.
        """
        self.population_size = population_size
        self.sigma = sigma
        self.alpha = alpha
        self.optimizer = Adam()

        def wrap(initial):
            # Every parameter shares the same population settings.
            return Param(initial, population_size, sigma)

        if filename:
            # The archive keeps a file handle open; the context manager
            # closes it once every array has been read.
            with np.load(filename) as npz:
                for index, name in enumerate(self._PARAM_NAMES):
                    setattr(self, name, wrap(npz['arr_%d' % index]))
        else:
            def conv_filter(in_depth, out_depth):
                # filter weight is whdo
                # w = width
                # h = height
                # d = depth (in channels)
                # o = out depth (out channels)?
                return wrap(
                    tf.random.normal([F_size, F_size, in_depth, out_depth],
                                     stddev=m.sqrt(2 / F_size)))

            # The creation order below matches the original statement order
            # so random draws happen in the same sequence.
            self.F1 = conv_filter(3, NF1_out)
            self.F2 = conv_filter(NF1_out, NF2_out)
            self.F3 = conv_filter(NF2_out, NF3_out)
            self.g3 = wrap(tf.ones((NF3_out, 1)))
            self.b3 = wrap(tf.zeros((NF3_out, 1)))
            self.F4 = conv_filter(NF3_out, NF4_out)
            self.g4 = wrap(tf.ones((NF4_out, 1)))
            self.b4 = wrap(tf.zeros((NF4_out, 1)))
            self.F5 = conv_filter(NF4_out, NF5_out)
            self.g5 = wrap(tf.ones((NF5_out, 1)))
            self.b5 = wrap(tf.zeros((NF5_out, 1)))
            self.F6 = conv_filter(NF5_out, NF6_out)
            self.g6 = wrap(tf.ones((NF6_out, 1)))
            self.b6 = wrap(tf.zeros((NF6_out, 1)))

            self.lg0 = wrap(tf.ones((H_size, 1)))
            self.lb0 = wrap(tf.zeros((H_size, 1)))
            self.lg1 = wrap(tf.ones((H_size, 1)))
            self.lb1 = wrap(tf.zeros((H_size, 1)))
            self.lg2 = wrap(tf.ones((H_size, 1)))
            self.lb2 = wrap(tf.zeros((H_size, 1)))

            self.Wx0 = wrap(tf.random.normal([H_size * 4, z_size]))
            self.bx0 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wx1 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
            self.bx1 = wrap(tf.zeros([H_size * 4, 1]))
            self.Wx2 = wrap(tf.random.normal([H_size * 4, H_size * 2]))
            self.bx2 = wrap(tf.zeros([H_size * 4, 1]))

            self.Wv = wrap(tf.random.normal([Y_size, H_size]))
            self.bv = wrap(tf.zeros([Y_size, 1]))

    def all(self):
        """Return every ``Param`` in a fixed order (see ``_PARAM_NAMES``)."""
        return [getattr(self, name) for name in self._PARAM_NAMES]

    # return reference to current tensors
    def current(self):
        """Return the ``current`` attribute of every parameter, in order."""
        return [param.current for param in self.all()]

    def set_current_population_member(self, i):
        """Forward ``set_current_population_member(i)`` to every parameter."""
        for param in self.all():
            param.set_current_population_member(i)

    def update_nes(self, reward, reward_mean, reward_std):
        """Apply one optimizer step from standardized rewards.

        The rewards are standardized with the supplied mean and standard
        deviation (a small constant avoids division by zero), each
        parameter's ``get_grad`` result is scaled by
        ``alpha / (population_size * sigma)``, the optimizer is called with
        the parameter means and those gradients, and every parameter then
        regenerates its population around its mean.

        NOTE(review): ``self.sigma`` defaults to 0, in which case the
        scaling factor divides by zero -- callers must construct the object
        with a non-zero sigma before using this method.
        """
        reward = (reward - reward_mean) / (reward_std + .00001)
        scale = self.alpha / (self.population_size * self.sigma)

        grads = []
        means = []
        for param in self.all():
            grads.append(param.get_grad(reward) * scale)
            means.append(param.mean)

        self.optimizer.update(means, grads)

        for param in self.all():
            param.gen_pop_about_mean(self.sigma)

    def mutate(self, param, i):
        """Return population member ``i``, jittered with probability 1/4.

        The jitter is normal noise with standard deviation ``self.sigma``
        and the member's shape; otherwise the member is returned unchanged.
        """
        x = param.population[i]
        if random.randint(1, 4) == 1:
            jitter = tf.random.normal(x.shape, stddev=self.sigma)
            return x + jitter
        return x

    def mate(self, param, i, j):
        """Return a (possibly mutated) copy of parent ``i`` or parent ``j``.

        Parent ``i`` is chosen with probability 1/4, parent ``j`` otherwise.
        """
        if random.randint(1, 4) == 1:
            return self.mutate(param, i)
        return self.mutate(param, j)

    def update_ga(self, rewards):
        """Run one genetic-algorithm generation.

        The ``PASS_THROUGH`` members with the highest rewards are moved to
        the front of every parameter's population (best first); the
        remaining slots are refilled with offspring from ``mate``.
        ``rewards`` must provide ``argsort()`` (e.g. a NumPy array).
        """
        # indices of the best members, best first
        top_reward_indices = rewards.argsort()[-PASS_THROUGH:]
        top_reward_indices = top_reward_indices[::-1]

        for param in self.all():
            # Snapshot the elites before writing any slot. Copying straight
            # from population[j] to population[i] inside one loop can read a
            # slot that an earlier iteration has already overwritten.
            # NOTE(review): this assumes population[j] yields an independent
            # object (e.g. a tensor held in a list), not a view into shared
            # storage -- confirm against Param.
            elites = [param.population[j] for j in top_reward_indices]
            for i, member in enumerate(elites):
                param.population[i] = member

            # generate new population
            # NOTE(review): parents are drawn from slots 0..9 regardless of
            # PASS_THROUGH -- presumably PASS_THROUGH is 10; confirm.
            for k in range(PASS_THROUGH, self.population_size):
                param.population[k] = self.mate(param, random.randint(0, 9),
                                                random.randint(0, 9))
def _safe_r2_percent(y_true, y_pred):
    """Return the R^2 score as a percentage rounded to 2 places, or None on failure."""
    try:
        return round(r2_score(y_true, y_pred) * 100, 2)
    except Exception:
        # Best effort, as in the original: a diverged fit (e.g. NaN
        # predictions) must not abort the comparison. Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit through.
        return None


def run_models(X,
               y,
               lrdict,
               batch_size=50,
               dont_run=(),
               epochs=500,
               epoch_loss=True):
    """Fit a ``LinearRegression`` with several optimizers and compare them.

    The data is split 80/20 into train and test parts (fixed random state),
    one regression is fitted per optimizer, and the results are collected.

    Args:
        X, y: features and targets, passed to ``train_test_split``.
        lrdict: mapping of optimizer name to learning rate; overrides the
            default ``lr`` of the named optimizer. Names listed in
            ``dont_run`` are ignored; any other unknown name raises
            ``KeyError``.
        batch_size: batch size for every optimizer except ``'GD'`` (always
            ``None``) and ``'SGD'`` (always 1).
        dont_run: optimizer names to skip.
        epochs: forwarded to ``LinearRegression``.
        epoch_loss: forwarded to ``LinearRegression.fit``.

    Returns:
        ``(res, hist)``: ``res`` has one row per optimizer with columns
        ``opt``, ``loss`` (last recorded loss), ``train_r2``, ``test_r2``
        and ``c1..cN`` (final coefficients); ``hist`` has one row per
        recorded loss value with columns ``epoch``, ``loss``, ``opt``.
        Both are empty frames when every optimizer is excluded.
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    candidates = {
        'GD': GradientDescent(lr=0.01),
        'SGD': GradientDescent(lr=0.01),
        'SGDM': SGDM(lr=0.01, gamma=0.9),
        'Adam': Adam(lr=0.01),
        'Adagrad': Adagrad(lr=0.01),
        'Adadelta': Adadelta(),
        'RMSProp': RMSProp(lr=0.01),
    }
    opts = {
        name: opt
        for name, opt in candidates.items() if name not in dont_run
    }

    for opt_name, lr in lrdict.items():
        if opt_name in dont_run:
            # A learning rate for an excluded optimizer is harmless.
            continue
        opts[opt_name].lr = lr

    # Batch sizes fixed by the algorithm itself. They are looked up per
    # optimizer so they never leak into the optimizers that run afterwards
    # (reassigning ``batch_size`` in the loop made every optimizer after
    # 'SGD' run with a batch size of 1).
    forced_batch_sizes = {'GD': None, 'SGD': 1}

    rows = []
    histories = []
    for opt_name, opt in opts.items():
        print("Running Optimizer: ", opt_name)
        opt_batch_size = forced_batch_sizes.get(opt_name, batch_size)

        reg = LinearRegression(batch_size=opt_batch_size,
                               opt=opt,
                               epochs=epochs)
        reg.fit(X_train, y_train, epoch_loss=epoch_loss)

        losses = reg.history['loss']
        final_betas = [beta[0] for beta in reg.betas]

        train_r2 = _safe_r2_percent(y_train, reg.predict(X_train))
        test_r2 = _safe_r2_percent(y_test, reg.predict(X_test))

        row = {
            'opt': opt_name,
            'loss': losses[-1],
            'train_r2': train_r2,
            'test_r2': test_r2,
        }
        for position, beta in enumerate(final_betas):
            row['c' + str(position + 1)] = beta
        rows.append(row)

        histories.append(
            pd.DataFrame({
                'epoch': np.arange(len(losses)),
                'loss': losses,
                'opt': [opt_name] * len(losses)
            }))

    # DataFrame.append was removed in pandas 2.0; build the frames once
    # from the collected pieces instead.
    res = pd.DataFrame(rows)
    if histories:
        hist = pd.concat(histories, ignore_index=True)
    else:
        hist = pd.DataFrame()

    return res, hist