def _backpropagate(self, output_word_index):
    dE_dz_y = self.y.copy()  # don't remove the copy() part
    dE_dz_y[range(len(output_word_index)), output_word_index] -= 1.
    self.dE_dWy = np.dot(self.h.T, dE_dz_y)
    dE_dh = np.dot(dE_dz_y, self.Wy.T)
    self.dE_dWe = {}
    self.dE_dW = np.zeros_like(self.W)
    self.dE_dWr = np.zeros_like(self.Wr)
    self.dE_dWip = np.zeros_like(self.Wip)
    self.dE_dWfp = np.zeros_like(self.Wfp)
    self.dE_dWop = np.zeros_like(self.Wop)
    self.dE_dWp = np.zeros_like(self.Wp)
    dE_dm_tm1 = 0.
    dE_dh_tm1 = 0.
    m = self.m
    pause_history = self.pause_history if self.use_pauses else [None] * len(self.word_history)
    # reversed() needs a sequence, so materialize the zip iterator first (Python 3)
    for pauses, words, W, Wr, Wip, Wfp, Wop, x, m_tm1, h_tm1, z, i, ig, fg, og in reversed(list(zip(
            pause_history, self.word_history, self.W_history, self.Wr_history,
            self.Wip_history, self.Wfp_history, self.Wop_history, self.x_history,
            self.m_tm1_history, self.h_tm1_history, self.z_history, self.i_history,
            self.ig_history, self.fg_history, self.og_history))):
        dE_dh = dE_dh + dE_dh_tm1
        dE_dog = dE_dh * z * Sigmoid.dy_dz(y=og)
        dE_dz = dE_dh * og * self.hidden_activation.dy_dz(y=z)
        dE_dm = dE_dz + dE_dm_tm1 + dE_dog * Wop
        dE_dfg = dE_dm * m_tm1 * Sigmoid.dy_dz(y=fg)
        dE_di = dE_dm * ig * self.hidden_activation.dy_dz(y=i)
        dE_dig = dE_dm * i * Sigmoid.dy_dz(y=ig)
        dE_dm_tm1 = dE_dm * fg + dE_dig * Wip + dE_dfg * Wfp
        self.dE_dWip += (dE_dig * m_tm1).sum(0)
        self.dE_dWfp += (dE_dfg * m_tm1).sum(0)
        self.dE_dWop += (dE_dog * m).sum(0)
        d = np.hstack((dE_di, dE_dig, dE_dfg, dE_dog))
        dE_dx = np.dot(d, W.T) * self.hidden_activation.dy_dz(y=x)
        dE_dh_tm1 = np.dot(d, Wr.T)
        self.dE_dW += np.dot(x.T, d)
        self.dE_dWr += np.dot(h_tm1.T, d)
        for word, dE_dx_word in zip(words, dE_dx):  # izip is Python 2 only; use zip
            self.dE_dWe[word] = self.dE_dWe.get(word, 0.) + dE_dx_word
        if self.use_pauses:
            self.dE_dWp += np.dot(pauses.T, dE_dx)
        dE_dh = 0.
        m = m_tm1
def predict(self, input_word_index, pause_duration=None):
    assert self.initialized, "initialize or load before using"
    self.t_lstm.predict(input_word_index, pause_duration, compute_only_features=True)
    self.m_tm1 = self.m
    self.h_tm1 = self.h
    r = np.dot(self.h_tm1, self.Wr)
    z1 = np.dot(self.t_lstm.h, self.W)
    if self.use_pauses:
        z1 += np.dot(pause_duration[:, np.newaxis], self.Wp)
    # input transformation
    z = self.slice(r, self.hidden_size, 0) + self.slice(z1, self.hidden_size, 0)
    self.i = self.hidden_activation.y(z)
    # input gate (with peephole into the previous cell state)
    z = self.slice(r, self.hidden_size, 1) + self.slice(z1, self.hidden_size, 1) + self.m_tm1 * self.Wip
    self.ig = Sigmoid.y(z)
    # forget gate
    z = self.slice(r, self.hidden_size, 2) + self.slice(z1, self.hidden_size, 2) + self.m_tm1 * self.Wfp
    self.fg = Sigmoid.y(z)
    # memory cell update
    self.m = self.i * self.ig + self.m_tm1 * self.fg
    # output gate (peeks at the updated cell state)
    z = self.slice(r, self.hidden_size, 3) + self.slice(z1, self.hidden_size, 3) + self.m * self.Wop
    self.og = Sigmoid.y(z)
    self.z = self.hidden_activation.y(self.m)
    self.h = self.z * self.og
    z_y = np.dot(self.h, self.Wy)
    self.y = Softmax.y(z=z_y)
    self._remember_state(pause_duration)
def __init__(self, learning_rate=.1, momentum=0.3, gradient_descent=True):
    self.param = None
    self.learning_rate = learning_rate
    self.momentum = momentum
    self.gradient_descent = gradient_descent
    self.sigmoid = Sigmoid()
    self.log_loss = LogisticLoss()
def test_sigma(self):
    self.assertAlmostEqual(Sigmoid.activation(0), 0.5, places=2)
    self.assertAlmostEqual(Sigmoid.activation(50), 1, places=2)
    self.assertAlmostEqual(Sigmoid.activation(-50), 0, places=2)
    self.assertAlmostEqual(Sigmoid.activation(1), 0.731, places=2)
    self.assertAlmostEqual(Sigmoid.activation(-1), 0.2689, places=2)
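For reference, a minimal Sigmoid class consistent with the assertions above (and with test_sigma_prime further down) might look as follows. This is a sketch of the interface the tests assume, not the project's actual class.

import numpy as np

class Sigmoid:
    # Sketch only: assumes the tested class exposes static
    # `activation` and `gradient` methods.
    @staticmethod
    def activation(z):
        # logistic function: 1 / (1 + e^(-z))
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def gradient(z):
        # derivative expressed through the activation itself: s * (1 - s)
        s = Sigmoid.activation(z)
        return s * (1 - s)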
def __init__(self, n_hidden, n_iterations=3000, learning_rate=0.01):
    self.n_hidden = n_hidden
    self.n_iterations = n_iterations
    self.learning_rate = learning_rate
    self.hidden_activation = Sigmoid()
    self.output_activation = Softmax()
    self.loss = CrossEntropy()
def __init__(self, activation_function):
    Layer.__init__(self)
    # Instantiate the chosen activation function
    # (compare strings with ==, not identity with `is`)
    if activation_function == "relu":
        self.activation_function = Relu()
    if activation_function == "sigmoid":
        self.activation_function = Sigmoid()
def _backpropagate(self, output_word_index):
    dE_dz_y = self.y.copy()  # don't remove the copy() part
    dE_dz_y[range(len(output_word_index)), output_word_index] -= 1.
    self.dE_dWy = np.dot(self.h.T, dE_dz_y)
    dE_dh = np.dot(dE_dz_y, self.Wy.T) * self.hidden_activation.dy_dz(y=self.h)
    self.dE_dWr = np.zeros_like(self.Wr)
    self.dE_dW = np.zeros_like(self.W)
    self.dE_dWip = np.zeros_like(self.Wip)
    self.dE_dWfp = np.zeros_like(self.Wfp)
    self.dE_dWop = np.zeros_like(self.Wop)
    self.dE_dWp = np.zeros_like(self.Wp)
    dE_dm_tm1 = 0.
    dE_dh_tm1 = 0.
    m = self.m
    pause_history = self.pause_history if self.use_pauses else [None] * len(self.h_tm1_history)
    for pauses, Wr, Wip, Wfp, Wop, t_lstm_h, m_tm1, h_tm1, z, i, ig, fg, og in reversed(list(zip(
            pause_history, self.Wr_history, self.Wip_history, self.Wfp_history,
            self.Wop_history, self.t_lstm_h_history, self.m_tm1_history,
            self.h_tm1_history, self.z_history, self.i_history,
            self.ig_history, self.fg_history, self.og_history))):
        dE_dh = dE_dh + dE_dh_tm1
        dE_dog = dE_dh * z * Sigmoid.dy_dz(y=og)
        dE_dz = dE_dh * og * self.hidden_activation.dy_dz(y=z)
        dE_dm = dE_dz + dE_dm_tm1 + dE_dog * Wop
        dE_dfg = dE_dm * m_tm1 * Sigmoid.dy_dz(y=fg)
        dE_di = dE_dm * ig * self.hidden_activation.dy_dz(y=i)
        dE_dig = dE_dm * i * Sigmoid.dy_dz(y=ig)
        dE_dm_tm1 = dE_dm * fg + dE_dig * Wip + dE_dfg * Wfp
        self.dE_dWip += (dE_dig * m_tm1).sum(0)
        self.dE_dWfp += (dE_dfg * m_tm1).sum(0)
        self.dE_dWop += (dE_dog * m).sum(0)
        d = np.hstack((dE_di, dE_dig, dE_dfg, dE_dog))
        dE_dh_tm1 = np.dot(d, Wr.T)
        if self.use_pauses:
            self.dE_dWp += np.dot(pauses.T, d)
        self.dE_dW += np.dot(t_lstm_h.T, d)
        self.dE_dWr += np.dot(h_tm1.T, d)
        dE_dh = 0.
        m = m_tm1
class LogisticRegression():
    """The Logistic Regression classifier.

    Parameters:
    -----------
    learning_rate: float
        The step length that will be taken when following the negative
        gradient during training.
    gradient_descent: boolean
        Whether gradient descent should be used when training. If false,
        batch optimization by least squares is used instead.
    """
    def __init__(self, learning_rate=.1, gradient_descent=True):
        self.param = None
        self.learning_rate = learning_rate
        self.gradient_descent = gradient_descent
        self.sigmoid = Sigmoid()
        self.log_loss = LogisticLoss()

    def fit(self, X, y, n_iterations=4000):
        # Add dummy ones for bias weights
        X = np.insert(X, 0, 1, axis=1)
        n_samples, n_features = np.shape(X)
        # Initial parameters between [-1/sqrt(N), 1/sqrt(N)]
        a = -1 / math.sqrt(n_features)
        b = -a
        self.param = (b - a) * np.random.random((n_features,)) + a
        # Tune parameters for n iterations
        for i in range(n_iterations):
            # Make a new prediction
            y_pred = self.sigmoid.function(X.dot(self.param))
            if self.gradient_descent:
                # Move against the gradient of the loss function with
                # respect to the parameters to minimize the loss
                self.param -= self.learning_rate * self.log_loss.gradient(y, X, self.param)
            else:
                # Make a diagonal matrix of the sigmoid gradient column vector
                diag_gradient = make_diagonal(self.sigmoid.gradient(X.dot(self.param)))
                # Batch optimization by least squares
                self.param = np.linalg.pinv(X.T.dot(diag_gradient).dot(X)).dot(X.T).dot(
                    diag_gradient.dot(X).dot(self.param) + y - y_pred)

    def predict(self, X):
        # Add dummy ones for bias weights
        X = np.insert(X, 0, 1, axis=1)
        # Round the sigmoid output to the nearest class label
        dot = X.dot(self.param)
        y_pred = np.round(self.sigmoid.function(dot)).astype(int)
        return y_pred
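A hypothetical usage sketch for the classifier above, assuming Sigmoid, LogisticLoss, and make_diagonal are importable from the surrounding project; the toy AND-style data is invented for illustration.

import numpy as np

# Toy data (assumed example): label is 1 only when both features are 1.
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = np.array([0, 0, 0, 1])

clf = LogisticRegression(learning_rate=.1, gradient_descent=True)
clf.fit(X, y, n_iterations=4000)
print(clf.predict(X))  # expected to approach [0 0 0 1]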
def test_get_final_layer_error_for_arrays(self):
    quadratic = cost_functions.QuadraticCost(neural_net=self.net)
    z_last = np.array([3, -1], float)
    z_last_prime = Sigmoid.gradient(z_last)
    y = np.array([0, 0.5], float)
    a_last = Sigmoid.activation(z_last)
    nabla = quadratic.get_final_layer_error(a_last, y, z_last_prime)
    self.assertAlmostEqual(nabla[0], (a_last[0] - y[0]) * z_last_prime[0], places=2)
    self.assertAlmostEqual(nabla[1], (a_last[1] - y[1]) * Sigmoid.gradient(z_last[1]), places=2)
def predict(self, input_word_index, pause_duration=None, compute_only_features=False):
    assert self.initialized, "initialize or load before using"
    self.m_tm1 = self.m
    self.h_tm1 = self.h
    r = np.dot(self.h_tm1, self.Wr)
    z = self.We[input_word_index]
    if self.use_pauses:
        z += np.dot(pause_duration[:, np.newaxis], self.Wp)
    self.x = self.hidden_activation.y(z)
    z1 = np.dot(self.x, self.W)
    # input transformation
    z = self.slice(r, self.hidden_size, 0) + self.slice(z1, self.hidden_size, 0)
    self.i = self.hidden_activation.y(z)
    # input gate (with peephole into the previous cell state)
    z = self.slice(r, self.hidden_size, 1) + self.slice(z1, self.hidden_size, 1) + self.m_tm1 * self.Wip
    self.ig = Sigmoid.y(z)
    # forget gate
    z = self.slice(r, self.hidden_size, 2) + self.slice(z1, self.hidden_size, 2) + self.m_tm1 * self.Wfp
    self.fg = Sigmoid.y(z)
    # memory cell update
    self.m = self.i * self.ig + self.m_tm1 * self.fg
    # output gate (peeks at the updated cell state)
    z = self.slice(r, self.hidden_size, 3) + self.slice(z1, self.hidden_size, 3) + self.m * self.Wop
    self.og = Sigmoid.y(z)
    self.z = self.hidden_activation.y(self.m)
    self.h = self.z * self.og
    if not compute_only_features:
        z_y = np.dot(self.h, self.Wy)
        self.y = Softmax.y(z=z_y)
    if self.use_pauses:
        self._remember_state(input_word_index, pause_duration[:, np.newaxis])
    else:
        self._remember_state(input_word_index)
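To make the gate arithmetic above easier to follow in isolation, here is a self-contained NumPy sketch of one forward step with made-up sizes and random weights, assuming tanh as the hidden activation. Variable names mirror the method (i, ig, fg, og, m, h); this is an illustration, not the project's code.

import numpy as np

rng = np.random.default_rng(0)
batch, in_size, hidden = 2, 5, 4

x = rng.standard_normal((batch, in_size))          # embedded input
h_tm1 = np.zeros((batch, hidden))                  # previous hidden state
m_tm1 = np.zeros((batch, hidden))                  # previous cell state
W = rng.standard_normal((in_size, 4 * hidden))     # input weights, 4 blocks
Wr = rng.standard_normal((hidden, 4 * hidden))     # recurrent weights
Wip, Wfp, Wop = (rng.standard_normal(hidden) for _ in range(3))  # peepholes

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def block(z, k):
    # the k-th hidden-size slice of a 4*hidden matrix, like self.slice()
    return z[:, k * hidden:(k + 1) * hidden]

z1, r = np.dot(x, W), np.dot(h_tm1, Wr)
i = np.tanh(block(z1, 0) + block(r, 0))                 # input transform
ig = sigmoid(block(z1, 1) + block(r, 1) + m_tm1 * Wip)  # input gate
fg = sigmoid(block(z1, 2) + block(r, 2) + m_tm1 * Wfp)  # forget gate
m = i * ig + m_tm1 * fg                                 # new cell state
og = sigmoid(block(z1, 3) + block(r, 3) + m * Wop)      # output gate
h = np.tanh(m) * og                                     # new hidden state
print(h.shape)  # (2, 4)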
class Activation(Layer):
    def __init__(self, activation_function):
        Layer.__init__(self)
        # Instantiate the chosen activation function
        # (compare strings with ==, not identity with `is`)
        if activation_function == "relu":
            self.activation_function = Relu()
        if activation_function == "sigmoid":
            self.activation_function = Sigmoid()

    def forward_propagation(self, X):
        # Save the input for the layer
        self.X = X
        A = self.activation_function(self.X)
        # If there is a subsequent layer then return its output,
        # else return the output of this layer
        if self.next_layer is not None:
            return self.next_layer.forward_propagation(A)
        else:
            return A

    def backward_propogation(self, error_signal):
        # If there is a preceding layer then pass the error signal on,
        # scaled by this activation's local derivative
        if self.previous_layer is not None:
            self.previous_layer.backward_propogation(
                error_signal * self.activation_function.derivative(self.X))

    def initalise(self):
        self.shape = self.previous_layer.shape
def test_get_final_layer_error_for_1_element_vectors(self):
    cross_entropy = cost_functions.CrossEntropyCost(self.net)
    z_last = np.array([3], float)
    z_last_prime = Sigmoid.gradient(z_last)
    y = np.array([0], float)
    a_last = Sigmoid.activation(z_last)
    nabla = cross_entropy.get_final_layer_error(a_last, y, z_last_prime)
    self.assertAlmostEqual(nabla[0], (a_last - y), places=2)

    z_last = np.array([-1], float)
    z_last_prime = Rectifier.gradient(z_last)
    y = np.array([0.5], float)
    a_last = Sigmoid.activation(z_last)
    nabla = cross_entropy.get_final_layer_error(a_last, y, z_last_prime)
    self.assertAlmostEqual(nabla[0], (a_last - y), places=2)
def forward(self, inputs):
    self.x = inputs.reshape(1, X_DIM)
    self.y1 = np.matmul(self.x, self.w1) + self.b1
    self.y1 = LeakyReLU(self.y1)
    self.y2 = np.matmul(self.y1, self.w2) + self.b2
    self.y2 = LeakyReLU(self.y2)
    self.y3 = np.matmul(self.y2, self.w3) + self.b3
    self.y = Sigmoid(self.y3)
    return self.y
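forward() above calls LeakyReLU and Sigmoid as plain functions rather than classes. A minimal sketch of such helpers, with the 0.01 negative slope as an assumption:

import numpy as np

def LeakyReLU(z, alpha=0.01):
    # pass positives through, scale negatives by alpha (assumed slope)
    return np.where(z > 0, z, alpha * z)

def Sigmoid(z):
    # logistic function: 1 / (1 + e^(-z))
    return 1.0 / (1.0 + np.exp(-z))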
def __init__(self, grad_wrt_theta=True):
    sigmoid = Sigmoid()
    self.log_func = sigmoid.function
    self.log_grad = sigmoid.gradient
    if grad_wrt_theta:
        self.gradient = self._grad_wrt_theta
    else:
        self.gradient = self._grad_wrt_pred
        self.hess = self._hess_wrt_pred
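The _grad_wrt_theta, _grad_wrt_pred, and _hess_wrt_pred methods are not shown here. Hypothetical bodies consistent with the logistic loss and with the update rules used by the LogisticRegression classes in this section (assumptions, not the project's actual code):

import numpy as np

def _grad_wrt_theta(self, y, X, theta):
    # Gradient of the logistic loss w.r.t. the parameters: -(y - p) X
    p = self.log_func(X.dot(theta))
    return -(y - p).dot(X)

def _grad_wrt_pred(self, y, y_pred):
    # Gradient w.r.t. the raw prediction (as used in gradient boosting)
    p = self.log_func(y_pred)
    return -(y - p)

def _hess_wrt_pred(self, y, y_pred):
    # Second derivative w.r.t. the prediction: p(1 - p)
    p = self.log_func(y_pred)
    return p * (1 - p)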
class LogisticRegression():
    """Logistic Regression classifier.

    Parameters:
    -----------
    learning_rate: float
        The step length that will be taken when following the negative
        gradient during training.
    gradient_descent: boolean
        Whether gradient descent should be used when training. If false,
        batch optimization by least squares is used instead.
    """
    def __init__(self, learning_rate=.1, gradient_descent=True):
        self.param = None
        self.learning_rate = learning_rate
        self.gradient_descent = gradient_descent
        self.sigmoid = Sigmoid()

    def _initialize_parameters(self, X):
        n_features = np.shape(X)[1]
        # Initialize parameters between [-1/sqrt(N), 1/sqrt(N)]
        limit = 1 / math.sqrt(n_features)
        self.param = np.random.uniform(-limit, limit, (n_features,))

    def fit(self, X, y, n_iterations=1000):
        self._initialize_parameters(X)
        # Tune parameters for n iterations
        for i in range(n_iterations):
            # Make a new prediction
            y_pred = self.sigmoid(X.dot(self.param))
            if self.gradient_descent:
                # Move against the gradient of the loss function with
                # respect to the parameters to minimize the loss
                self.param -= self.learning_rate * -(y - y_pred).dot(X)
            else:
                # Make a diagonal matrix of the sigmoid gradient column vector
                diag_gradient = make_diagonal(self.sigmoid.gradient(X.dot(self.param)))
                # Batch optimization by least squares
                self.param = np.linalg.pinv(X.T.dot(diag_gradient).dot(X)).dot(X.T).dot(
                    diag_gradient.dot(X).dot(self.param) + y - y_pred)

    def predict(self, X):
        y_pred = np.round(self.sigmoid(X.dot(self.param))).astype(int)
        return y_pred

    def predict_proba(self, X):
        # Probability of the positive class and its complement
        y_prob = self.sigmoid(X.dot(self.param))
        return np.column_stack((1 - y_prob, y_prob))
def test_sigma_prime(self):
    self.assertAlmostEqual(Sigmoid.gradient(0), 0.25, places=3)
    self.assertAlmostEqual(Sigmoid.gradient(-50), 0, places=3)
    self.assertAlmostEqual(Sigmoid.gradient(50), 0, places=3)
    self.assertAlmostEqual(Sigmoid.gradient(50),
                           Sigmoid.activation(50) * (1 - Sigmoid.activation(50)),
                           places=3)
def __init__(self):
    sigmoid = Sigmoid()
    self.log_func = sigmoid
    self.log_grad = sigmoid.gradient
class MultilayerPerceptron():
    """Multilayer Perceptron classifier. A fully-connected neural network
    with one hidden layer. Unrolled to display the whole forward and
    backward pass.

    Parameters:
    -----------
    n_hidden: int
        The number of processing nodes (neurons) in the hidden layer.
    n_iterations: int
        The number of training iterations the algorithm will tune the
        weights for.
    learning_rate: float
        The step length that will be used when updating the weights.
    """
    def __init__(self, n_hidden, n_iterations=3000, learning_rate=0.01):
        self.n_hidden = n_hidden
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.hidden_activation = Sigmoid()
        self.output_activation = Softmax()
        self.loss = CrossEntropy()

    def _initialize_weights(self, X, y):
        n_samples, n_features = X.shape
        _, n_outputs = y.shape
        # Hidden layer
        limit = 1 / math.sqrt(n_features)
        self.W = np.random.uniform(-limit, limit, (n_features, self.n_hidden))
        self.w0 = np.zeros((1, self.n_hidden))
        # Output layer
        limit = 1 / math.sqrt(self.n_hidden)
        self.V = np.random.uniform(-limit, limit, (self.n_hidden, n_outputs))
        self.v0 = np.zeros((1, n_outputs))

    def fit(self, X, y):
        self._initialize_weights(X, y)
        for i in range(self.n_iterations):
            # ..............
            #  Forward Pass
            # ..............
            # HIDDEN LAYER
            hidden_input = X.dot(self.W) + self.w0  # e.g. (1079,64)(64,16)+(1,16) -> (1079,16)
            hidden_output = self.hidden_activation(hidden_input)
            # OUTPUT LAYER
            output_layer_input = hidden_output.dot(self.V) + self.v0
            y_pred = self.output_activation(output_layer_input)

            # ...............
            #  Backward Pass
            # ...............
            # OUTPUT LAYER
            # Grad. w.r.t input of output layer
            grad_wrt_out_l_input = self.loss.gradient(y, y_pred) * \
                self.output_activation.gradient(output_layer_input)  # (1079,10)
            grad_v = hidden_output.T.dot(grad_wrt_out_l_input)  # (16,1079)(1079,10) -> (16,10)
            grad_v0 = np.sum(grad_wrt_out_l_input, axis=0, keepdims=True)  # (1,10)
            # HIDDEN LAYER
            # Grad. w.r.t input of hidden layer
            grad_wrt_hidden_l_input = grad_wrt_out_l_input.dot(self.V.T) * \
                self.hidden_activation.gradient(hidden_input)
            grad_w = X.T.dot(grad_wrt_hidden_l_input)
            grad_w0 = np.sum(grad_wrt_hidden_l_input, axis=0, keepdims=True)

            # Update weights (by gradient descent)
            # Move against the gradient to minimize loss
            self.V -= self.learning_rate * grad_v
            self.v0 -= self.learning_rate * grad_v0
            self.W -= self.learning_rate * grad_w
            self.w0 -= self.learning_rate * grad_w0

    # Use the trained model to predict labels of X
    def predict(self, X):
        # Forward pass:
        hidden_input = X.dot(self.W) + self.w0
        hidden_output = self.hidden_activation(hidden_input)
        output_layer_input = hidden_output.dot(self.V) + self.v0
        y_pred = self.output_activation(output_layer_input)
        return y_pred
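A hypothetical usage sketch for the MLP above, assuming Sigmoid, Softmax, and CrossEntropy from the surrounding project are importable; the toy data and sizes are invented for illustration.

import numpy as np

X = np.random.randn(100, 8)
labels = (X.sum(axis=1) > 0).astype(int)
y = np.eye(2)[labels]  # one-hot targets, shape (100, 2)

clf = MultilayerPerceptron(n_hidden=16, n_iterations=3000, learning_rate=0.01)
clf.fit(X, y)
pred = np.argmax(clf.predict(X), axis=1)
print("accuracy:", (pred == labels).mean())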
def __init__(self, learning_rate=.1, gradient_descent=True):
    self.param = None
    self.learning_rate = learning_rate
    self.gradient_descent = gradient_descent
    self.sigmoid = Sigmoid()
import random

import numpy as np

from shallow_network import ShallowNetwork
from activation_functions import Sigmoid, LeakyRelu

# Train the network to behave like a binary "AND" function
training_data = [[[0, 0], [0]],
                 [[0, 1], [0]],
                 [[1, 0], [0]],
                 [[1, 1], [1]]]

# network = ShallowNetwork(2, 1, LeakyRelu(), 0.03)
network = ShallowNetwork(2, 1, Sigmoid(), 0.5)

for training_session in range(10000):
    training_set = random.choice(training_data)
    inputs = training_set[0]
    target_output = training_set[1]
    outputs = network.feed_forward(inputs)
    network.back_propagate(inputs, outputs, target_output)
    error = np.subtract(outputs, target_output)
    print('error:', '{:.4f}'.format(abs(error[0])),
          'target_output', target_output, 'output:', outputs)
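The two training scripts here import ShallowNetwork and DeepNetwork, which are not shown. Below is a minimal single-layer stand-in matching the ShallowNetwork(n_inputs, n_outputs, activation, learning_rate) constructor and the feed_forward/back_propagate calls above; it is an assumption about the real implementation, including the activation's activate/derivative interface.

import numpy as np

class ShallowNetwork:
    # Sketch only: one fully-connected layer trained on squared error.
    def __init__(self, n_inputs, n_outputs, activation, learning_rate):
        self.activation = activation
        self.learning_rate = learning_rate
        self.weights = np.random.uniform(-1, 1, (n_inputs, n_outputs))
        self.biases = np.zeros(n_outputs)

    def feed_forward(self, inputs):
        # keep the pre-activation around for the backward pass
        self.z = np.dot(inputs, self.weights) + self.biases
        return self.activation.activate(self.z)  # assumed interface

    def back_propagate(self, inputs, outputs, target_output):
        # squared-error gradient through the activation
        delta = (outputs - np.asarray(target_output)) * self.activation.derivative(self.z)
        self.weights -= self.learning_rate * np.outer(inputs, delta)
        self.biases -= self.learning_rate * delta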
import random

import numpy as np

from deep_network import DeepNetwork
from activation_functions import Sigmoid, LeakyRelu

# Train the network to give us the XOR on neuron 0 and the OR on neuron 1
training_data = [[[0, 0], [0, 0]],
                 [[0, 1], [1, 1]],
                 [[1, 0], [1, 1]],
                 [[1, 1], [0, 1]]]

# network = DeepNetwork(2, 4, 1, LeakyRelu(), 0.03)
network = DeepNetwork(2, 4, 2, Sigmoid(), 0.5)

for training_session in range(20000):
    training_set = random.choice(training_data)
    inputs = training_set[0]
    target_output = training_set[1]
    outputs = network.feed_forward(inputs)
    network.back_propagate(inputs, outputs, target_output)
    error = np.subtract(outputs, target_output)
    print('error:', ['{:.4f}'.format(abs(error[0])), '{:.4f}'.format(abs(error[1]))],
          'target_output', target_output, 'output:', outputs)
    # (end of a fitness method; the enclosing class is not shown)
    score_sum = sum(np.array(self.game.gamegrid.matrix).flatten().tolist())
    penalty = self.fitness_penalty
    return score_max + score_sum + penalty


GENERATION_SIZE = 4
GENERATION_COUNT = 2
PRINT_STEPS = True
WEIGHTS_METHOD = 'random'

nn_parameters = {
    'neurons_per_hidden_layer': [17, 17, 17],
    'input_layer_size': 17,
    'output_layer_size': 4,
    'input_af': Log2(),
    'hidden_af': [TanH(), ReLU(), Sigmoid()],
    'output_af': TanH()
}
game_parameters = {
    'manual_input': True,
    'random': False,
    'steps': 0,
    'sleep': 0
}

ga = GeneticAlgorithm(generation_size=GENERATION_SIZE, **nn_parameters)
ga.add_new_generation(weights_method=WEIGHTS_METHOD)
ga.populate_new_generation(ga[0], ga[0], weights_method=WEIGHTS_METHOD)

for k in range(GENERATION_COUNT):  # (loop body not included in this snippet)