class Net:
    """A single neuron that remembers its weights, last inputs and last output.

    The forward pass computes ``output = f(X . W)`` where ``f`` is the
    activation object stored in ``self.activition`` (the attribute and the
    ``differential_activition`` method keep their original spelling because
    callers may depend on those names).
    """

    def __init__(self, method=Method.Sigmoid):
        # NOTE(review): `method` is accepted but never forwarded to
        # Activation(); left untouched because Activation's signature is not
        # visible from here.
        self.weights = []  # Current weights
        self.old_weights = []  # Weights as they were before the last update
        self.output = 0.0  # Neuron output
        self.inputted_features = []  # Features given to the last forward pass
        self.summed_signal = 0.0  # Summed signal (weighted sum of the inputs)
        self.learning_rate = 0.8  # Learning rate
        self.activition = Activation()  # Activation function

    def summarize_inputs(self, features=None):
        """Run a forward pass: sum the weighted inputs and activate the result.

        Args:
            features: Sequence (list, tuple or numpy array) of input values.

        Returns:
            The activated output, or ``0.0`` when no features are supplied.
        """
        # An explicit emptiness test is used instead of `not features`, which
        # raises ValueError for numpy arrays holding more than one element.
        if features is None or len(features) == 0:
            return 0.0

        self.inputted_features = copy.deepcopy(features)
        self.summed_signal = np.dot(self.inputted_features,
                                    self.weights)  # a = X * W
        self.output = self.activition.activate(self.summed_signal)  # b = f(a)
        return self.output

    def update_weights(self, error_value=0.0):
        """Apply one weight update using the inputs of the last forward pass.

        new weight = old weight + learning rate * error_value * input

        Args:
            error_value: Error term used to scale the correction.
        """
        self.old_weights = copy.deepcopy(self.weights)
        step = self.learning_rate * error_value
        for index, old_weight in enumerate(self.old_weights):
            self.weights[index] = (
                old_weight + step * self.inputted_features[index])

    def differential_activition(self):
        """Return the activation derivative evaluated at the last output."""
        return self.activition.differentiate(self.output)
class Net(object):
    """One neuron of a recurrent network (hidden or output layer).

    Hidden-layer nets carry recurrent weights; output-layer nets do not.
    Weight corrections are collected per BPTT timestep in ``self.timesteps``
    and applied in one go by :meth:`renew`.

    Read/write property: ``activation_method``.
    Read-only properties: ``output_value``, ``output_partial``,
    ``previous_output``, ``previous_output_partial``.
    """

    def __init__(self, has_recurrent=False):
        self.weights = []  # <number>
        self.recurrent_weights = []  # <number>
        self.bias = 0.0
        # The current delta value becomes the "next" delta value while
        # walking backwards through time.
        self.delta_value = 0.0
        # Hidden nets have recurrent inputs, output nets do not.
        self.has_recurrent = has_recurrent
        # The activation's method is reachable as net.activation.method;
        # the activation_method property below is a shortcut to it.
        self.activation = Activation()
        self.output = NetOutput()
        self.timesteps = []  # <Timestep object>

    @staticmethod
    def _is_empty(values):
        """Return True for None or a zero-length sequence (numpy-safe)."""
        return values is None or len(values) == 0

    def reset_weights(self, weights=None):
        """Replace the weights with a plain-list copy of `weights`.

        Empty or missing input leaves the current weights untouched.
        """
        if self._is_empty(weights):
            return
        self.weights = np.copy(weights).tolist()

    def reset_recurrent_weights(self, recurrent_weights=None):
        """Replace the recurrent weights with a plain-list copy of the input.

        Empty or missing input leaves the current recurrent weights untouched.
        """
        if self._is_empty(recurrent_weights):
            return
        self.recurrent_weights = np.copy(recurrent_weights).tolist()

    def weight_for_index(self, index=0):
        """Return the weight stored at `index`."""
        return self.weights[index]

    def recurrent_weight_for_index(self, index=0):
        """Return the recurrent weight stored at `index`."""
        return self.recurrent_weights[index]

    def remove_all_weights(self):
        """Clear both weight lists in place; the bias is reset to zero too."""
        del self.weights[:]
        del self.recurrent_weights[:]
        self.bias = 0.0

    def randomize_weights(self, random_count=1, min=-0.5, max=0.5):
        """Draw `random_count` weights and a bias uniformly from [min, max)."""
        # Slice assignment keeps the same list object, like the original
        # clear-then-append sequence did.
        self.weights[:] = [
            np.random.uniform(min, max) for _ in range(random_count)
        ]
        self.bias = np.random.uniform(min, max)

    def randomize_recurrent_weights(self, random_count=1, min=-0.5, max=0.5):
        """Draw `random_count` recurrent weights; no-op without recurrence."""
        if not self.has_recurrent:
            return
        self.recurrent_weights[:] = [
            np.random.uniform(min, max) for _ in range(random_count)
        ]

    def net_output(self, inputs=None, recurrent_outputs=None):
        """Compute, record and return the neuron output for one time step.

        Args:
            inputs: Feed-forward inputs to this net.
            recurrent_outputs: Previous-step outputs fed back to this net;
                leave empty for output-layer nets.

        Returns:
            The activated output value.
        """
        inputs = [] if inputs is None else inputs
        recurrent_outputs = [] if recurrent_outputs is None else recurrent_outputs

        # Ordinary feed-forward contribution first.
        summed_signal = np.dot(inputs, self.weights) + self.bias
        # Then the recurrent contribution, when there is one.
        if len(recurrent_outputs) > 0:
            summed_signal += np.dot(recurrent_outputs, self.recurrent_weights)

        output_value = self.activation.activate(summed_signal)
        self.output.add_sum_input(summed_signal)
        self.output.add_output_value(output_value)
        return output_value

    def clear(self):
        """Refresh the recorded outputs."""
        self.output.refresh()

    def calculate_delta_weights(self,
                                learning_rate=1.0,
                                layer_outputs=None,
                                recurrent_outputs=None):
        """Record the weight corrections of one BPTT timestep.

        Hidden-layer nets use this with recurrent outputs, output-layer nets
        without them.

        SGD: new w = old w + (-learning rate * delta_value * x), where x is
        the value that entered through that weight (hidden output, previous
        hidden output, or input feature).

        Args:
            learning_rate: Step size applied to every correction.
            layer_outputs: Values that entered through ``self.weights``.
            recurrent_outputs: Values that entered through
                ``self.recurrent_weights``.
        """
        layer_outputs = [] if layer_outputs is None else layer_outputs
        recurrent_outputs = [] if recurrent_outputs is None else recurrent_outputs

        # A Timestep is the container for the corrections of one BP timestep.
        timestep = Timestep()
        scaled_delta = learning_rate * self.delta_value
        timestep.delta_bias = scaled_delta

        for weight_index in range(len(self.weights)):
            timestep.add_delta_weight(
                scaled_delta * layer_outputs[weight_index])

        # Output-layer nets have no recurrent weights, so this loop is empty
        # for them.
        for recurrent_index in range(len(self.recurrent_weights)):
            timestep.add_recurrent_delta_weight(
                scaled_delta * recurrent_outputs[recurrent_index])

        self.timesteps.append(timestep)

    def renew_weights(self, new_weights=None, new_recurrent_weights=None):
        """Swap in new weights and recurrent weights, keeping the bias.

        Does nothing when both inputs are empty.
        """
        if self._is_empty(new_weights) and self._is_empty(
                new_recurrent_weights):
            return
        # Only the weight lists are cleared here. The previous code went
        # through remove_all_weights(), which also zeroed the bias and made
        # renew() lose the old bias before renew_bias() added its delta.
        del self.weights[:]
        del self.recurrent_weights[:]
        self.reset_weights(new_weights)
        self.reset_recurrent_weights(new_recurrent_weights)

    def renew_bias(self):
        """Add the bias corrections of all recorded timesteps to the bias."""
        # new b(j) = old b(j) + [-L * -delta(j)]
        self.bias += sum(
            (timestep.delta_bias for timestep in self.timesteps), 0.0)

    def renew(self):
        """Apply all recorded corrections to weights and bias, then reset."""
        # Sum, per weight index, the corrections of every timestep.
        new_weights = [
            weight + sum((timestep.delta_weight(weight_index)
                          for timestep in self.timesteps), 0.0)
            for weight_index, weight in enumerate(self.weights)
        ]
        new_recurrent_weights = [
            recurrent_weight + sum(
                (timestep.recurrent_delta_weight(recurrent_index)
                 for timestep in self.timesteps), 0.0)
            for recurrent_index, recurrent_weight in enumerate(
                self.recurrent_weights)
        ]

        self.renew_weights(new_weights, new_recurrent_weights)
        self.renew_bias()

        del self.timesteps[:]
        # Must be reset because of how BPTT accumulates the delta value.
        self.delta_value = 0.0

    @property
    def output_value(self):
        """The most recent output value."""
        return self.output.last_output_value

    @property
    def output_partial(self):
        """Activation derivative for the most recent output.

        Linear-output activations (e.g. SGN, ReLU) are differentiated at the
        summed input; non-linear ones at the output value.
        """
        if self.activation.is_linear:
            value = self.output.last_sum_input
        else:
            value = self.output_value
        return self.activation.partial(value)

    @property
    def previous_output(self):
        """The output of the previous moment, e.g. b[t-1][h]."""
        return self.output.previous_output

    @property
    def previous_output_partial(self):
        """Activation derivative for the previous moment's output."""
        if self.activation.is_linear:
            value = self.output.previous_sum_input
        else:
            value = self.previous_output
        return self.activation.partial(value)

    @property
    def activation_method(self):
        """The activation method (shortcut for ``activation.method``)."""
        return self.activation.method

    @activation_method.setter
    def activation_method(self, method):
        self.activation.method = method
class Neuron:
    """A single neuron trained sample-by-sample with the delta rule.

    Training repeats full passes over the stored samples until either
    ``max_iteration`` passes were made or the per-pass cost drops to
    ``convergence``.
    """

    # Private usage.
    __iteration_times = 0  # Number of completed iterations
    __iteration_error = 0.0  # Sum of squared errors of the current iteration

    def __init__(self):
        self.tag = self.__class__.__name__
        self.samples = []  # All training samples (feature vectors)
        self.targets = []  # Target output of each sample
        self.weights = []  # Weights
        self.bias = 0.0  # Bias (not used by _net_input)
        self.learning_rate = 1.0  # Learning rate
        self.max_iteration = 1  # Maximum number of iterations
        self.convergence = 0.001  # Convergence error
        self.activation = Activation()

    @staticmethod
    def _is_empty(values):
        """Return True for None or a zero-length sequence (numpy-safe)."""
        return values is None or len(values) == 0

    def _iteration_cost_function(self):
        """Return the cost of the iteration that just finished.

        1/2 * (sum of the samples' squared errors /
               (number of samples * number of target outputs per sample))
        """
        if not self.samples:
            # Nothing was trained on, so there is no error to report.
            return 0.0
        return 0.5 * (self.__iteration_error / (len(self.samples) * 1))

    def _cost_function(self, error_value=0.0):
        """Accumulate one sample's squared error.

        The 1/2 factor is applied once in _iteration_cost_function(), so it
        is deliberately left out here.
        """
        self.__iteration_error += (error_value**2)

    def _net_input(self, features=None):
        """Return the weighted sum of `features`."""
        features = [] if features is None else features
        return np.dot(features, self.weights)

    def _net_output(self, net_input=0.0):
        """Return the activation of `net_input`."""
        return self.activation.activate(net_input)

    def _start(self, iteration, completion):
        """Run training iterations until a stop condition is met.

        Args:
            iteration: Optional callback ``(times, weights)`` invoked after
                every iteration that is not the last one.
            completion: Optional callback ``(times, weights)`` invoked once
                when training stops.
        """
        # A loop replaces the former self-recursion, which exhausted the
        # interpreter's recursion limit on long trainings.
        while True:
            self.__iteration_times += 1
            # The cost is judged per iteration, so the accumulator has to
            # start from zero each time (the former `+= 0.0` never reset it).
            self.__iteration_error = 0.0

            # Every step is written out on purpose to keep the flow clear.
            for features, target_value in zip(self.samples, self.targets):
                # Forward
                net_input = self._net_input(features)
                net_output = self._net_output(net_input)

                # Backward
                error_value = target_value - net_output
                derived_activation = self.activation.partial(net_output)

                # Cost function of this training sample.
                self._cost_function(error_value)

                # Weight update:
                #   delta_value  = -(target value - net output) * f'(net)
                #   delta_weight = -learning rate * delta_value * x1
                #   (the two minus signs cancel each other)
                #   new w1       = old w1 + delta_weight w1
                delta_value = error_value * derived_activation
                delta_weights = np.multiply(
                    self.learning_rate * delta_value, features)
                self.weights = np.add(self.weights, delta_weights)

            reached_limit = self.__iteration_times >= self.max_iteration
            if reached_limit or (
                    self._iteration_cost_function() <= self.convergence):
                if completion is not None:
                    completion(self.__iteration_times, self.weights)
                return

            if iteration is not None:
                iteration(self.__iteration_times, self.weights)

    def add_pattern(self, features=None, target=0):
        """Store one training sample (features -> one target).

        Empty or missing features are ignored.
        """
        if self._is_empty(features):
            return
        self.samples.append(features)
        self.targets.append(target)

    def initialize_weights(self, weights=None):
        """Use `weights` as the current weights; empty input is ignored."""
        if self._is_empty(weights):
            return
        self.weights = weights

    def zero_weights(self):
        """Set one zero weight per input feature of the first sample."""
        if not self.samples:
            return
        # Replace rather than append, so repeated calls do not grow the list.
        self.weights = [0.0] * len(self.samples[0])

    def randomize_weights(self, min=0.0, max=1.0):
        """Draw one weight per input feature uniformly from [min, max)."""
        if not self.samples:
            return
        input_count = len(self.samples[0])
        weights = [np.random.uniform(min, max) for _ in range(input_count)]
        self.initialize_weights(weights)

    def training(self, iteration, completion):
        """Train from scratch; `iteration` and `completion` are callbacks."""
        self.__iteration_times = 0
        self.__iteration_error = 0.0
        self._start(iteration, completion)

    def predict(self, features=None):
        """Return the neuron output for `features`."""
        return self._net_output(self._net_input(features))
class RNN:
    """Simple recurrent network trained with back propagation through time.

    U - weight matrix from input into state layer.
    W - weight matrix from state layer to state layer.
    V - weight matrix from state layer to output layer.
    """

    def __init__(self,
                 input_layer_size,
                 state_layer_size,
                 state_layer_activation,
                 output_layer_size,
                 output_layer_activation,
                 epochs=100,
                 bptt_truncate=None,
                 learning_rule='bptt',
                 kernel=None,
                 eta=0.001,
                 rand=None,
                 verbose=0):
        """
        Inputs:
            input_layer_size: Size of the input vector.
            state_layer_size: State layer size.
            state_layer_activation: Passed to Activation() for the state layer.
            output_layer_size: Size of the output vector.
            output_layer_activation: Passed to Activation() for the output
                layer.
            epochs (opt): Number of passes over the training set in fit().
            bptt_truncate (opt): If left at None, back propagation through
                time is applied for all time steps. Otherwise it limits how
                far back each error is propagated. Only used by 'bptt'.
            learning_rule (opt): 'bptt' or 'modified' (case-insensitive).
            kernel (opt): Stored as self.kernel; not used by the code in this
                class yet (the 'modified' rule is not implemented).
            eta (opt): Learning rate. Defaults to 0.001.
            rand (opt): Seed handed to np.random.seed. None means no fixed
                seed.
            verbose (opt): Any value above 0 shows a progress bar in fit().
        Raises:
            ValueError: If learning_rule is not one of the supported names.
        """
        np.random.seed(rand)

        self.learning_rule = learning_rule.lower()
        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'modified':
            # Was assigned to a misspelled attribute, which left
            # gradient_function undefined and made fit() fail.
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError(
                'Unknown learning_rule: {!r}'.format(learning_rule))

        self.input_layer_size = input_layer_size
        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)
        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)
        self.epochs = epochs
        self.kernel = kernel
        self.bptt_truncate = bptt_truncate

        # Drawn in the order U, V, W so that a given seed keeps producing the
        # same initial weights.
        input_bound = np.sqrt(1. / input_layer_size)
        state_bound = np.sqrt(1. / state_layer_size)
        self.U = np.random.uniform(-input_bound, input_bound,
                                   (state_layer_size, input_layer_size))
        self.V = np.random.uniform(-state_bound, state_bound,
                                   (output_layer_size, state_layer_size))
        self.W = np.random.uniform(-state_bound, state_bound,
                                   (state_layer_size, state_layer_size))
        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))

        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def fit(self, X_train, y_train):
        """
        Inputs:
            X_train: Training inputs. A list of numpy arrays, each of size
                (T, input_layer_size) where T is the sequence length.
            y_train: Training outputs. A list of numpy arrays, each of size
                (T, output_layer_size).
        Outputs:
            None
        """
        eta = self.eta
        bar = None
        if self.show_progress_bar:
            bar = ProgressBar(max_value=self.epochs)

        for epoch in range(self.epochs):
            for x, y in zip(X_train, y_train):
                dLdU, dLdV, dLdW, dLdOb, dLdSb = self.gradient_function(x, y)
                self.U -= eta * dLdU
                self.V -= eta * dLdV
                self.W -= eta * dLdW
                # NOTE(review): the bias steps are not scaled by eta, unlike
                # the weight steps - confirm whether that is intended.
                self.output_bias -= dLdOb
                self.state_bias -= dLdSb
            if bar is not None:
                bar.update(epoch)

    def forward_propagation(self, x):
        """
        Inputs:
            x: Expect size (T, input_layer_size), where T is the length of
                time.
        Outputs:
            o: The activation of the output layer, size (T, output size).
            s: The activation of the hidden state, size (T + 1, state size).
            s_linear: Pre-activation values of the state layer.
            o_linear: Pre-activation values of the output layer.
        """
        T = x.shape[0]

        # One extra, never-written row: at t == 0 the index s[t - 1] wraps to
        # that last row, which serves as the all-zero initial state.
        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))

        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)

        for t in range(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) +
                            state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)

            output_linear = np.dot(self.V, s[t]) + output_bias
            o_linear[t] = output_linear
            o[t] = self.output_activation.activate(output_linear)

        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix
            dLdV: Gradient for V matrix
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias
            dLdSb: Gradient for state layer bias
        TODO - implement this
        """
        raise NotImplementedError

    def bptt(self, x, y):
        """
        Inputs:
            x: One input sequence, size (T, input_layer_size).
            y: The matching target sequence, size (T, output_layer_size).
        Output:
            dLdU: Gradient for U matrix
            dLdV: Gradient for V matrix
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias
            dLdSb: Gradient for state layer bias
            Each gradient is averaged over the number of terms added to it.
        """
        # TODO - numpy likes to provide 1D matrices instead of 2D, and
        # unfortunately we need 2D matrices, hence the Convert1DTo2D calls.
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        # dLdV / dLdOb receive one term per output step; dLdU / dLdW / dLdSb
        # receive one term per unrolled state step. Each is averaged with its
        # own count (the former counters were mixed up for dLdU and dLdV).
        num_output_additions = 0
        num_state_additions = 0

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])

            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_output_additions += 1

            # Error with respect to the state activation at time t.
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt_truncate steps
            first_step = max(0, t - bptt_truncate)
            for bptt_step in reversed(range(first_step, t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                # The input that fed this unrolled step (was x[t], which
                # paired every past step with the present input).
                x_step = Convert1DTo2D(x[bptt_step])

                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_step.T)
                dLdSb += g
                num_state_additions += 1

                # Carry the error one step back through the recurrent
                # weights (chain rule); the former extra elementwise product
                # with g squared the error signal.
                g = np.dot(self.W.T, g)

        return [
            dLdU / num_state_additions, dLdV / num_output_additions,
            dLdW / num_state_additions, dLdOb / num_output_additions,
            dLdSb / num_state_additions
        ]

    def predict(self, X):
        """
        Inputs:
            X: A list of numpy arrays, each of size (T, input_layer_size).
        Outputs:
            predictions: A list with the output activations of each sequence.
        """
        predictions = []
        for x in X:
            o, _, _, _ = self.forward_propagation(x)
            predictions.append(o)
        return predictions

    def score(self, X, Y):
        """
        Inputs:
            X: A list of numpy arrays, each of size (T, input_layer_size).
            Y: A list of numpy arrays, each of size (T, output_layer_size).
        Outputs:
            The mean over all sequences of each sequence's MSE.
        """
        predictions = self.predict(X)
        # Compare each sequence with its own target (the whole prediction
        # list used to be subtracted from every target).
        mses = [
            np.mean((prediction - y)**2)
            for prediction, y in zip(predictions, Y)
        ]
        return np.mean(mses)
class Dense:
    """A fully connected layer with its own weights, biases and gradients."""

    def __init__(self, num_neurons, input_shape):
        """Create the layer.

        Args:
            num_neurons: Number of neurons; sizes the biases, activation
                potentials, outputs and local gradients.
            input_shape: Passed as `size` to the random weight initialiser,
                so it is also the shape of ``self.weights``.
        """
        print(
            'Adding Layer: input_shape: {}, number of neurons: {}, output_shape: {}'
            .format(input_shape, num_neurons, num_neurons))
        # Weights start in the interval [0, 1) for the synaptic inputs.
        self.weights = np.random.uniform(low=0, high=1, size=input_shape)
        # Every bias starts at 1.
        self.biases = np.ones(num_neurons)
        # Activation potentials of all neurons start at 0.
        self.activation_potentials = np.zeros(num_neurons)
        # Outputs of this layer.
        self.outputs = np.zeros(num_neurons)
        # Local gradients of all the neurons in this layer.
        self.local_gradients = np.zeros(num_neurons)
        # The activation function, for non-linearity of the outputs.
        self.activation = Activation()
        # Inputs to this layer -> outputs from the previous layer.
        self.previous_layers_outputs = []
        print('Added Layer ... ')

    def get_gradients(self):
        """Return the local gradients computed by the last backward pass."""
        return self.local_gradients

    def get_weights(self):
        """Return the current weights."""
        return self.weights

    def activation_potential(self, inputs):
        """Store ``inputs . weights + biases`` as the activation potentials."""
        self.activation_potentials = np.dot(inputs, self.weights) + self.biases

    def local_gradient(self, error_at_end, layer, network, next_gradients,
                       next_weights):
        """Compute and store this layer's local gradients.

        Args:
            error_at_end: Error at the network output (used for the output
                layer only).
            layer: The layer being processed; it is the output layer when it
                equals ``network[-1]``.
            network: Sequence of layers, output layer last.
            next_gradients: Local gradients of the following layer.
            next_weights: Weights of the following layer.
        """
        derivative = self.activation.activate_derivative(
            self.activation_potentials)
        if layer == network[-1]:
            # Output layer = error * derivative of activation function
            # NOTE(review): np.dot collapses two equal-length vectors into a
            # scalar; confirm the shapes callers pass before relying on it.
            self.local_gradients = np.dot(error_at_end, derivative)
        else:
            # Hidden layers = derivative of activation function * sum of
            # (gradients of the next layer * weights going to that layer)
            self.local_gradients = np.dot(
                derivative, np.dot(next_gradients, next_weights))

    def forward_signal(self, inputs):
        """Run the layer forward and return its outputs for the next layer."""
        self.activation_potential(inputs)
        self.outputs = self.activation.activate(self.activation_potentials)
        return self.outputs

    def backward_signal(self, error_at_end, layer, network, learning_rate,
                        next_gradients, next_weights, inputs):
        """Compute the local gradients, then update weights and biases.

        Args:
            error_at_end, layer, network, next_gradients, next_weights:
                Forwarded to local_gradient().
            learning_rate: Step size of the update.
            inputs: The inputs this layer received in the forward pass.
        """
        self.local_gradient(error_at_end, layer, network, next_gradients,
                            next_weights)
        # np.add performs the elementwise update. The former np.sum call
        # treated its second argument as `axis` and raised a TypeError.
        self.weights = np.add(
            self.weights,
            learning_rate * np.dot(self.local_gradients, inputs))
        self.biases = np.add(
            self.biases, learning_rate * np.dot(self.local_gradients, 1))
class RNN:
    """Recurrent network with BPTT, feedback-alignment and a modified rule.

    U - weight matrix from input into state layer.
    W - weight matrix from state layer to state layer.
    V - weight matrix from state layer to output layer.
    B - fixed random feedback matrix used by the 'fa' and 'modified' rules.
    """

    def __init__(self,
                 input_layer_size,
                 state_layer_size,
                 state_layer_activation,
                 output_layer_size,
                 output_layer_activation,
                 epochs=100,
                 bptt_truncate=None,
                 learning_rule='bptt',
                 tau=None,
                 eta=1e-5,
                 rand=None,
                 verbose=0):
        """
        Inputs:
            input_layer_size: Size of the input vector.
            state_layer_size: State layer size.
            state_layer_activation: Passed to Activation() for the state layer.
            output_layer_size: Size of the output vector.
            output_layer_activation: Passed to Activation() for the output
                layer.
            epochs (opt): Number of passes over the training set in fit().
            bptt_truncate (opt): If left at None, back propagation through
                time is applied for all time steps. Otherwise it limits how
                far back each error is propagated.
            learning_rule (opt): 'bptt', 'fa' or 'modified'
                (case-insensitive).
            tau (opt): Time constant of the running average of state
                activations kept in self.convolutions. A falsy value disables
                that average; the 'modified' rule requires it.
            eta (opt): Learning rate. Defaults to 1e-5.
            rand (opt): Seed handed to np.random.seed. None means no fixed
                seed.
            verbose (opt): Any value above 0 shows a progress bar in fit().
        Raises:
            ValueError: If learning_rule is not one of the supported names.
        """
        np.random.seed(rand)

        self.learning_rule = learning_rule.lower()
        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'fa':
            self.gradient_function = self.feedback_alignment
        elif self.learning_rule == 'modified':
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError(
                'Unknown learning_rule: {!r}'.format(learning_rule))

        self.input_layer_size = input_layer_size
        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)
        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)
        self.epochs = epochs
        self.tau = tau
        self.convolutions = None
        if self.tau:
            self.convolutions = np.zeros((state_layer_size, ))
        self.bptt_truncate = bptt_truncate

        # U and V are fixed identity matrices here (fit() never updates them)
        # instead of the uniformly random initialisation used for W.
        # NOTE(review): identity U / V only have the right shapes when the
        # input, state and output sizes are all equal - confirm with callers.
        self.U = np.eye(state_layer_size)
        self.V = np.eye(state_layer_size)
        state_bound = np.sqrt(1. / state_layer_size)
        self.W = np.random.uniform(-state_bound, state_bound,
                                   (state_layer_size, state_layer_size))
        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))

        # B - feedback weight matrix (drawn after W to keep seeded runs
        # reproducible).
        self.B = np.random.uniform(-state_bound, state_bound,
                                   (state_layer_size, input_layer_size))

        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def kernel_compute(self, t):
        """Return the exponential kernel exp(-t / 1) evaluated at `t`."""
        time_const = 1
        return np.exp(-t / time_const)

    def fit(self, X_train, y_train):
        """
        Inputs:
            X_train: Training inputs. A list of numpy arrays, each of size
                (T, input_layer_size) where T is the sequence length.
            y_train: Training outputs. A list of numpy arrays, each of size
                (T, output_layer_size).
        Outputs:
            None
        """
        eta = self.eta
        epochs = range(self.epochs)
        if self.show_progress_bar:
            # The bar used to be created only in verbose mode but was called
            # unconditionally, so the default verbose=0 crashed.
            epochs = ProgressBar()(epochs)

        for epoch in epochs:
            print('epoch {}'.format(epoch))
            if self.convolutions is not None:
                # Restart the running average for every epoch.
                self.convolutions *= 0.
            for x, y in zip(X_train, y_train):
                dLdU, dLdV, dLdW, dLdOb, dLdSb = self.gradient_function(x, y)
                # Only W and the state bias are trained; U, V and the output
                # bias stay at their initial values.
                self.W -= eta * dLdW
                # NOTE(review): the bias step is not scaled by eta, unlike
                # the weight step - confirm whether that is intended.
                self.state_bias -= dLdSb

    def forward_propagation(self, x):
        """
        Inputs:
            x: Expect size (T, input_layer_size), where T is the length of
                time.
        Outputs:
            o: The activation of the output layer, size (T, output size).
            s: The activation of the hidden state, size (T + 1, state size).
            s_linear: Pre-activation values of the state layer.
            o_linear: Pre-activation values of the output layer.
        """
        T = x.shape[0]

        # One extra, never-written row: at t == 0 the index s[t - 1] wraps to
        # that last row, which serves as the all-zero initial state.
        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))

        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)

        for t in range(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) +
                            state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)

            output_linear = np.dot(self.V, s[t]) + output_bias
            o_linear[t] = output_linear
            o[t] = self.output_activation.activate(output_linear)

            # NOTE(review): the running average is updated once per time
            # step here; the original layout was ambiguous - confirm.
            if self.convolutions is not None:
                if all(self.convolutions == 0):
                    self.convolutions = s[t]
                # 1.0 keeps this a true division under Python 2 as well.
                rate = 1.0 / self.tau
                self.convolutions = (
                    (1 - rate) * self.convolutions + rate * s[t])

        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
        Learning rule: project the output error through the random feedback
        matrix B and correlate it with the running average of the state
        activations (self.convolutions). Only dLdW is non-zero.

        Output:
            dLdU: Gradient for U matrix (zeros)
            dLdV: Gradient for V matrix (zeros)
            dLdW: Gradient for W matrix, divided by the sequence length
            dLdOb: Gradient for output layer bias (zeros)
            dLdSb: Gradient for state layer bias (zeros)
        """
        T = len(y)
        assert T == len(x)

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            e = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])

            e = e * self.output_activation.dactivate(o_linear_val)
            e = np.dot(self.V.T, e)

            # TODO - a kernel-weighted sum over past inputs (see
            # kernel_compute) was sketched here but is not in use.
            assert self.convolutions is not None
            dLdW += self.B.dot(e).dot(Convert1DTo2D(self.convolutions).T)

        return [dLdU, dLdV, dLdW / T, dLdOb, dLdSb]

    def _backprop_through_time(self, x, y, feedback):
        """Shared backward pass of bptt() and feedback_alignment().

        Args:
            x: One input sequence, size (T, input_layer_size).
            y: The matching target sequence.
            feedback: Matrix that carries the state error one step back in
                time (W.T for BPTT, B.T for feedback alignment).

        Returns:
            [dLdU, dLdV, dLdW, dLdOb, dLdSb], each averaged over the number
            of terms added to it.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate

        o, s, s_linear, o_linear = self.forward_propagation(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)

        # dLdV / dLdOb receive one term per output step; dLdU / dLdW / dLdSb
        # receive one term per unrolled state step. Each is averaged with its
        # own count (the former counters were mixed up for dLdU and dLdV).
        num_output_additions = 0
        num_state_additions = 0

        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])

            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_output_additions += 1

            # Error with respect to the state activation at time t.
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt_truncate steps
            first_step = max(0, t - bptt_truncate)
            for bptt_step in reversed(range(first_step, t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                # The input that fed this unrolled step (was x[t], which
                # paired every past step with the present input).
                x_step = Convert1DTo2D(x[bptt_step])

                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_step.T)
                dLdSb += g
                num_state_additions += 1

                # Carry the error one step back in time; the former extra
                # elementwise product with g squared the error signal.
                g = np.dot(feedback, g)

        return [
            dLdU / num_state_additions, dLdV / num_output_additions,
            dLdW / num_state_additions, dLdOb / num_output_additions,
            dLdSb / num_state_additions
        ]

    def feedback_alignment(self, x, y):
        """
        Like bptt(), but the state error travels back in time through the
        fixed random matrix B instead of the recurrent weights W.

        Output:
            dLdU: Gradient for U matrix
            dLdV: Gradient for V matrix
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias
            dLdSb: Gradient for state layer bias
        """
        return self._backprop_through_time(x, y, self.B.T)

    def bptt(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix
            dLdV: Gradient for V matrix
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias
            dLdSb: Gradient for state layer bias
        """
        return self._backprop_through_time(x, y, self.W.T)

    def predict(self, X):
        """
        Inputs:
            X: A list of numpy arrays, each of size (T, input_layer_size).
        Outputs:
            predictions: A list with the output activations of each sequence.
        """
        predictions = []
        for x in X:
            o, _, _, _ = self.forward_propagation(x)
            predictions.append(o)
        return predictions

    def score(self, X, Y):
        """
        Inputs:
            X: A list of numpy arrays, each of size (T, input_layer_size).
            Y: A list of numpy arrays, each of size (T, output_layer_size).
        Outputs:
            The mean over all sequences of each sequence's MSE.
        """
        predictions = self.predict(X)
        # Compare each sequence with its own target (the whole prediction
        # list used to be subtracted from every target).
        mses = [
            np.mean((prediction - y)**2)
            for prediction, y in zip(predictions, Y)
        ]
        return np.mean(mses)
class RNN(object):
    """Single-layer recurrent network with several gradient rules.

    Weight matrices:
        U - input layer -> state (hidden) layer,  shape (state, input).
        W - state layer -> state layer,           shape (state, state).
        V - state layer -> output layer,          shape (output, state).
        B - fixed random feedback matrix with the same shape as W, used by
            the feedback-alignment style rules instead of W.T.

    NOTE(review): relies on ``Activation``, ``Convert1DTo2D``,
    ``Convert2DTo1D``, ``train_test_split``, ``ProgressBar`` and
    ``mean_squared_error`` being available at module level.
    """

    def __init__(self, input_layer_size, state_layer_size,
                 state_layer_activation, output_layer_size,
                 output_layer_activation, epochs=100, bptt_truncate=None,
                 learning_rule='bptt', tau=None, eta=0.001, rand=None,
                 verbose=0):
        """Create the network and draw its initial weights.

        Inputs:
            input_layer_size: Size of one input vector.
            state_layer_size: Size of the state (hidden) layer.
            state_layer_activation: Passed to ``Activation`` for the state
                layer (a string; refer to activation.py).
            output_layer_size: Size of one output vector.
            output_layer_activation: Passed to ``Activation`` for the output
                layer (a string; refer to activation.py).
            epochs (opt): Number of passes over the training set in ``fit``.
            bptt_truncate (opt): Maximum number of steps to unroll back in
                time. ``None`` unrolls over the whole sequence.
            learning_rule (opt): One of 'bptt', 'fa', 'dfa' or 'modified'
                (case-insensitive).
            tau (opt): Time constant used by ``kernel_compute`` and by the
                running state average kept for the 'modified' rule
                (alpha = 1 / tau). Must be set when those are used.
            eta (opt): Learning rate. Initialized to 0.001.
            rand (opt): Seed handed to ``np.random.seed`` (None = no fixed
                seed). Note that this seeds NumPy's global generator.
            verbose (opt): Verbosity level 0 - 2.

        Raises:
            ValueError: If ``learning_rule`` is not a known rule.
        """
        np.random.seed(rand)

        self.learning_rule = learning_rule.lower()
        gradient_functions = {
            'bptt': self.bptt,
            'fa': self.feedback_alignment,
            'dfa': self.direct_feedback_alignment,
            'modified': self.modified_learning_rule,
        }
        if self.learning_rule not in gradient_functions:
            raise ValueError(
                "Unknown learning_rule {0!r}; expected one of "
                "'bptt', 'fa', 'dfa' or 'modified'".format(learning_rule))
        self.gradient_function = gradient_functions[self.learning_rule]

        self.input_layer_size = input_layer_size
        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)
        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)
        self.epochs = epochs
        self.tau = tau
        self.bptt_truncate = bptt_truncate
        self.kernel_convs = None

        # Weight initialisation. The draw order (U, V, W, B) is kept fixed so
        # that a given seed keeps producing the same network.
        # A uniform(-sqrt(1/n), sqrt(1/n)) initialisation for U, V and W was
        # tried for the 'bptt' rule and is currently disabled.
        if (state_layer_size == input_layer_size
                and state_layer_size == output_layer_size):
            print("Using identity matrices for U and V")
            self.U = np.eye(state_layer_size)
            self.V = np.eye(state_layer_size)
        else:
            self.U = np.random.uniform(
                1, 2., (state_layer_size, input_layer_size))
            self.V = np.random.uniform(
                1, 2., (output_layer_size, state_layer_size))
        self.W = np.random.uniform(
            -0.5, 0.5, (state_layer_size, state_layer_size))

        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))

        # B - feedback weight matrix, same shape as W.
        self.B = np.random.uniform(0., 0.5, self.W.shape)

        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def _zero_gradients(self):
        """Return zero arrays shaped like U, V, W, output_bias, state_bias."""
        return (np.zeros(self.U.shape), np.zeros(self.V.shape),
                np.zeros(self.W.shape), np.zeros(self.output_bias.shape),
                np.zeros(self.state_bias.shape))

    def _truncation_for(self, T):
        """Return the number of steps to unroll for a sequence of length T."""
        if self.bptt_truncate is None:
            return T
        return self.bptt_truncate

    def kernel_compute(self, t):
        """Return the exponential kernel exp(-t / tau)."""
        # Multiply by -1.0 first so that integer t and tau are not
        # floor-divided on Python 2.
        return np.exp(-1.0 * t / self.tau)

    def eWBe(self, x, y):
        """Return e^T . W . B . e for every time step, latest step first.

        ``e`` is the output error ``o[t] - y[t]``; since it is multiplied
        with W directly, the output size has to equal the state size.
        """
        o, _, _, _ = self.forward_propagation(x)
        delta_o = o - y
        values = []
        for t in reversed(range(len(x))):
            e = delta_o[t]
            values.append(np.dot(np.dot(np.dot(e.T, self.W), self.B), e))
        return values

    def fit(self, X, y, validation_size=0.1):
        """Train the network and record the loss before every epoch.

        Inputs:
            X: Training inputs, one sequence per sample. Each sequence is
                handed to ``forward_propagation``.
            y: Training targets, one sequence per sample, aligned with X.
            validation_size (opt): Passed to ``train_test_split`` as
                ``test_size``.

        Outputs:
            (training_losses, validation_losses): one entry per epoch, each
            computed with ``score`` *before* that epoch's updates.
        """
        eta = self.eta
        X = np.array(X)
        y = np.array(y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=validation_size, random_state=0)

        if self.verbose:
            print("Validation size: {0}".format(validation_size))
            print("Training on {0} samples".format(len(X_train)))

        training_losses = []
        validation_losses = []

        # Non-online
        if self.show_progress_bar:
            bar = ProgressBar(max_value=len(X_train))

        for epoch in range(self.epochs):
            training_loss = self.score(X_train, y_train)
            validation_loss = self.score(X_test, y_test)
            training_losses.append(training_loss)
            validation_losses.append(validation_loss)

            if self.verbose == 2:
                print("--------")
                print("Epoch {0}/{1}".format(epoch, self.epochs))
                print("Training loss: {0}".format(training_loss))
                print("Validation loss: {0}".format(validation_loss))
                print("--------")

            for i, (x_sample, y_sample) in enumerate(zip(X_train, y_train)):
                _, _, dLdW, _, dLdSb = self.gradient_function(
                    x_sample, y_sample)
                # Only the recurrent weights and the state bias are trained;
                # the updates of U, V and the output bias are disabled.
                self.W -= eta * dLdW
                self.state_bias -= eta * dLdSb

                if self.show_progress_bar:
                    bar.update(i)

            if self.show_progress_bar:
                bar.update(0)

        return training_losses, validation_losses

    def forward_propagation(self, x):
        """Run the network over one sequence.

        Inputs:
            x: Expect size (T, input_layer_size), where T is the length of
                time.

        Outputs:
            o: The activation of the output layer, shape (T, output size).
            s: The activation of the hidden state, shape (T + 1, state size).
                The extra last row stays zero and serves as the initial
                state, because ``s[t - 1]`` resolves to ``s[-1]`` at t == 0.
            s_linear: Pre-activation of the state layer, same shape as s.
            o_linear: Pre-activation of the output layer, same shape as o.

        Side effect: for the 'modified' rule ``self.kernel_convs`` is reset
        and refilled with a running average of the state activations.
        """
        T = x.shape[0]
        track_kernel = self.learning_rule == 'modified'
        if track_kernel:
            self.kernel_convs = np.zeros((self.state_layer_size, T))

        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))

        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)

        for t in range(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1])
                            + state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)

            if track_kernel and t > 0:
                # 1.0 keeps this a true division for integer tau on Python 2.
                alpha = 1.0 / self.tau
                self.kernel_convs[:, t] = (
                    alpha * s[t] + (1 - alpha) * self.kernel_convs[:, t - 1])

            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear

        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """Gradient estimate from the kernel-averaged state activations.

        The output error is projected back through V.T and the random
        feedback matrix B, then combined with the running state average
        stored in ``self.kernel_convs`` by ``forward_propagation``.

        Output:
            dLdU: Gradient for U matrix (always zero here)
            dLdV: Gradient for V matrix (always zero here)
            dLdW: Gradient for W matrix, averaged over time steps
            dLdOb: Gradient for output layer bias (always zero here)
            dLdSb: Gradient for state layer bias, averaged over time steps
        """
        T = len(y)
        assert T == len(x)

        o, _, _, _ = self.forward_propagation(x)
        dLdU, dLdV, dLdW, dLdOb, dLdSb = self._zero_gradients()
        delta_o = o - y

        for t in reversed(range(T)):
            # Error at the state layer, routed through the feedback matrix.
            e = Convert1DTo2D(self.V.T.dot(delta_o[t]))
            feedback = np.dot(self.B, e)
            dLdW += feedback.dot(Convert1DTo2D(self.kernel_convs[:, t]).T)
            dLdSb += feedback

        # An empty sequence leaves the sums at zero; avoid 0 / 0 = NaN.
        steps = max(T, 1)
        return [dLdU, dLdV, dLdW / steps, dLdOb, dLdSb / steps]

    def direct_feedback_alignment(self, x, y):
        """Gradient estimate using direct feedback alignment.

        The same feedback signal ``B . (V.T . error)`` of time step t is
        applied at every unrolled step instead of being propagated back.

        Output:
            dLdU: Gradient for U matrix (always zero here)
            dLdV: Gradient for V matrix (always zero here)
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias (always zero here)
            dLdSb: Gradient for state layer bias
        All values are divided by the sequence length.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = self._truncation_for(T)

        o, s, s_linear, _ = self.forward_propagation(x)
        dLdU, dLdV, dLdW, dLdOb, dLdSb = self._zero_gradients()
        delta_o = o - y

        for t in reversed(range(T)):
            e = Convert1DTo2D(self.V.T.dot(delta_o[t]))
            # Identical for every unrolled step of this t, so compute once.
            feedback = self.B.dot(e)

            # Unroll through time for at most bptt_truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])

                g = feedback * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdSb += g

        # An empty sequence leaves the sums at zero; avoid 0 / 0 = NaN.
        steps = max(T, 1)
        return [dLdU / steps, dLdV / steps, dLdW / steps,
                dLdOb / steps, dLdSb / steps]

    def feedback_alignment(self, x, y):
        """Gradient estimate using feedback alignment.

        Works like BPTT, but the error is carried back in time through the
        fixed random matrix B rather than through W.T.

        Output:
            dLdU: Gradient for U matrix (always zero here)
            dLdV: Gradient for V matrix (always zero here)
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias (always zero here)
            dLdSb: Gradient for state layer bias
        All values are divided by the sequence length.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = self._truncation_for(T)

        o, s, s_linear, _ = self.forward_propagation(x)
        dLdU, dLdV, dLdW, dLdOb, dLdSb = self._zero_gradients()
        delta_o = o - y

        for t in reversed(range(T)):
            # Error of this time step at the state layer
            g = Convert1DTo2D(self.V.T.dot(delta_o[t]))

            # Unroll through time for at most bptt_truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])

                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdSb += g
                g = np.dot(self.B, g)

        # An empty sequence leaves the sums at zero; avoid 0 / 0 = NaN.
        steps = max(T, 1)
        return [dLdU / steps, dLdV / steps, dLdW / steps,
                dLdOb / steps, dLdSb / steps]

    # An earlier "online" bptt variant, which backpropagated only from the
    # final time step, used to be kept here as disabled code. The non-online
    # version below is the active implementation.
    def bptt(self, x, y):
        """Gradients by (optionally truncated) backpropagation through time.

        Output:
            dLdU: Gradient for U matrix
            dLdV: Gradient for V matrix
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias
            dLdSb: Gradient for state layer bias
        All values are divided by the sequence length.
        """
        # TODO - numpy likes to provide 1D matrices instead of 2D, and
        # unfortunately we need 2D matrices. Therefore we have a lot of
        # converting 1D to 2D matrices and we might want to clean that later.
        T = len(y)
        assert T == len(x)
        bptt_truncate = self._truncation_for(T)

        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU, dLdV, dLdW, dLdOb, dLdSb = self._zero_gradients()
        delta_o = o - y

        for t in reversed(range(T)):
            # Backprop the error at the output layer
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])

            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            g = np.dot(self.V.T, g)

            # Backpropagation through time for at most bptt truncate steps
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                # The input that fed the state at the step being unrolled,
                # not the input of the step the error originated from.
                x_present = Convert1DTo2D(x[bptt_step])

                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_present.T)
                dLdSb += g
                g = np.dot(self.W.T, g)

        # An empty sequence leaves the sums at zero; avoid 0 / 0 = NaN.
        steps = max(T, 1)
        return [dLdU / steps, dLdV / steps, dLdW / steps,
                dLdOb / steps, dLdSb / steps]

    def predict(self, X):
        """Return the output-layer activations for every sequence in X.

        Inputs:
            X: Iterable of input sequences; each one is passed to
                ``forward_propagation``.

        Outputs:
            predictions: list with one output array per sequence.
        """
        return [self.forward_propagation(x)[0] for x in X]

    def score(self, X, Y):
        """Return the mean over all sequences of their mean squared error.

        Inputs:
            X: Iterable of input sequences.
            Y: Iterable of target sequences aligned with X.

        Outputs:
            MSE
        """
        predictions = self.predict(X)
        mses = [mean_squared_error(prediction, y)
                for prediction, y in zip(predictions, Y)]
        return np.mean(mses)