class RNN:
    """Recurrent network trainable with BPTT, feedback alignment ('fa'),
    or a modified random-feedback rule using an exponential kernel.

    Weight matrices:
        U - input -> state layer (pinned to the identity here).
        W - state -> state (recurrent; the only trained matrix).
        V - state -> output layer (pinned to the identity here).
        B - fixed random feedback weights used by 'fa' / 'modified'.
    """

    def __init__(self, input_layer_size, state_layer_size,
                 state_layer_activation, output_layer_size,
                 output_layer_activation, epochs=100, bptt_truncate=None,
                 learning_rule='bptt', tau=None, eta=1e-5, rand=None,
                 verbose=0):
        """
        Inputs:
            input_layer_size: Size of the input vector (X.shape[1]).
            state_layer_size: State (hidden) layer size.
            state_layer_activation: A string. Refer to activation.py.
            output_layer_size: Size of the output vector (Y.shape[1]).
            output_layer_activation: A string. Refer to activation.py.
            epochs (opt): Number of passes over the training data.
            bptt_truncate (opt): If None, backpropagation through time runs
                over all time steps; otherwise over at most bptt_truncate
                steps. Only considered when learning_rule == 'bptt'.
            learning_rule (opt): One of 'bptt', 'fa' or 'modified'.
            tau (opt): Time constant of the running exponential average of
                the state used by the 'modified' rule; None disables it.
            eta (opt): Learning rate. Defaults to 1e-5.
            rand (opt): Random seed. Defaults to None (no fixed seed).
            verbose (opt): Verbosity: levels 0 - 2.
        Outputs:
            None
        Raises:
            ValueError: if learning_rule is not a recognized rule name.
        """
        np.random.seed(rand)
        self.learning_rule = learning_rule.lower()
        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'fa':
            self.gradient_function = self.feedback_alignment
        elif self.learning_rule == 'modified':
            self.gradient_function = self.modified_learning_rule
        else:
            # BUGFIX: raise an instance with a message, not the bare class.
            raise ValueError(
                "Unknown learning_rule: {0!r}".format(learning_rule))
        self.input_layer_size = input_layer_size
        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)
        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)
        self.epochs = epochs
        self.tau = tau
        # Running exponential average of the state ('modified' rule only).
        self.convolutions = None
        if self.tau:
            self.convolutions = np.zeros((state_layer_size,))
        self.bptt_truncate = bptt_truncate
        # U - weight matrix from input into state layer.
        # W - weight matrix from state layer to state layer.
        # V - weight matrix from state layer to output layer.
        # U and V are pinned to the identity for this experiment, which
        # implicitly assumes input, state and output sizes are equal.
        self.U = np.eye(state_layer_size)
        self.V = np.eye(state_layer_size)
        self.W = np.random.uniform(-np.sqrt(1. / state_layer_size),
                                   np.sqrt(1. / state_layer_size),
                                   (state_layer_size, state_layer_size))
        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))
        # B - fixed random feedback weight matrix for all layers.
        self.B = np.random.uniform(-np.sqrt(1. / state_layer_size),
                                   np.sqrt(1. / state_layer_size),
                                   (state_layer_size, input_layer_size))
        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def kernel_compute(self, t):
        """Exponential kernel exp(-t / time_const) with a unit time
        constant.  NOTE(review): presumably this should use self.tau, as a
        later revision does - confirm before relying on it."""
        time_const = 1.0
        return np.exp(-t / time_const)

    def fit(self, X_train, y_train):
        """Train the network (W and state bias only).

        Inputs:
            X_train: list of arrays of size (T, input_layer_size).
            y_train: list of arrays of size (T, output_layer_size).
        Outputs:
            None
        """
        eta = self.eta
        # BUGFIX: train regardless of verbosity; the progress bar is only a
        # wrapper around the epoch iterator when verbose > 0.
        epoch_iter = range(self.epochs)
        if self.show_progress_bar:
            epoch_iter = ProgressBar()(epoch_iter)
        for epoch in epoch_iter:
            if self.verbose:
                print('epoch {}'.format(epoch))
            if self.convolutions is not None:
                # Reset the kernel convolution state between epochs.
                self.convolutions *= 0.
            for x, y in zip(X_train, y_train):
                dLdU, dLdV, dLdW, dLdOb, dLdSb = self.gradient_function(x, y)
                # Only W and the state bias are trained; U and V stay fixed
                # at the identity in this configuration.
                self.W -= eta * dLdW
                # BUGFIX: scale the bias update by the learning rate too.
                self.state_bias -= eta * dLdSb

    def forward_propagation(self, x):
        """
        Inputs:
            x: Expect size (T, input_layer_size), where T is the length of
               time.
        Outputs:
            o: Output-layer activations, (T, output_layer_size).
            s: State-layer activations, (T + 1, state_layer_size).
            s_linear: Pre-activation state values.
            o_linear: Pre-activation output values.
        """
        T = x.shape[0]
        # One extra row so s[-1] (the initial state) is all zeros at t == 0.
        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))
        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)
        for t in np.arange(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1])
                            + state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)
            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear
            if self.convolutions is not None:
                if all(self.convolutions == 0):
                    # First step: seed the running average with the state.
                    self.convolutions = s[t]
                # BUGFIX: force float division; with an integer tau the old
                # `1 / self.tau` truncated to zero under Python 2.
                alpha = 1.0 / self.tau
                self.convolutions = ((1 - alpha) * self.convolutions
                                     + alpha * s[t])
        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix (left at zero; U is not trained).
            dLdV: Gradient for V matrix (left at zero; V is not trained).
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias (left at zero).
            dLdSb: Gradient for state layer bias (left at zero).
        Learning Rule:
            Push the output error back through the fixed random feedback
            weights B and pair it with the kernel-convolved state.
        """
        T = len(y)
        assert T == len(x)
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        delta_o = o - y
        # The rule requires the running convolution state (tau must be set).
        assert self.convolutions is not None
        for t in reversed(range(T)):
            # Error at the output layer, pushed back through V.
            e = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            e = e * self.output_activation.dactivate(o_linear_val)
            e = np.dot(self.V.T, e)
            # Random-feedback update using the kernel-convolved state.
            dLdW += self.B.dot(e).dot(Convert1DTo2D(self.convolutions).T)
        return [dLdU, dLdV, dLdW / T, dLdOb, dLdSb]

    def feedback_alignment(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix.
            dLdV: Gradient for V matrix.
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias.
            dLdSb: Gradient for state layer bias.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        num_dVdW_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer.
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])
            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_dU_additions += 1
            g = np.dot(self.V.T, g)
            # Backpropagation through time for at most bptt_truncate steps,
            # with the fixed random matrix B standing in for W.T.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                x_present = Convert1DTo2D(x[t])
                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_present.T)
                dLdSb += g
                num_dVdW_additions += 1
                # BUGFIX: propagate through the feedback weights; the old
                # `g = g * np.dot(self.B.T, g)` multiplied g by itself.
                g = np.dot(self.B.T, g)
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def bptt(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix.
            dLdV: Gradient for V matrix.
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias.
            dLdSb: Gradient for state layer bias.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        num_dVdW_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer.
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])
            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_dU_additions += 1
            g = np.dot(self.V.T, g)
            # Backpropagation through time for at most bptt_truncate steps.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                x_present = Convert1DTo2D(x[t])
                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_present.T)
                dLdSb += g
                num_dVdW_additions += 1
                # BUGFIX: propagate through the recurrent weights; the old
                # `g = g * np.dot(self.W.T, g)` multiplied g by itself.
                g = np.dot(self.W.T, g)
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def predict(self, X):
        """
        Inputs:
            X: List of arrays of size (T, input_layer_size).
        Outputs:
            List of output activations, one (T, output_layer_size) array
            per input sequence.
        """
        predictions = []
        for x in X:
            o, _, _, _ = self.forward_propagation(x)
            predictions.append(o)
        return predictions

    def score(self, X, Y):
        """
        Inputs:
            X: List of arrays of size (T, input_layer_size).
            Y: List of arrays of size (T, output_layer_size).
        Outputs:
            Mean of per-sequence MSEs.
        """
        predictions = self.predict(X)
        mses = []
        for prediction, y in zip(predictions, Y):
            # BUGFIX: compare each prediction (not the whole list) to y.
            mses.append(np.mean((prediction - y) ** 2))
        return np.mean(mses)
class RNN:
    """Recurrent network trained with backpropagation through time.

    Weight matrices:
        U - input -> state layer.
        W - state -> state (recurrent).
        V - state -> output layer.
    """

    def __init__(self, input_layer_size, state_layer_size,
                 state_layer_activation, output_layer_size,
                 output_layer_activation, epochs=100, bptt_truncate=None,
                 learning_rule='bptt', kernel=None, eta=0.001, rand=None,
                 verbose=0):
        """
        Inputs:
            input_layer_size: Size of the input vector (X.shape[1]).
            state_layer_size: State (hidden) layer size.
            state_layer_activation: A string. Refer to activation.py.
            output_layer_size: Size of the output vector (Y.shape[1]).
            output_layer_activation: A string. Refer to activation.py.
            epochs (opt): Number of passes over the training data.
            learning_rule (opt): Choose between 'bptt' and 'modified'.
            bptt_truncate (opt): If None, backpropagation through time runs
                over all time steps; otherwise over at most bptt_truncate
                steps. Only considered when learning_rule == 'bptt'.
            kernel (opt): Kernel for the 'modified' rule (not yet used).
            eta (opt): Learning rate. Initialized to 0.001.
            rand (opt): Random seed. Initialized to None (no random seed).
            verbose (opt): Verbosity: levels 0 - 2.
        Outputs:
            None
        Raises:
            ValueError: if learning_rule is not a recognized rule name.
        """
        np.random.seed(rand)
        self.learning_rule = learning_rule.lower()
        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'modified':
            # BUGFIX: was assigned to the misspelled attribute
            # `gradietn_function`, leaving gradient_function unset and
            # crashing fit() for the 'modified' rule.
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError(
                "Unknown learning_rule: {0!r}".format(learning_rule))
        self.input_layer_size = input_layer_size
        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)
        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)
        self.epochs = epochs
        self.kernel = kernel
        self.bptt_truncate = bptt_truncate
        # U - weight matrix from input into state layer.
        # W - weight matrix from state layer to state layer.
        # V - weight matrix from state layer to output layer.
        self.U = np.random.uniform(-np.sqrt(1. / input_layer_size),
                                   np.sqrt(1. / input_layer_size),
                                   (state_layer_size, input_layer_size))
        self.V = np.random.uniform(-np.sqrt(1. / state_layer_size),
                                   np.sqrt(1. / state_layer_size),
                                   (output_layer_size, state_layer_size))
        self.W = np.random.uniform(-np.sqrt(1. / state_layer_size),
                                   np.sqrt(1. / state_layer_size),
                                   (state_layer_size, state_layer_size))
        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))
        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def fit(self, X_train, y_train):
        """Train all weights and biases by gradient descent.

        Inputs:
            X_train: list of arrays of size (T, input_layer_size).
            y_train: list of arrays of size (T, output_layer_size).
        Outputs:
            None
        """
        eta = self.eta
        if self.show_progress_bar:
            bar = ProgressBar(max_value=self.epochs)
        for epoch in range(self.epochs):
            for x, y in zip(X_train, y_train):
                dLdU, dLdV, dLdW, dLdOb, dLdSb = self.gradient_function(x, y)
                self.U -= eta * dLdU
                self.V -= eta * dLdV
                self.W -= eta * dLdW
                # BUGFIX: scale bias updates by the learning rate as well,
                # matching the weight updates above.
                self.output_bias -= eta * dLdOb
                self.state_bias -= eta * dLdSb
            if self.show_progress_bar:
                bar.update(epoch)

    def forward_propagation(self, x):
        """
        Inputs:
            x: Expect size (T, input_layer_size), where T is the length of
               time.
        Outputs:
            o: Output-layer activations, (T, output_layer_size).
            s: State-layer activations, (T + 1, state_layer_size).
            s_linear: Pre-activation state values.
            o_linear: Pre-activation output values.
        """
        T = x.shape[0]
        # One extra row so s[-1] (the initial state) is all zeros at t == 0.
        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))
        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)
        for t in np.arange(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1])
                            + state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)
            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear
        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix
            dLdV: Gradient for V matrix
            dLdW: Gradient for W matrix
            dLdOb: Gradient for output layer bias
            dLdSb: Gradient for state layer bias
        TODO - implement this
        """
        raise NotImplementedError

    def bptt(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix.
            dLdV: Gradient for V matrix.
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias.
            dLdSb: Gradient for state layer bias.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        num_dVdW_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer.
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])
            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_dU_additions += 1
            g = np.dot(self.V.T, g)
            # Backpropagation through time for at most bptt_truncate steps.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                x_present = Convert1DTo2D(x[t])
                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_present.T)
                dLdSb += g
                num_dVdW_additions += 1
                # BUGFIX: propagate through the recurrent weights; the old
                # `g = g * np.dot(self.W.T, g)` multiplied g by itself.
                g = np.dot(self.W.T, g)
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def predict(self, X):
        """
        Inputs:
            X: List of arrays of size (T, input_layer_size).
        Outputs:
            List of output activations, one (T, output_layer_size) array
            per input sequence.
        """
        predictions = []
        for x in X:
            o, _, _, _ = self.forward_propagation(x)
            predictions.append(o)
        return predictions

    def score(self, X, Y):
        """
        Inputs:
            X: List of arrays of size (T, input_layer_size).
            Y: List of arrays of size (T, output_layer_size).
        Outputs:
            Mean of per-sequence MSEs.
        """
        predictions = self.predict(X)
        mses = []
        for prediction, y in zip(predictions, Y):
            # BUGFIX: compare each prediction (not the whole list) to y.
            mses.append(np.mean((prediction - y) ** 2))
        return np.mean(mses)
class RNN(object):
    """Recurrent network trainable with BPTT, feedback alignment ('fa'),
    direct feedback alignment ('dfa'), or a modified kernel-convolution
    rule ('modified').

    Weight matrices:
        U - input -> state layer (identity when sizes permit; not trained).
        W - state -> state (recurrent; trained).
        V - state -> output layer (identity when sizes permit; not trained).
        B - fixed random feedback weights, same shape as W.
    """

    def __init__(self, input_layer_size, state_layer_size,
                 state_layer_activation, output_layer_size,
                 output_layer_activation, epochs=100, bptt_truncate=None,
                 learning_rule='bptt', tau=None, eta=0.001, rand=None,
                 verbose=0):
        """
        Inputs:
            input_layer_size: Size of the input vector (X.shape[1]).
            state_layer_size: State (hidden) layer size.
            state_layer_activation: A string. Refer to activation.py.
            output_layer_size: Size of the output vector (Y.shape[1]).
            output_layer_activation: A string. Refer to activation.py.
            epochs (opt): Number of passes over the training data.
            learning_rule (opt): One of 'bptt', 'fa', 'dfa' or 'modified'.
            bptt_truncate (opt): If None, backpropagation through time runs
                over all time steps; otherwise over at most bptt_truncate
                steps. Only considered when learning_rule == 'bptt'.
            tau (opt): Time constant of the kernel convolution used by the
                'modified' rule.
            eta (opt): Learning rate. Initialized to 0.001.
            rand (opt): Random seed. Initialized to None (no random seed).
            verbose (opt): Verbosity: levels 0 - 2.
        Outputs:
            None
        Raises:
            ValueError: if learning_rule is not a recognized rule name.
        """
        np.random.seed(rand)
        self.learning_rule = learning_rule.lower()
        if self.learning_rule == 'bptt':
            self.gradient_function = self.bptt
        elif self.learning_rule == 'fa':
            self.gradient_function = self.feedback_alignment
        elif self.learning_rule == 'dfa':
            self.gradient_function = self.direct_feedback_alignment
        elif self.learning_rule == 'modified':
            self.gradient_function = self.modified_learning_rule
        else:
            raise ValueError(
                "Unknown learning_rule: {0!r}".format(learning_rule))
        self.input_layer_size = input_layer_size
        self.state_layer_size = state_layer_size
        self.state_layer_activation = state_layer_activation
        self.state_activation = Activation(state_layer_activation)
        self.output_layer_size = output_layer_size
        self.output_layer_activation = output_layer_activation
        self.output_activation = Activation(output_layer_activation)
        self.epochs = epochs
        self.tau = tau
        self.bptt_truncate = bptt_truncate
        # Per-time-step kernel convolutions of the state; allocated in
        # forward_propagation when learning_rule == 'modified'.
        self.kernel_convs = None
        # U - weight matrix from input into state layer.
        # W - weight matrix from state layer to state layer.
        # V - weight matrix from state layer to output layer.
        if (state_layer_size == input_layer_size
                and state_layer_size == output_layer_size):
            print("Using identity matrices for U and V")
            self.U = np.eye(state_layer_size)
            self.V = np.eye(state_layer_size)
        else:
            self.U = np.random.uniform(
                1, 2., (state_layer_size, input_layer_size))
            self.V = np.random.uniform(
                1, 2., (output_layer_size, state_layer_size))
        self.W = np.random.uniform(
            -0.5, 0.5, (state_layer_size, state_layer_size))
        self.state_bias = np.zeros((state_layer_size, 1))
        self.output_bias = np.zeros((output_layer_size, 1))
        # B - fixed random feedback weight matrix, same shape as W.
        self.B = np.random.uniform(0., 0.5, self.W.shape)
        self.eta = eta
        self.verbose = verbose
        self.show_progress_bar = verbose > 0

    def kernel_compute(self, t):
        """Exponential kernel exp(-t / tau).

        BUGFIX: force float division; with integer t and tau the old
        `-t / self.tau` truncated under Python 2.
        """
        return np.exp(-float(t) / self.tau)

    def eWBe(self, x, y):
        """Diagnostic: e^T W B e for the output error at each time step,
        collected from the last step backwards."""
        o, s, s_linear, o_linear = self.forward_propagation(x)
        delta_o = o - y
        T = len(x)
        eWBe = []
        for t in reversed(range(T)):
            e = delta_o[t]
            eWBe.append(np.dot(np.dot(np.dot(e.T, self.W), self.B), e))
        return eWBe

    def fit(self, X, y, validation_size=0.1):
        """Train the network, tracking train/validation loss per epoch.

        Inputs:
            X: list of arrays of size (T, input_layer_size).
            y: list of arrays of size (T, output_layer_size).
            validation_size (opt): fraction of the data held out for
                validation.
        Outputs:
            (training_losses, validation_losses): per-epoch loss lists.
        """
        eta = self.eta
        X = np.array(X)
        y = np.array(y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=validation_size, random_state=0)
        if self.verbose:
            print("Validation size: {0}".format(validation_size))
            print("Training on {0} samples".format(len(X_train)))
        training_losses = []
        validation_losses = []
        if self.show_progress_bar:
            bar = ProgressBar(max_value=len(X_train))
        for epoch in range(self.epochs):
            # Losses are recorded before this epoch's updates.
            training_loss = self.score(X_train, y_train)
            validation_loss = self.score(X_test, y_test)
            training_losses.append(training_loss)
            validation_losses.append(validation_loss)
            if self.verbose == 2:
                print("--------")
                print("Epoch {0}/{1}".format(epoch, self.epochs))
                print("Training loss: {0}".format(training_loss))
                print("Validation loss: {0}".format(validation_loss))
                print("--------")
            # BUGFIX: loop variables renamed so they no longer shadow the
            # method's `y` parameter.
            for i, (sample, target) in enumerate(zip(X_train, y_train)):
                dLdU, dLdV, dLdW, dLdOb, dLdSb = self.gradient_function(
                    sample, target)
                # Only W and the state bias are trained; U, V and the
                # output bias stay fixed in this configuration.
                self.W -= eta * dLdW
                self.state_bias -= eta * dLdSb
                if self.show_progress_bar:
                    bar.update(i)
            if self.show_progress_bar:
                # Reset the per-sample bar position for the next epoch.
                bar.update(0)
        return training_losses, validation_losses

    def forward_propagation(self, x):
        """
        Inputs:
            x: Expect size (T, input_layer_size), where T is the length of
               time.
        Outputs:
            o: Output-layer activations, (T, output_layer_size).
            s: State-layer activations, (T + 1, state_layer_size).
            s_linear: Pre-activation state values.
            o_linear: Pre-activation output values.
        """
        if self.learning_rule == 'modified':
            # One convolution column per time step for the modified rule.
            self.kernel_convs = np.zeros((self.state_layer_size, x.shape[0]))
        T = x.shape[0]
        # One extra row so s[-1] (the initial state) is all zeros at t == 0.
        s = np.zeros((T + 1, self.state_layer_size))
        o = np.zeros((T, self.output_layer_size))
        s_linear = np.zeros((T + 1, self.state_layer_size))
        o_linear = np.zeros((T, self.output_layer_size))
        state_bias = Convert2DTo1D(self.state_bias)
        output_bias = Convert2DTo1D(self.output_bias)
        for t in np.arange(T):
            state_linear = (np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1])
                            + state_bias)
            s_linear[t] = state_linear
            s[t] = self.state_activation.activate(state_linear)
            if self.learning_rule == 'modified' and t > 0:
                # BUGFIX: force float division; with an integer tau the old
                # `1 / self.tau` truncated to zero under Python 2.
                alpha = 1.0 / self.tau
                self.kernel_convs[:, t] = (
                    alpha * s[t] + (1 - alpha) * self.kernel_convs[:, t - 1])
            output_linear = np.dot(self.V, s[t]) + output_bias
            o[t] = self.output_activation.activate(output_linear)
            o_linear[t] = output_linear
        return (o, s, s_linear, o_linear)

    def modified_learning_rule(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix (left at zero; U is not trained).
            dLdV: Gradient for V matrix (left at zero; V is not trained).
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias (left at zero).
            dLdSb: Gradient for state layer bias.
        Learning Rule:
            Push the output error back through the fixed random feedback
            weights B and pair it with the kernel-convolved state.
        """
        T = len(y)
        assert T == len(x)
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dW_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Error at the output layer, pushed back through V.
            e = Convert1DTo2D(self.V.T.dot(delta_o[t]))
            # Random-feedback update paired with the kernel-convolved state.
            dLdW += self.B.dot(e).dot(
                Convert1DTo2D(self.kernel_convs[:, t]).T)
            dLdSb += np.dot(self.B, e)
            num_dW_additions += 1
        return [
            dLdU, dLdV, dLdW / num_dW_additions, dLdOb,
            dLdSb / num_dW_additions
        ]

    def direct_feedback_alignment(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix (left at zero).
            dLdV: Gradient for V matrix (left at zero).
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias (left at zero).
            dLdSb: Gradient for state layer bias.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Output error projected back through V once per time step.
            e = Convert1DTo2D(self.V.T.dot(delta_o[t]))
            num_dU_additions += 1
            # Direct feedback: the same error, sent through B, reaches every
            # earlier step instead of being chained step-to-step.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                g = self.B.dot(e)
                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdSb += g
        # Normalize by the sequence length (matches the original code).
        num_dVdW_additions = T
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def feedback_alignment(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix (left at zero).
            dLdV: Gradient for V matrix (left at zero).
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias (left at zero).
            dLdSb: Gradient for state layer bias.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Output error projected back through V.
            g = Convert1DTo2D(self.V.T.dot(delta_o[t]))
            num_dU_additions += 1
            # Backpropagation through time with the fixed random matrix B
            # standing in for W.T.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdSb += g
                g = np.dot(self.B, g)
        # Normalize by the sequence length (matches the original code).
        num_dVdW_additions = T
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    # Non-online version
    def bptt(self, x, y):
        """
        Output:
            dLdU: Gradient for U matrix.
            dLdV: Gradient for V matrix.
            dLdW: Gradient for W matrix.
            dLdOb: Gradient for output layer bias.
            dLdSb: Gradient for state layer bias.
        """
        T = len(y)
        assert T == len(x)
        bptt_truncate = T if self.bptt_truncate is None else self.bptt_truncate
        o, s, s_linear, o_linear = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdOb = np.zeros(self.output_bias.shape)
        dLdSb = np.zeros(self.state_bias.shape)
        num_dU_additions = 0
        delta_o = o - y
        for t in reversed(range(T)):
            # Backprop the error at the output layer.
            g = Convert1DTo2D(delta_o[t])
            o_linear_val = Convert1DTo2D(o_linear[t])
            state_activation = Convert1DTo2D(s[t])
            g = g * self.output_activation.dactivate(o_linear_val)
            dLdV += np.dot(g, state_activation.T)
            dLdOb += g
            num_dU_additions += 1
            g = np.dot(self.V.T, g)
            # Backpropagation through time for at most bptt_truncate steps.
            for bptt_step in reversed(range(max(0, t - bptt_truncate), t + 1)):
                state_linear = Convert1DTo2D(s_linear[bptt_step])
                state_activation_prev = Convert1DTo2D(s[bptt_step - 1])
                x_present = Convert1DTo2D(x[t])
                g = g * self.state_activation.dactivate(state_linear)
                dLdW += np.dot(g, state_activation_prev.T)
                dLdU += np.dot(g, x_present.T)
                dLdSb += g
                g = np.dot(self.W.T, g)
        # Normalize by the sequence length (matches the original code).
        num_dVdW_additions = T
        return [
            dLdU / num_dU_additions, dLdV / num_dVdW_additions,
            dLdW / num_dVdW_additions, dLdOb / num_dU_additions,
            dLdSb / num_dVdW_additions
        ]

    def predict(self, X):
        """
        Inputs:
            X: List of arrays of size (T, input_layer_size).
        Outputs:
            List of output activations, one (T, output_layer_size) array
            per input sequence.
        """
        predictions = []
        for x in X:
            o, _, _, _ = self.forward_propagation(x)
            predictions.append(o)
        return predictions

    def score(self, X, Y):
        """
        Inputs:
            X: List of arrays of size (T, input_layer_size).
            Y: List of arrays of size (T, output_layer_size).
        Outputs:
            Mean of per-sequence MSEs.
        """
        predictions = self.predict(X)
        mses = []
        for prediction, y in zip(predictions, Y):
            mses.append(mean_squared_error(prediction, y))
        return np.mean(mses)