def train(self, X, y, num_passes=1000, lr=0.01, regularization=0.01, to_print=True):
    """Fit the network to ``(X, y)`` with gradient descent on the whole data set.

    Every pass pushes ``X`` through all layers (multiply gate -> add gate ->
    activation), asks the output function for the gradient of the error with
    respect to the last activations, and then walks the layers in reverse,
    updating ``self.W`` and ``self.b`` in place.

    Args:
        X: Training inputs; handed unchanged to the first multiply gate.
        y: Training targets; handed to the output function together with the
            activations of the last layer.
        num_passes: Number of passes (epochs) over ``X``.
        lr: Learning rate, i.e. the step size applied to every gradient.
        regularization: Coefficient of the regularization term added to the
            weight gradients. Biases are not regularized.
        to_print: When true, print the loss every 100 epochs (epoch 0 included).

    Raises:
        ValueError: If ``self.activation_func`` is not ``'sigmoid'`` or
            ``'tanh'``, or ``self.output_func`` is not ``'softmax'`` or
            ``'lse'``.
    """
    # Validate the configuration before doing any work. Without this check an
    # unknown name leaves `layer`/`output` unbound and training fails later
    # with a confusing UnboundLocalError.
    if self.activation_func not in ('sigmoid', 'tanh'):
        raise ValueError("Unknown activation_func: %r" % (self.activation_func,))
    if self.output_func not in ('softmax', 'lse'):
        raise ValueError("Unknown output_func: %r" % (self.output_func,))

    # add gates
    m_Gate = MultiplyGate()
    a_Gate = AddGate()

    # activate nonlinear layer
    layer = Sigmoid() if self.activation_func == 'sigmoid' else Tanh()

    # activate output layer
    output = Softmax() if self.output_func == 'softmax' else LSE()

    # for each epoch
    for epoch in range(num_passes):
        # Forward propagation. `forward` caches (mul, add, activation) for
        # every layer; entry 0 holds the raw input, so entry i belongs to
        # layer i - 1 and entry i - 1 holds that layer's input.
        activations = X
        forward = [(None, None, activations)]
        for i, weights in enumerate(self.W):
            mul = m_Gate.forward(weights, activations)
            add = a_Gate.forward(mul, self.b[i])
            activations = layer.forward(add)
            forward.append((mul, add, activations))

        # Back propagation: start from the derivative of the error with
        # respect to the activations of the last layer.
        dfunc = output.calc_diff(forward[-1][2], y)
        for i in range(len(forward) - 1, 0, -1):
            mul, add, _ = forward[i]
            layer_input = forward[i - 1][2]

            # 1 layer consists of mul, add and layer
            dadd = layer.backward(add, dfunc)
            # dLdb and dLdmul are functions of dLdadd
            db, dmul = a_Gate.backward(mul, self.b[i - 1], dadd)
            # `dfunc` becomes the gradient flowing into the previous layer; it
            # is computed from the weights *before* they are updated below.
            dW, dfunc = m_Gate.backward(self.W[i - 1], layer_input, dmul)

            # Add regularization terms (biases don't have regularization terms)
            dW += regularization * self.W[i - 1]

            # Gradient descent parameter update
            self.b[i - 1] -= lr * db
            self.W[i - 1] -= lr * dW

        if to_print and epoch % 100 == 0:
            print("Loss after iteration %i: %f" % (epoch, self.calculate_loss(X, y)))
def calculate_loss(self, X, y):
    """Return the error of the network on ``(X, y)`` with the current parameters.

    Runs one forward pass through every layer (multiply gate -> add gate ->
    activation) and lets the configured output function score the final
    activations against ``y``.

    Args:
        X: Inputs; handed unchanged to the first multiply gate.
        y: Targets; passed to the output function's ``eval_error``.

    Returns:
        Whatever ``eval_error`` of the configured output function returns.

    Raises:
        ValueError: If ``self.activation_func`` is not ``'sigmoid'`` or
            ``'tanh'``, or ``self.output_func`` is not ``'softmax'`` or
            ``'lse'``.
    """
    # Reject unknown configuration explicitly; otherwise `layer`/`output` stay
    # unbound and the failure surfaces later as an UnboundLocalError.
    if self.activation_func not in ('sigmoid', 'tanh'):
        raise ValueError("Unknown activation_func: %r" % (self.activation_func,))
    if self.output_func not in ('softmax', 'lse'):
        raise ValueError("Unknown output_func: %r" % (self.output_func,))

    m_Gate = MultiplyGate()
    a_Gate = AddGate()
    layer = Sigmoid() if self.activation_func == 'sigmoid' else Tanh()
    output = Softmax() if self.output_func == 'softmax' else LSE()

    activations = X
    # loop through each layer
    for i, weights in enumerate(self.W):
        # X*W
        mul = m_Gate.forward(weights, activations)
        # X*W + b
        add = a_Gate.forward(mul, self.b[i])
        # nonlinear activation
        activations = layer.forward(add)

    return output.eval_error(activations, y)
def predict(self, X):
    """Predict an output for every sample in ``X``.

    Runs one forward pass through every layer (multiply gate -> add gate ->
    activation) and converts the final activations into predictions.

    Args:
        X: Inputs; handed unchanged to the first multiply gate.

    Returns:
        For ``output_func == 'softmax'``: the index of the largest value along
        axis 1 of ``Softmax().eval(...)`` applied to the final activations.
        For ``output_func == 'lse'``: an array holding 1 where the final
        activation is greater than 0.5 and 0 elsewhere.

    Raises:
        ValueError: If ``self.activation_func`` is not ``'sigmoid'`` or
            ``'tanh'``, or ``self.output_func`` is not ``'softmax'`` or
            ``'lse'``.
    """
    # Reject unknown configuration explicitly. Previously an unknown
    # activation left `layer` unbound (UnboundLocalError in the loop) and an
    # unknown output function made this method silently return None.
    if self.activation_func not in ('sigmoid', 'tanh'):
        raise ValueError("Unknown activation_func: %r" % (self.activation_func,))
    if self.output_func not in ('softmax', 'lse'):
        raise ValueError("Unknown output_func: %r" % (self.output_func,))

    m_Gate = MultiplyGate()
    a_Gate = AddGate()
    layer = Sigmoid() if self.activation_func == 'sigmoid' else Tanh()

    activations = X
    for i, weights in enumerate(self.W):
        mul = m_Gate.forward(weights, activations)
        add = a_Gate.forward(mul, self.b[i])
        activations = layer.forward(add)

    if self.output_func == 'softmax':
        probs = Softmax().eval(activations)
        return np.argmax(probs, axis=1)

    # 'lse': threshold the activations directly; no output object is needed,
    # so none is created.
    return (np.greater(activations, 0.5)) * 1
def calculate_loss(self, X, y):
    """Return the softmax loss of the tanh network on ``(X, y)``.

    ``X`` is pushed through every layer as multiply gate -> add gate -> tanh,
    and the final activations are scored against ``y`` by ``Softmax.loss``.
    """
    multiply = MultiplyGate()
    add_bias = AddGate()
    tanh_layer = Tanh()
    softmax = Softmax()

    activations = X
    for idx, weights in enumerate(self.W):
        weighted = multiply.forward(weights, activations)
        shifted = add_bias.forward(weighted, self.b[idx])
        activations = tanh_layer.forward(shifted)

    return softmax.loss(activations, y)
def predict(self, X):
    """Return, per sample, the index of the largest softmax output.

    ``X`` is pushed through every layer as multiply gate -> add gate -> tanh;
    ``Softmax.predict`` turns the final activations into scores and the
    arg-max along axis 1 is returned.
    """
    multiply = MultiplyGate()
    add_bias = AddGate()
    tanh_layer = Tanh()
    softmax = Softmax()

    activations = X
    for idx, weights in enumerate(self.W):
        weighted = multiply.forward(weights, activations)
        shifted = add_bias.forward(weighted, self.b[idx])
        activations = tanh_layer.forward(shifted)

    scores = softmax.predict(activations)
    return np.argmax(scores, axis=1)
def train(self, X, y, num_passes=20000, epsilon=0.01, reg_lambda=0.01, print_loss=False):
    """Train the tanh/softmax network with gradient descent on the whole data set.

    Each pass runs every layer forward (multiply gate -> add gate -> tanh),
    takes the gradient of the softmax output with respect to the last
    activations, back-propagates it through the layers while updating
    ``self.W`` and ``self.b`` in place, and hands the current state to
    ``nn_log_instance``.

    Args:
        X: Training inputs; handed unchanged to the first multiply gate.
        y: Training targets; passed to ``Softmax.diff``.
        num_passes: Number of passes (epochs) over ``X``.
        epsilon: Learning rate, i.e. the step size applied to every gradient.
        reg_lambda: Coefficient of the regularization term added to the
            weight gradients. Biases are not regularized.
        print_loss: When true, print the loss every 1000 epochs.
    """
    mulGate = MultiplyGate()
    addGate = AddGate()
    layer = Tanh()
    softmaxOutput = Softmax()

    for epoch in range(num_passes):
        # Forward propagation
        # `forward` caches (mul, add, activation) per layer; entry 0 holds the
        # raw input, so entry i belongs to layer i - 1.
        input = X
        forward = [(None, None, input)]
        for i in range(len(self.W)):
            mul = mulGate.forward(self.W[i], input)
            add = addGate.forward(mul, self.b[i])
            input = layer.forward(add)
            forward.append((mul, add, input))

        # Back propagation
        # `dtanh` starts as the gradient at the output and is replaced, layer
        # by layer, with the gradient flowing into the previous layer.
        dtanh = softmaxOutput.diff(forward[len(forward) - 1][2], y)
        for i in range(len(forward) - 1, 0, -1):
            dadd = layer.backward(forward[i][1], dtanh)
            db, dmul = addGate.backward(forward[i][0], self.b[i - 1], dadd)
            dW, dtanh = mulGate.backward(self.W[i - 1], forward[i - 1][2], dmul)
            # Add regularization terms (b1 and b2 don't have regularization terms)
            dW += reg_lambda * self.W[i - 1]
            # Gradient descent parameter update
            self.b[i - 1] += -epsilon * db
            self.W[i - 1] += -epsilon * dW

        # write log
        # `nn_log_instance` is not defined in this function; it has to be
        # available at module scope.
        # NOTE(review): the original indentation of this logging section was
        # lost; it is assumed to run once per epoch, after all layers have been
        # updated - confirm against the original file.
        nn_log_instance.w = self.W
        nn_log_instance.b = self.b
        nn_log_instance.forward = forward
        nn_log_instance.write_log()

        if print_loss and epoch % 1000 == 0:
            print("Loss after iteration %i: %f" % (epoch, self.calculate_loss(X, y)))
from activation import Tanh
from gate import AddGate, MultiplyGate

# Module-level gates shared by every RNNLayer instance.
mulgate = MultiplyGate()
addgate = AddGate()
tanh = Tanh()


class RNNLayer:
    """One time step of a recurrent layer built from multiply/add/tanh gates.

    ``forward`` stores every intermediate value on the instance so that
    ``backward`` can reuse them when propagating gradients.
    """

    def forward(self, x, prev_a, waa, wax, wya):
        """Run the step forward and cache the intermediates on ``self``.

        Args:
            x: Input of this time step.
            prev_a: Hidden activation of the previous time step.
            waa: Weights applied to ``prev_a``.
            wax: Weights applied to ``x``.
            wya: Weights applied to the new hidden activation.

        Side effects:
            Sets ``self.mulax``, ``self.mulaa``, ``self.add``, ``self.a``
            (the new hidden activation) and ``self.mulya`` (the output before
            any output function is applied).
        """
        self.mulax = mulgate.forward(wax, x)
        self.mulaa = mulgate.forward(waa, prev_a)
        self.add = addgate.forward(self.mulax, self.mulaa)
        self.a = tanh.forward(self.add)
        # Use the activation stored on the instance; the bare name `a` used
        # here originally was never defined and raised NameError.
        self.mulya = mulgate.forward(wya, self.a)

    # Backward-compatible alias: the method was originally defined under this
    # misspelled name, while `backward` has always called `self.forward`.
    foward = forward

    def backward(self, x, prev_a, waa, wax, wya, diff_a, dmulya):
        """Back-propagate through this time step.

        The forward pass is recomputed first so the cached intermediates match
        the arguments given here.

        Author's notes kept from the original comments:
            dmulya = y^t - yt
            dV = (y^t - yt) * at

        Args:
            x: Input of this time step.
            prev_a: Hidden activation of the previous time step.
            waa: Weights applied to ``prev_a``.
            wax: Weights applied to ``x``.
            wya: Weights applied to the hidden activation.
            diff_a: Gradient reaching the hidden activation from later time
                steps; added to the gradient coming from this step's output.
            dmulya: Gradient of the loss with respect to ``self.mulya``.

        Returns:
            Tuple ``(dprev_a, dwax, dwaa, dwya)``: gradients with respect to
            the previous hidden activation and the three weight arguments.
        """
        self.forward(x, prev_a, waa, wax, wya)

        # Output branch: gradient for wya and for the hidden activation.
        dwya, dav = mulgate.backward(wya, self.a, dmulya)
        # Total gradient at the hidden activation = output branch + future steps.
        da = dav + diff_a

        # Through tanh and the add gate; the add gate returns one gradient per
        # forward operand, in the same order (mulax, mulaa).
        dadd = tanh.backward(self.add, da)
        dmulax, dmulaa = addgate.backward(self.mulax, self.mulaa, dadd)

        # Through the two multiply gates. The gradient with respect to x is
        # not part of the result.
        dwaa, dprev_a = mulgate.backward(waa, prev_a, dmulaa)
        dwax, _dx = mulgate.backward(wax, x, dmulax)

        return (dprev_a, dwax, dwaa, dwya)
from activation import Tanh
from gate import AddGate, MultiplyGate
import numpy as np

# Module-level gates shared by every RNNLayer instance.
mulGate = MultiplyGate()
addGate = AddGate()
activation = Tanh()


class RNNLayer:
    """One time step of a recurrent layer built from multiply/add/tanh gates."""

    def forward(self, x, prev_s, U, W, V):
        """Run the step forward, caching every intermediate on ``self``.

        Sets ``self.mulu`` (gate product of ``U`` and ``x``), ``self.mulw``
        (gate product of ``W`` and ``prev_s``), ``self.add`` (their sum),
        ``self.s`` (the activated state) and ``self.mulv`` (gate product of
        ``V`` and ``self.s``).
        """
        self.mulu = mulGate.forward(U, x)
        self.mulw = mulGate.forward(W, prev_s)
        self.add = addGate.forward(self.mulw, self.mulu)
        self.s = activation.forward(self.add)
        self.mulv = mulGate.forward(V, self.s)

    def backward(self, x, prev_s, U, W, V, diff_s, dmulv, forward=True):
        """Back-propagate through this step.

        When ``forward`` is true the forward pass is recomputed first so the
        cached intermediates match the given arguments. ``diff_s`` is added to
        the state gradient coming from the output branch, and ``dmulv`` is the
        gradient with respect to ``self.mulv``.

        Returns the tuple ``(dprev_s, dU, dW, dV)``.
        """
        if forward:
            self.forward(x, prev_s, U, W, V)

        # Output branch: gradient for V and for the state.
        grad_V, grad_s_out = mulGate.backward(V, self.s, dmulv)
        grad_s = grad_s_out + diff_s

        # Through the activation and the add gate (operand order: mulw, mulu).
        grad_add = activation.backward(self.add, grad_s)
        grad_mulw, grad_mulu = addGate.backward(self.mulw, self.mulu, grad_add)

        # Through the two multiply gates; the input gradient is not returned.
        grad_W, grad_prev_s = mulGate.backward(W, prev_s, grad_mulw)
        grad_U, _grad_x = mulGate.backward(U, x, grad_mulu)

        return grad_prev_s, grad_U, grad_W, grad_V

    def backward1(self, x, prev_s, U, W, V, delta1, dmulv, forward=True):
        """Optionally recompute the forward pass for the given arguments.

        NOTE(review): the visible body ends after the optional forward pass
        and never uses ``delta1`` or ``dmulv``; the rest of this method may be
        missing - confirm against the full file.
        """
        if forward:
            self.forward(x, prev_s, U, W, V)