def bptt(self, x, y):
    """Backpropagation through time, truncated at ``self.bptt_truncate`` steps.

    For every time step t, the output-layer error is pushed backwards
    through at most ``bptt_truncate`` earlier steps; the per-step
    gradients are summed into the running totals for U, W and V.

    Returns a tuple ``(dU, dW, dV)`` of gradients w.r.t. the weight
    matrices U, W and V.
    """
    assert len(x) == len(y)
    softmax = Softmax()
    layers = self.forward_propagation(x)
    T = len(layers)
    # Gradient accumulators, one per weight matrix.
    dU = np.zeros(self.U.shape)
    dV = np.zeros(self.V.shape)
    dW = np.zeros(self.W.shape)
    prev_s_t = np.zeros(self.hidden_dim)
    diff_s = np.zeros(self.hidden_dim)
    for t in range(T):
        # Output-layer error at step t.
        dmulv = softmax.diff(layers[t].mulv, y[t])
        one_hot = np.zeros(self.word_dim)
        one_hot[x[t]] = 1
        dprev_s, dU_t, dW_t, dV_t = layers[t].backward(
            one_hot, prev_s_t, self.U, self.W, self.V, diff_s, dmulv)
        prev_s_t = layers[t].s
        # Earlier steps get no direct output error — only the recurrent
        # error carried backwards in dprev_s.
        dmulv = np.zeros(self.word_dim)
        for i in range(t - 1, max(-1, t - self.bptt_truncate - 1), -1):
            one_hot = np.zeros(self.word_dim)
            one_hot[x[i]] = 1
            prev_s_i = np.zeros(self.hidden_dim) if i == 0 else layers[i - 1].s
            dprev_s, dU_i, dW_i, dV_i = layers[i].backward(
                one_hot, prev_s_i, self.U, self.W, self.V, dprev_s, dmulv)
            dU_t += dU_i
            dW_t += dW_i
        dV += dV_t
        dU += dU_t
        dW += dW_t
    return (dU, dW, dV)
def bptt(self, x, y):
    """Full backpropagation through time (no truncation).

    Walks the unrolled network from the last time step back to the
    first, threading the recurrent hidden-state error ``delta`` from
    each step into the one before it, and accumulating the gradients
    of the weight matrices.

    Parameters
    ----------
    x : sequence of int
        Input word indices, one per time step.
    y : sequence of int
        Target word indices; must have the same length as ``x``.

    Returns
    -------
    tuple
        ``(dU, dW, dV)`` — gradients w.r.t. U, W and V.
    """
    assert len(x) == len(y)
    output = Softmax()
    layers = self.forward_propagation(x)
    dU = np.zeros(self.U.shape)
    dV = np.zeros(self.V.shape)
    dW = np.zeros(self.W.shape)
    T = len(layers)
    # Recurrent error flowing backwards through the hidden state;
    # starts at zero beyond the last time step.
    delta = np.zeros(self.hidden_dim)
    # Iterate time steps in reverse: T-1, T-2, ..., 0.
    for t in reversed(range(T)):
        # One-hot encoding of the input word at step t.
        one_hot = np.zeros(self.word_dim)
        one_hot[x[t]] = 1
        # Hidden state fed into step t (zeros before the first step).
        prev_s_t = np.zeros(self.hidden_dim) if t == 0 else layers[t - 1].s
        # Output-layer error at step t.
        dmulv = output.diff(layers[t].mulv, y[t])
        delta, dU_t, dW_t, dV_t = layers[t].backward1(
            one_hot, prev_s_t, self.U, self.W, self.V, delta, dmulv)
        dV += dV_t
        dU += dU_t
        dW += dW_t
    return (dU, dW, dV)
def train(self, X, y, num_passes=20000, epsilon=0.01, reg_lambda=0.01, print_loss=False):
    """Train the layered network on (X, y) with batch gradient descent.

    Parameters
    ----------
    X : input data fed through the whole network each epoch.
    y : target labels for the softmax output.
    num_passes : int
        Number of full passes (epochs) over the data.
    epsilon : float
        Learning rate used for both weights and biases.
    reg_lambda : float
        L2 regularization strength; applied to weights only, the
        biases carry no regularization term.
    print_loss : bool
        When True, report the loss every 1000 epochs.
    """
    mul_gate = MultiplyGate()
    add_gate = AddGate()
    activation = Tanh()
    softmax_out = Softmax()
    for epoch in range(num_passes):
        # Forward pass: record (mul, add, activation-output) per layer.
        current = X
        activations = [(None, None, current)]
        for i, W_i in enumerate(self.W):
            mul = mul_gate.forward(W_i, current)
            add = add_gate.forward(mul, self.b[i])
            current = activation.forward(add)
            activations.append((mul, add, current))
        # Backward pass with immediate parameter updates per layer.
        dtanh = softmax_out.diff(activations[-1][2], y)
        for i in range(len(activations) - 1, 0, -1):
            dadd = activation.backward(activations[i][1], dtanh)
            db, dmul = add_gate.backward(activations[i][0], self.b[i - 1], dadd)
            dW, dtanh = mul_gate.backward(self.W[i - 1], activations[i - 1][2], dmul)
            # Regularize the weight gradient (biases are not regularized).
            dW += reg_lambda * self.W[i - 1]
            # Gradient-descent step.
            self.b[i - 1] += -epsilon * db
            self.W[i - 1] += -epsilon * dW
        # Persist a snapshot of this epoch's parameters and activations.
        nn_log_instance.w = self.W
        nn_log_instance.b = self.b
        nn_log_instance.forward = activations
        nn_log_instance.write_log()
        if print_loss and epoch % 1000 == 0:
            print("Loss after iteration %i: %f" % (epoch, self.calculate_loss(X, y)))