import numpy as np


def compute_grad_by_params(w, b):
    # With plain gradient descent and lr=1, the update applied inside
    # backward() equals the gradient, so the deltas below recover
    # dL/dW and dL/db for loss = forward(x).sum().
    layer = Dense(32, 64)
    layer.weights = np.array(w)
    layer.biases = np.array(b)
    x = np.linspace(-1, 1, 10 * 32).reshape([10, 32])
    layer.backward(x, np.ones([10, 64]), optim='gd', lr=1)
    return w - layer.weights, b - layer.biases
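# A quick sanity check of these deltas, assuming Dense implements the usual
# y = x @ W + b with weights of shape (32, 64): for loss = forward(x).sum(),
# the gradients have the closed form dL/dW = x.T @ ones and dL/db = sum over
# the batch. This sketch is illustrative, not part of the original tests.
w0 = np.zeros((32, 64))
b0 = np.zeros(64)
grad_w, grad_b = compute_grad_by_params(w0, b0)
x = np.linspace(-1, 1, 10 * 32).reshape([10, 32])
assert np.allclose(grad_w, x.T @ np.ones([10, 64]))
assert np.allclose(grad_b, np.ones([10, 64]).sum(axis=0))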
class MnistNetMiniBatch:
    """784 -> 100 -> 50 -> 10 MLP with ReLU, dropout and a softmax output."""

    def __init__(self):
        self.d1_layer = Dense(784, 100)
        self.a1_layer = ReLu()
        self.drop1_layer = Dropout(0.5)

        self.d2_layer = Dense(100, 50)
        self.a2_layer = ReLu()
        self.drop2_layer = Dropout(0.25)

        self.d3_layer = Dense(50, 10)
        self.a3_layer = Softmax()

    def forward(self, x, train=True):
        # Dropout is only active when train=True.
        net = self.d1_layer.forward(x)
        net = self.a1_layer.forward(net)
        net = self.drop1_layer.forward(net, train)

        net = self.d2_layer.forward(net)
        net = self.a2_layer.forward(net)
        net = self.drop2_layer.forward(net, train)

        net = self.d3_layer.forward(net)
        net = self.a3_layer.forward(net)
        return net

    def backward(self, dz, learning_rate=0.01, mini_batch=True,
                 update=False, len_mini_batch=None):
        # Walk the layers in reverse; Dense layers accumulate gradients
        # across the mini-batch and apply them only when update=True.
        dz = self.a3_layer.backward(dz)
        dz = self.d3_layer.backward(dz, learning_rate=learning_rate,
                                    mini_batch=mini_batch, update=update,
                                    len_mini_batch=len_mini_batch)
        dz = self.drop2_layer.backward(dz)
        dz = self.a2_layer.backward(dz)
        dz = self.d2_layer.backward(dz, learning_rate=learning_rate,
                                    mini_batch=mini_batch, update=update,
                                    len_mini_batch=len_mini_batch)
        dz = self.drop1_layer.backward(dz)
        dz = self.a1_layer.backward(dz)
        dz = self.d1_layer.backward(dz, learning_rate=learning_rate,
                                    mini_batch=mini_batch, update=update,
                                    len_mini_batch=len_mini_batch)
        return dz
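# A usage sketch for the update/len_mini_batch flags: accumulate per-sample
# gradients and trigger the averaged weight update on the last sample of each
# batch. `batches` is a hypothetical iterable of (inputs, one-hot labels);
# the exact accumulation semantics depend on this repo's Dense.backward, and
# dz here assumes a cross-entropy loss on the softmax output (dL/dp = -y/p).
net = MnistNetMiniBatch()
for x_batch, y_batch in batches:
    for i in range(len(x_batch)):
        probs = net.forward(x_batch[i:i + 1], train=True)
        dz = -y_batch[i:i + 1] / (probs + 1e-12)
        net.backward(dz, learning_rate=0.01, mini_batch=True,
                     update=(i == len(x_batch) - 1),
                     len_mini_batch=len(x_batch))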
import copy


class Sampler:
    """Reparameterization layer for a VAE: z = mean + exp(logVar / 2) * eps."""

    def __init__(self, inputDim=1, outputDim=1, optimizer=Adam()):
        self.inputDim = inputDim
        self.outputDim = outputDim
        self.mean = Dense(self.inputDim, self.outputDim, activation=Identity(),
                          optimizer=copy.copy(optimizer))
        self.logVar = Dense(self.inputDim, self.outputDim, activation=Identity(),
                            optimizer=copy.copy(optimizer))

    def feedforward(self, input):
        self.latentMean = self.mean.feedforward(input)
        self.latentLogVar = self.logVar.feedforward(input)
        # Sample eps ~ N(0, I); columns are the batch dimension.
        self.epsilon = np.random.standard_normal(
            size=(self.outputDim, input.shape[1]))
        self.sample = self.latentMean + np.exp(self.latentLogVar / 2.) * self.epsilon
        return self.sample

    def backpropagate(self, lastGradient):
        gradLogVar = {}
        gradMean = {}
        tmp = self.outputDim * lastGradient.shape[1]
        # KL divergence gradients
        gradLogVar["KL"] = (np.exp(self.latentLogVar) - 1) / (2 * tmp)
        gradMean["KL"] = self.latentMean / tmp
        # MSE gradients (chain rule through the reparameterization)
        gradLogVar["MSE"] = 0.5 * lastGradient * self.epsilon * np.exp(
            self.latentLogVar / 2.)
        gradMean["MSE"] = lastGradient
        # backpropagate gradients through self.mean and self.logVar
        return self.mean.backward(gradMean["KL"] + gradMean["MSE"]) + \
            self.logVar.backward(gradLogVar["KL"] + gradLogVar["MSE"])

    def getKLDivergence(self, output):
        # output.shape[1] == batchSize
        return -np.sum(1 + self.latentLogVar - np.square(self.latentMean)
                       - np.exp(self.latentLogVar)) / (2 * self.outputDim * output.shape[1])
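# A usage sketch: the Sampler sits between a VAE's encoder and decoder.
# Batches are column-major (features x batchSize), matching feedforward above;
# h below is a hypothetical stand-in for the encoder's output activations.
sampler = Sampler(inputDim=64, outputDim=2, optimizer=Adam())
h = np.random.randn(64, 16)       # hidden activations for a batch of 16
z = sampler.feedforward(h)        # z.shape == (2, 16): mean + exp(logVar/2) * eps
kl = sampler.getKLDivergence(z)   # scalar KL(N(mean, var) || N(0, I)), batch-averaged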
def test_dense_layer_NUMERICAL_GRADIENT_CHECK(self):
    x = np.linspace(-1, 1, 10 * 32).reshape([10, 32])
    l = Dense(32, 64)
    numeric_grads = eval_numerical_gradient(lambda x: l.forward(x).sum(), x)
    # lr=0 leaves the parameters untouched; backward() returns dL/dx.
    grads = l.backward(x, np.ones([10, 64]), optim='gd', lr=0)
    self.assertTrue(np.allclose(grads, numeric_grads, rtol=1e-5, atol=0),
                    msg="input gradient does not match numeric grad")
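# The same style of check works for parameter gradients by pairing
# eval_numerical_gradient with compute_grad_by_params from above. This test is
# a sketch, assuming eval_numerical_gradient perturbs whichever array it is
# handed and passes it to the given function:
def test_dense_layer_PARAM_GRADIENT_CHECK(self):
    x = np.linspace(-1, 1, 10 * 32).reshape([10, 32])
    w = np.random.randn(32, 64) * 0.1
    b = np.zeros(64)
    grad_w, grad_b = compute_grad_by_params(w, b)

    def loss_given_w(w_var):
        layer = Dense(32, 64)
        layer.weights = np.array(w_var)
        layer.biases = np.array(b)
        return layer.forward(x).sum()

    numeric_w = eval_numerical_gradient(loss_given_w, w)
    self.assertTrue(np.allclose(grad_w, numeric_w, rtol=1e-4, atol=1e-7),
                    msg="weight gradient does not match numeric grad")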
# Forward pass through the two-layer network for this batch.
layer1.forward(trainingData[batch])
activation1.forward(layer1.outputs)
layer2.forward(activation1.outputs)
activation2.forward(layer2.outputs)
cost.forward(activation2.outputs, labels[batch], 10)

# Count correct predictions (samples are columns of the output matrix).
for sample in range(activation2.outputs.shape[1]):
    if np.argmax(activation2.outputs[:, sample]) == np.argmax(
            labels[batch, sample]):
        correct += 1

# Backward pass: each object stores its local derivative in .prime.
cost.backward(activation2.outputs, labels[batch], 10)
activation2.backward(layer2.outputs, layer2.weights.shape[0], BATCH_SIZE)
layer2.backward(activation1.outputs)
activation1.backward(layer1.outputs)
layer1.backward(trainingData[batch])

# delta1[i]: cost gradient times the output-activation Jacobian, per sample.
delta1 = np.zeros((cost.prime.shape[0], cost.prime.shape[1]))
for i in range(cost.prime.shape[0]):
    delta1[i] = np.matmul(cost.prime[i], activation2.prime[i])
delta1_wrt_L2 = np.matmul(delta1, layer2.input_prime)

# delta2: propagate through the hidden activation's per-sample Jacobians.
delta2 = np.zeros((activation1.prime.shape[0], activation1.prime.shape[2]))
for i in range(activation1.prime.shape[2]):
    delta2[:, i] = np.matmul(delta1_wrt_L2[i], activation1.prime[:, :, i])

C_wrt_W2 = np.zeros(
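# The per-sample loops above are batched matrix products, which np.einsum can
# express without Python loops. A sketch under the shapes implied by the code
# (cost.prime: (B, C), activation2.prime: (B, C, C), layer2.input_prime: (C, H),
# activation1.prime: (H, H, B)); equivalent to the loops, only vectorized:
delta1 = np.einsum('bi,bij->bj', cost.prime, activation2.prime)
delta1_wrt_L2 = delta1 @ layer2.input_prime
delta2 = np.einsum('bi,ijb->jb', delta1_wrt_L2, activation1.prime)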
# Forward pass: (z1 from the first dense layer) -> Swish -> Dense -> Swish
sw1 = swish1.forward(z1)
sw2 = dense2.forward(sw1)
y_pred = swish2.forward(sw2)

# Backward pass: chain the loss gradient back through each layer.
d1 = loss_fn.gradient(y_true, y_pred)
d2 = swish2.backward(d1)   # dL/d(sw2)
d3 = dense2.backward(d2)   # dL/d(sw1)
d4 = swish1.backward(d3)   # dL/d(z1)
d5 = dense.backward(d4)    # gradient through the first dense layer
print(d2)
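# The swish1/swish2 objects above are assumed to cache their activations on
# forward so that backward can apply the chain rule. A minimal self-contained
# version of such a layer (illustrative, not the source's implementation):
class Swish:
    # swish(x) = x * sigmoid(x)
    def forward(self, x):
        self.sig = 1.0 / (1.0 + np.exp(-x))
        self.out = x * self.sig
        return self.out

    def backward(self, grad_out):
        # swish'(x) = swish(x) + sigmoid(x) * (1 - swish(x))
        return grad_out * (self.out + self.sig * (1.0 - self.out))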
# Dense -> Activation -> Dense -> Activation -> y_pred
z1 = dense.forward(x)
a1 = activation1.forward(z1)
print("Activation Value:", a1)
z2 = dense2.forward(a1)
a2 = activation2.forward(z2)
y_pred = a2

loss = loss_func.loss(y_true, y_pred)
print("Individual Loss:", loss)
total_loss = np.mean(loss)
print("Total Loss:", total_loss)

# Backward propagation
dLdy_pred = loss_func.gradient(y_true, y_pred)
print("dLdy:", dLdy_pred)
# Equivalent to: dydz = activation2.gradient(z2); dLdz2 = dLdy_pred * dydz
dLdz2 = activation2.backward(dLdy_pred)
dLda1 = dense2.backward(dLdz2)
dLdz1 = activation1.backward(dLda1)
dLdw = dense.backward(dLdz1)
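# The identity dLdz2 = dLdy_pred * dydz is just the element-wise chain rule
# for a pointwise activation. A standalone numeric check with fresh toy values
# (independent of the variables above), using sigmoid and MSE as stand-ins
# for activation2 and loss_func:
sig = lambda z: 1.0 / (1.0 + np.exp(-z))
z_toy = np.array([0.5, -1.0, 2.0])
t_toy = np.array([1.0, 0.0, 1.0])

p_toy = sig(z_toy)
dLdp = 2 * (p_toy - t_toy) / p_toy.size   # MSE gradient dL/dy_pred
dpdz = p_toy * (1 - p_toy)                # sigmoid'(z)
dLdz = dLdp * dpdz                        # element-wise chain rule

# Finite-difference check on one component of z_toy.
eps = 1e-6
z_plus, z_minus = z_toy.copy(), z_toy.copy()
z_plus[0] += eps
z_minus[0] -= eps
mse = lambda z: np.mean((sig(z) - t_toy) ** 2)
numeric = (mse(z_plus) - mse(z_minus)) / (2 * eps)
assert np.isclose(dLdz[0], numeric, rtol=1e-4)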