def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    s = sigmoid(np.dot(outputVectors[target, :], predicted))
    cost = -np.log(s)
    # Since sigmoid_grad(s) = s * (1 - s), sigmoid_grad(s) / s == 1 - s,
    # so these are the usual (sigma - 1) negative-sampling gradients.
    gradPred = -sigmoid_grad(s) / s * outputVectors[target, :]
    grad[target, :] = -sigmoid_grad(s) / s * predicted
    for k in range(K):
        i = dataset.sampleTokenIdx()
        s = sigmoid(-np.dot(outputVectors[i, :], predicted))
        cost -= np.log(s)
        gradPred += sigmoid_grad(s) / s * outputVectors[i, :]
        grad[i, :] += sigmoid_grad(s) / s * predicted
    ### END YOUR CODE

    return cost, gradPred, grad

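# The snippets in this file call sigmoid and sigmoid_grad without defining
# them. A minimal sketch of those helpers, assuming the usual q2_sigmoid.py
# convention that sigmoid_grad takes s = sigmoid(x), not x itself:
import numpy as np

def sigmoid(x):
    """Elementwise logistic function."""
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(s):
    """Gradient of the sigmoid, given s = sigmoid(x) (not x)."""
    return s * (1.0 - s)
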
def test_sigmoid_gradient(dim_1, dim_2):
    a1 = np.random.normal(loc=0., scale=20., size=(dim_1, dim_2))
    shift = np.random.uniform(low=1e-9, high=1e-5, size=(dim_1, dim_2))
    ap = a1 + shift
    am = a1 - shift
    dsigmoid = (sigmoid(ap) - sigmoid(am)) / (2 * shift)
    assert np.abs(np.max(dsigmoid - sigmoid_grad(sigmoid(a1)))) <= 1e-7
    assert np.abs(np.min(dsigmoid - sigmoid_grad(sigmoid(a1)))) <= 1e-7

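# Hypothetical invocation of the finite-difference check above; the seed and
# shapes are illustrative, not part of the original test:
np.random.seed(42)
for dims in [(1, 1), (3, 4), (10, 10)]:
    test_sigmoid_gradient(*dims)
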
def test_sigmoidgrad():
    """ Original sigmoid gradient test defined in q2_sigmoid.py """
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    assert rel_error(g, np.array([[0.19661193, 0.10499359],
                                  [0.19661193, 0.10499359]])) <= 1e-7

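# The tests also assume a rel_error helper. A common definition is sketched
# below; the exact one in the original repo may differ slightly:
def rel_error(x, y):
    """Maximum relative error between arrays x and y."""
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))
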
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden = sigmoid(data.dot(W1) + b1)
    prediction = softmax(hidden.dot(W2) + b2)
    cost = -np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = prediction - labels
    gradW2 = hidden.T.dot(delta)
    gradb2 = np.sum(delta, axis=0)
    hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(hidden_delta)
    gradb1 = np.sum(hidden_delta, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data: N x Dx, W1: Dx x H, b1: 1 x H
    a = data.dot(W1) + b1
    h = sigmoid(a)
    # h: N x H, W2: H x Dy, b2: 1 x Dy
    t = h.dot(W2) + b2
    y_hat = softmax(t)
    # y_hat: N x Dy, labels: N x Dy (one-hot)
    probs = labels * y_hat
    cost = np.sum(-np.log(probs.sum(axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # obtain the softmax gradient
    dJdt = (y_hat - labels)  # N x Dy
    # b2 grad is a sum over the N rows of dJdt
    gradb2 = np.sum(dJdt, 0)
    # h: N x H, dJdt: N x Dy
    gradW2 = h.T.dot(dJdt)  # H x Dy
    # dJdt: N x Dy, W2: H x Dy
    dJdh = dJdt.dot(W2.T)
    # h: N x H
    dhda = sigmoid_grad(h)
    # data: N x Dx, dhda: N x H, dJdh: N x H
    gradW1 = data.T.dot(dhda * dJdh)
    gradb1 = np.sum(dhda * dJdh, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(data.dot(W1) + b1)
    y = softmax(h.dot(W2) + b2)
    cost = -np.sum(labels * np.log(y))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradW1 = data.T.dot((y - labels).dot(W2.T) * sigmoid_grad(h))
    gradW2 = h.T.dot(y - labels)
    gradb1 = np.sum((y - labels).dot(W2.T) * sigmoid_grad(h), axis=0)
    gradb2 = np.sum(y - labels, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def test_sigmoid(dim_1, dim_2):
    a1 = np.random.normal(loc=0., scale=20., size=(dim_1, dim_2))
    a1_copy = a1.copy()
    s_a1 = sigmoid(a1)
    s_sol_a1 = sigmoid_sol(a1_copy)
    assert rel_error(sigmoid_grad(s_a1), sigmoid_grad_sol(s_sol_a1)) <= 1e-10

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: this variant averages the cost over the M examples; the
    # gradients below carry the same 1/M factor, so the two stay consistent.
    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(X, W1) + b1)     # [M, H]
    yhat = softmax(np.dot(h, W2) + b2)  # [M, Dy]
    cost = np.sum(-np.log(yhat[labels == 1])) / X.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d3 = (yhat - labels) / X.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, 0, keepdims=True)
    dh = np.dot(d3, W2.T)
    grad_h = sigmoid_grad(h) * dh
    gradW1 = np.dot(X.T, grad_h)
    gradb1 = np.sum(grad_h, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(X, W1) + b1  # M*H
    h = sigmoid(z1)          # M*H
    z2 = np.dot(h, W2) + b2  # M*Dy
    Y = softmax(z2)          # M*Dy
    cost = np.sum(-labels * np.log(Y))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta1 = Y - labels                        # M*Dy
    gradb2 = np.sum(delta1, 0, keepdims=True)  # 1*Dy
    gradW2 = np.dot(h.T, delta1)               # H*Dy
    delta2 = np.dot(delta1, W2.T)              # M*H
    # Take care! The argument of sigmoid_grad is the sigmoid value itself!
    delta3 = np.multiply(delta2, sigmoid_grad(h))  # M*H
    gradW1 = np.dot(X.T, delta3)               # Dx*H
    gradb1 = np.sum(delta3, 0, keepdims=True)  # 1*H
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_hat = softmax(z2)
    cost = -np.sum(labels * np.log(y_hat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # d1 = dJ/dz2
    d1 = y_hat - labels
    # d2 = dJ/dh
    d2 = np.dot(d1, W2.T)
    # d3 = dJ/dz1
    d3 = d2 * sigmoid_grad(h)
    gradW2 = np.dot(h.T, d1)
    gradb2 = np.sum(d1, axis=0)
    gradW1 = np.dot(data.T, d3)
    gradb1 = np.sum(d3, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(X, W1) + b1
    a = sigmoid(z1)
    z2 = np.dot(a, W2) + b2
    y = softmax(z2)
    cost = -np.sum(np.log(y) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grady = y - labels
    gradW2 = np.dot(a.T, grady)
    gradb2 = np.sum(grady, axis=0)
    grada = np.dot(grady, W2.T)
    gradz1 = grada * sigmoid_grad(a)
    gradW1 = np.dot(X.T, gradz1)
    gradb1 = np.sum(gradz1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    n = data.shape[0]
    z = data.dot(W1) + b1
    h = sigmoid(z)
    yPredict = softmax(h.dot(W2) + b2)
    costs = -np.log(yPredict[labels == 1])
    cost = np.sum(costs) / n
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradTheta = yPredict - labels            # N*Dy
    gradTheta_aver = gradTheta / n
    gradW2 = h.T.dot(gradTheta_aver)         # H*N dot N*Dy = H*Dy
    gradb2 = np.sum(gradTheta_aver, axis=0)  # 1*Dy
    gradh = gradTheta_aver.dot(W2.T)         # N*Dy dot Dy*H = N*H
    gradz = sigmoid_grad(h) * gradh          # N*H
    gradW1 = data.T.dot(gradz)               # Dx*N dot N*H = Dx*H
    gradb1 = np.sum(gradz, axis=0)           # 1*H
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])  # 10, 5, 10

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))  # 10, 5
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))  # 1, 5
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))  # 5, 10
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))  # 1, 10

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z2 = X.dot(W1) + b1   # (20, 5) - 20 is the number of training examples
    a2 = sigmoid(z2)      # (20, 5)
    z3 = a2.dot(W2) + b2  # (20, 10)
    a3 = softmax(z3)      # (20, 10)
    cost = -np.sum(labels * np.log(a3))  # cross entropy cost
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta3 = a3 - labels  # (20, 10) - the derivative of cross entropy
    # Summing over training examples gives (1, 10); the derivative w.r.t. b is 1.
    gradb2 = np.sum(delta3, 0, keepdims=True)
    # Works like the derivative with respect to the input x or hidden layer h.
    gradW2 = np.dot(a2.T, delta3)
    delta2 = sigmoid_grad(a2) * np.dot(delta3, W2.T)  # see assign1, 2(c)
    gradb1 = np.sum(delta2, 0, keepdims=True)
    gradW1 = np.dot(X.T, delta2)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    N = data.shape[0]  # number of training examples (data is M x Dx)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    a1 = sigmoid(data.dot(W1) + b1)
    a2 = softmax(a1.dot(W2) + b2)
    cost = -np.sum(np.log(a2[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_a2 = (a2 - labels)
    gradW2 = np.dot(a1.T, grad_a2) * (1.0 / N)
    gradb2 = np.sum(grad_a2, axis=0, keepdims=True) * (1.0 / N)
    grad_a1 = np.dot(grad_a2, W2.T) * sigmoid_grad(a1)
    gradW1 = np.dot(data.T, grad_a1) * (1.0 / N)
    gradb1 = np.sum(grad_a1, axis=0, keepdims=True) * (1.0 / N)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def test_sigmoid(self):
    sig = lambda x: (sigmoid(x), sigmoid_grad(sigmoid(x)))
    random_ints = np.random.randint(1, 100, 100)
    random_floats = np.random.random_sample((100,))
    random_floats = random_ints * random_floats
    for number in random_floats:
        result = gradcheck_naive(sig, np.array(number))
        self.assertTrue(float(result) <= 1e-5)

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = data.dot(W1) + b1
    h1 = sigmoid(z1)
    z2 = h1.dot(W2) + b2
    y_pred = softmax(z2)
    cost = -np.sum(np.log(y_pred) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = y_pred - labels
    gradW2 = h1.T.dot(delta)
    gradb2 = np.sum(delta, axis=0)
    gradh1 = delta.dot(W2.T)
    gradz1 = gradh1 * sigmoid_grad(h1)
    gradW1 = data.T.dot(gradz1)
    gradb1 = np.sum(gradz1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### Forward propagation
    z = X.dot(W1) + b1
    h = sigmoid(z)
    theta = h.dot(W2) + b2
    y_hat = softmax(theta)
    cost = -np.sum(labels * np.log(y_hat))

    ### Backward propagation
    # Note: the gradients computed here are w.r.t. weights.
    grad_theta = y_hat - labels
    grad_b2 = np.sum(grad_theta, axis=0, keepdims=True)
    grad_W2 = np.dot(h.T, grad_theta)
    grad_h = np.dot(grad_theta, W2.T)
    grad_sigmoid = grad_h * sigmoid_grad(h)
    grad_b1 = np.sum(grad_sigmoid, axis=0, keepdims=True)
    grad_W1 = np.dot(X.T, grad_sigmoid)

    assert grad_b2.shape == b2.shape
    assert grad_W2.shape == W2.shape
    assert grad_b1.shape == b1.shape
    assert grad_W1.shape == W1.shape

    ### Stack gradients (do not modify)
    grad = np.concatenate((grad_W1.flatten(), grad_b1.flatten(),
                           grad_W2.flatten(), grad_b2.flatten()))

    return cost, grad

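# A hedged end-to-end check for any forward_backward_prop variant above,
# patterned after the assignment's sanity check; gradcheck_naive is assumed
# to come from the course starter code (q2_gradcheck.py):
def sanity_check_forward_backward():
    N = 20
    dimensions = [10, 5, 10]
    data = np.random.randn(N, dimensions[0])   # each row is one example
    labels = np.zeros((N, dimensions[2]))
    for i in range(N):
        labels[i, np.random.randint(0, dimensions[2])] = 1  # random one-hot rows
    params = np.random.randn((dimensions[0] + 1) * dimensions[1] +
                             (dimensions[1] + 1) * dimensions[2])
    # Compare the analytic gradient against a numerical one.
    gradcheck_naive(
        lambda p: forward_backward_prop(data, labels, p, dimensions), params)
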
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    x = data
    y = labels
    h = sigmoid(x.dot(W1) + b1)
    y_pred = softmax(h.dot(W2) + b2)
    cost = np.sum(-np.log(y_pred[labels == 1])) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d3 = (y_pred - y) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, 0, keepdims=True)
    d2 = np.dot(d3, W2.T)
    d1 = d2 * sigmoid_grad(h)
    gradW1 = np.dot(x.T, d1)
    gradb1 = np.sum(d1, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data : N * Dx
    # W1   : Dx * H
    # b1   : 1 * H
    # W2   : H * Dy
    # b2   : 1 * Dy
    N = data.shape[0]
    z1 = data.dot(W1) + b1
    a1 = sigmoid(z1)      # N * H
    z2 = a1.dot(W2) + b2
    a2 = softmax(z2)      # N * Dy
    cost = np.sum(-np.log(a2[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_score = a2 - labels           # N * Dy
    delta_score /= N
    gradW2 = np.dot(a1.T, delta_score)  # H * N dot N * Dy = H * Dy
    gradb2 = np.sum(delta_score, axis=0)
    grad_h = np.dot(delta_score, W2.T)  # N * Dy dot Dy * H = N * H
    grad_h = sigmoid_grad(a1) * grad_h
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    fc1 = X.dot(W1) + b1        # [M, H]
    sig1 = sigmoid(fc1)         # [M, H]
    scores = sig1.dot(W2) + b2  # [M, Dy]
    # Numerically stable log-softmax.
    shifted_scores = scores - np.max(scores, axis=-1, keepdims=True)  # [M, Dy]
    z = np.exp(shifted_scores).sum(axis=-1, keepdims=True)            # [M, 1]
    log_probs = shifted_scores - np.log(z)
    cost = -1 * (log_probs * labels).sum()
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dout = np.exp(log_probs)
    dout[labels == 1] -= 1
    gradW2 = sig1.T.dot(dout)
    gradb2 = dout.sum(axis=0)
    dsig1 = dout.dot(W2.T)
    dfc1 = sigmoid_grad(sig1) * dsig1
    gradW1 = X.T.dot(dfc1)
    gradb1 = dfc1.sum(axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(X, W1) + b1  # M * H
    h = sigmoid(z1)          # M * H
    z2 = np.dot(h, W2) + b2  # M * Dy
    y_pred = softmax(z2)     # M * Dy
    cost = -np.sum(labels * np.log(y_pred))  # cross-entropy
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dz2 = y_pred - labels  # M * Dy
    dh = dz2.dot(W2.T)     # M * H
    # note: sigmoid_grad takes sigmoid(x) as its input value
    dz1 = dh * sigmoid_grad(h)  # M * H
    gradW2 = h.T.dot(dz2)    # H * Dy
    gradb2 = np.sum(dz2, 0)  # 1 * Dy
    gradW1 = X.T.dot(dz1)    # Dx * H
    gradb1 = np.sum(dz1, 0)  # 1 * H
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    h = X.dot(W1) + b1      # (M, Dx) * (Dx, H) -> (M, H)
    sig_h = sigmoid(h)
    y = sig_h.dot(W2) + b2  # (M, H) * (H, Dy) -> (M, Dy)
    softmax_y = softmax(y)
    cost = -np.sum(labels * np.log(softmax_y))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # (M, Dy); see https://math.stackexchange.com/questions/945871/derivative-of-softmax-loss-function
    d_y = softmax_y - labels
    d_W2 = sig_h.T.dot(d_y)                    # (H, M) * (M, Dy) -> (H, Dy)
    d_b2 = np.sum(d_y, axis=0, keepdims=True)  # (M, Dy) -> (1, Dy)
    d_sig_h = d_y.dot(W2.T)                    # (M, Dy) * (Dy, H) -> (M, H)
    d_h = sigmoid_grad(sig_h) * d_sig_h        # (M, H)
    d_W1 = X.T.dot(d_h)                        # (Dx, M) * (M, H) -> (Dx, H)
    d_b1 = np.sum(d_h, axis=0, keepdims=True)  # (M, H) -> (1, H)
    gradW1, gradb1, gradW2, gradb2 = d_W1, d_b1, d_W2, d_b2
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    N = data.shape[0]

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden = np.dot(data, W1) + b1
    layer1_a = sigmoid(hidden)
    layer2 = np.dot(layer1_a, W2) + b2
    # need to calculate the softmax loss
    probs = softmax(layer2)
    cost = -np.sum(np.log(probs[np.arange(N), np.argmax(labels, axis=1)]))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # There is no regularization term here.
    # Pipeline: data -> W1 * data + b1 -> sigmoid -> W2 * layer1_a + b2 -> softmax
    dx = probs.copy()
    dx -= labels
    gradW2 = np.dot(layer1_a.T, dx)
    gradb2 = np.sum(dx, axis=0)
    dlayer2 = np.dot(dx, W2.T)
    dlayer1 = sigmoid_grad(layer1_a) * dlayer2
    gradW1 = np.dot(data.T, dlayer1)
    gradb1 = np.sum(dlayer1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: this variant averages the cost over the M examples; the
    # gradients below carry the same 1/M factor, so the two stay consistent.
    ### YOUR CODE HERE: forward propagation
    M = X.shape[0]
    X1_out = sigmoid(X.dot(W1) + b1)               # (M, H)
    softmax_output = softmax(X1_out.dot(W2) + b2)  # (M, Dy)
    # labels has shape (M, Dy); labels == 1 is a boolean mask, so
    # softmax_output[labels == 1] has shape (M,).
    cost = -np.sum(np.log(softmax_output[labels == 1])) / M
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dSoftmax = (softmax_output - labels) / M       # (M, Dy)
    gradW2 = np.dot(X1_out.T, dSoftmax)
    gradb2 = np.sum(dSoftmax, axis=0, keepdims=True)
    dX1_out = np.dot(dSoftmax, W2.T)               # (M, H)
    dsigmoid = sigmoid_grad(X1_out) * dX1_out      # important!
    gradW1 = np.dot(X.T, dsigmoid)                 # (Dx, H) = (Dx, M)(M, H)
    gradb1 = np.sum(dsigmoid, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yhat = softmax(np.dot(h, W2) + b2)
    cost = -np.sum(np.log(yhat[labels == 1])) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # here we compute gradb1, gradW1, gradb2, gradW2
    cost_grad = (yhat - labels) / data.shape[0]        # M x Dy
    gradW2 = np.dot(h.T, cost_grad)                    # (H x M) . (M x Dy) = H x Dy
    gradb2 = np.sum(cost_grad, axis=0, keepdims=True)  # 1 x Dy, summing over the M examples
    dJdh = np.dot(cost_grad, W2.T)                     # (M x Dy) . (Dy x H) = M x H
    gradb1_single = sigmoid_grad(h) * dJdh             # M x H (element-wise)
    gradW1 = np.dot(data.T, gradb1_single)             # (M x Dx).T . (M x H) = Dx x H
    gradb1 = np.sum(gradb1_single, axis=0)             # sum over the M examples
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden = np.dot(data, W1) + b1
    hidden_act = sigmoid(hidden)
    output = np.dot(hidden_act, W2) + b2
    output_act = softmax(output)
    logprobs = -np.log(output_act[np.arange(data.shape[0]),
                                  np.argmax(labels, axis=1)])
    cost = np.sum(logprobs) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = (output_act - labels) / data.shape[0]
    gradW2 = np.dot(hidden_act.T, dscores)
    gradb2 = np.sum(dscores, axis=0)
    dhidden_act = np.dot(dscores, W2.T)
    dhidden = sigmoid_grad(hidden_act) * dhidden_act
    gradW1 = np.dot(data.T, dhidden)
    gradb1 = np.sum(dhidden, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z1 = X.dot(W1) + b1
    h = sigmoid(z1)
    z2 = h.dot(W2) + b2
    yhat = softmax(z2)
    cost = -np.sum(labels * np.log(yhat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta1 = yhat - labels               # M x Dy
    delta2 = delta1.dot(W2.transpose())  # M x H
    # sigmoid_grad takes the sigmoid output as input,
    # so the input should be sigmoid(z1), which is h
    delta3 = delta2 * sigmoid_grad(h)    # M x H
    gradW1 = X.transpose().dot(delta3)   # Dx x H (sums over M examples)
    gradb1 = np.sum(delta3, 0)           # 1 x H (sums over M examples)
    gradW2 = h.transpose().dot(delta1)   # H x Dy (sums over M examples)
    gradb2 = np.sum(delta1, 0)           # 1 x Dy (sums over M examples)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def test_sigmoid_shape(dim, sigmoid_f):
    testing_shape = []
    for y in range(0, dim):
        testing_shape.append(np.random.randint(3, 8))
    shape = tuple(testing_shape)
    x = np.random.standard_normal(shape)
    y = np.copy(x)
    # Use the sigmoid implementation passed in as sigmoid_f.
    assert x.shape == sigmoid_f(y).shape
    assert x.shape == sigmoid_grad(sigmoid_f(y)).shape

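# Hypothetical invocation: verify shape preservation for 1-D through 4-D
# inputs, passing the sigmoid implementation under test explicitly.
for dim in range(1, 5):
    test_sigmoid_shape(dim, sigmoid)
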
def test_sigmoid_shape(dim):
    testing_shape = []
    for y in range(0, dim):
        testing_shape.append(np.random.randint(3, 8))
    shape = tuple(testing_shape)
    x = np.random.standard_normal(shape)
    y = np.copy(x)
    assert x.shape == sigmoid(y).shape
    assert x.shape == sigmoid_grad(sigmoid(y)).shape

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    M = len(data)
    z1 = np.dot(data, W1) + b1  # (M, H)
    h1 = sigmoid(z1)            # (M, H)
    z2 = np.dot(h1, W2) + b2    # (M, Dy)
    y_hat = softmax(z2)         # (M, Dy)
    CE = -np.log(y_hat[np.arange(M), np.argmax(labels, axis=1)])  # (M,)
    cost = np.mean(CE)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradz2 = (y_hat - labels) / M       # (M, Dy)
    gradh1 = np.dot(gradz2, W2.T)       # (M, H)
    gradW2 = np.dot(h1.T, gradz2)       # (H, Dy)
    gradb2 = np.sum(gradz2, axis=0)     # (Dy,)
    gradz1 = gradh1 * sigmoid_grad(h1)  # (M, H)
    gradW1 = np.dot(data.T, gradz1)     # (Dx, H)
    gradb1 = np.sum(gradz1, axis=0)     # (H,)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    l1 = data.dot(W1) + b1
    h = sigmoid(l1)
    l2 = h.dot(W2) + b2
    y_hat = softmax(l2)
    cost = -np.sum(labels * np.log(y_hat)) / N  # cross entropy
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dl2 = y_hat - labels
    dW2 = np.dot(h.T, dl2)
    db2 = np.sum(dl2, axis=0)
    dh = np.dot(dl2, W2.T)
    dl1 = dh * sigmoid_grad(h)
    dW1 = np.dot(data.T, dl1)
    db1 = np.sum(dl1, axis=0)

    gradW2 = dW2 / N
    gradb2 = db2 / N
    gradW1 = dW1 / N
    gradb1 = db1 / N
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    gradW1 = np.zeros_like(W1)
    gradb1 = np.zeros_like(b1)
    gradW2 = np.zeros_like(W2)
    gradb2 = np.zeros_like(b2)
    N = data.shape[0]

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    out = softmax(np.dot(h, W2) + b2)
    cost = -np.sum(labels * np.log(out))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    temp0 = out - labels
    gradb2 += np.sum(temp0, axis=0)
    gradW2 += np.dot(h.T, temp0)
    temp1 = np.dot(temp0, W2.T) * sigmoid_grad(h)
    gradb1 += np.sum(temp1, axis=0)
    gradW1 += np.dot(data.T, temp1)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))  # Dx * H
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))  # 1 * H
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))  # H * Dy
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    x = data    # M * Dx
    y = labels  # M * Dy
    M = x.shape[0]

    ### forward propagation
    z1 = np.dot(x, W1) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(a1, W2) + b2
    y_hat = a2 = softmax(z2)
    cost = -np.sum(np.log(a2[np.arange(M), np.argmax(y, axis=1)]))  # cross entropy

    ### backward propagation
    gradz2 = y_hat - y
    gradW2 = np.dot(a1.T, gradz2)
    gradb2 = np.sum(gradz2, axis=0)
    grada2 = np.dot(gradz2, W2.T)
    gradz1 = sigmoid_grad(a1) * grada2
    gradW1 = np.dot(x.T, gradz1)
    gradb1 = np.sum(gradz1, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(data.dot(W1) + b1)
    y_beta = softmax(h.dot(W2) + b2)
    cost = -np.sum(np.log(y_beta[labels == 1]))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # Backpropagate through y_beta: the analytic gradient of the cross
    # entropy loss with respect to the softmax input.
    d3 = (y_beta - labels)
    # Backpropagate through the second layer.
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, axis=0, keepdims=True)
    # Backpropagate through the first layer.
    d2 = np.dot(d3, W2.T) * sigmoid_grad(h)
    gradW1 = np.dot(data.T, d2)
    gradb1 = np.sum(d2, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    layer1_output = np.dot(data, W1) + b1
    layer1_activations = sigmoid(layer1_output)
    output_scores = np.dot(layer1_activations, W2) + b2
    softmax_scores = softmax(output_scores)
    cross_entropy_loss = -1 * np.sum(labels * np.log(softmax_scores))
    cost = cross_entropy_loss

    # Copy so the softmax output itself is left untouched, then subtract 1
    # at each example's true-label position (the softmax/CE gradient).
    doutput_scores = softmax_scores.copy()
    label_index = np.argmax(labels, axis=1)
    doutput_scores[np.arange(data.shape[0]), label_index] -= 1
    gradW2 = np.dot(layer1_activations.T, doutput_scores)
    gradb2 = np.sum(doutput_scores, axis=0)
    dlayer1_activations = np.dot(doutput_scores, W2.T)
    dlayer1_output = sigmoid_grad(layer1_activations) * dlayer1_activations
    gradW1 = np.dot(data.T, dlayer1_output)
    gradb1 = np.sum(dlayer1_output, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    N = len(data)

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1s = np.dot(data, W1) + b1
    hs = sigmoid(z1s)
    ys = softmax(np.dot(hs, W2) + b2)
    cost = -np.sum(np.log(ys[np.arange(N), np.argmax(labels, axis=1)]))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    errors = ys - labels
    gradW2 = np.dot(hs.T, errors)
    gradb2 = np.sum(errors, axis=0)
    # you should pass the output of sigmoid into sigmoid_grad
    tmpb1 = sigmoid_grad(hs) * np.dot(errors, W2.T)
    gradW1 = np.dot(data.T, tmpb1)
    gradb1 = np.sum(tmpb1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Notice the gradients computed here are different from the gradients
    in the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: this variant averages the cost over the m examples; the
    # gradients below carry the same 1/m factor, so the two stay consistent.
    ### YOUR CODE HERE: forward propagation
    Z1 = np.matmul(X, W1) + b1   # M by H
    A1 = sigmoid(Z1)             # M by H
    Z2 = np.matmul(A1, W2) + b2  # M by Dy
    A2 = softmax(Z2)             # M by Dy
    m = X.shape[0]
    cost = np.sum(-labels * np.log(A2)) / m
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    DZ2 = A2 - labels                                # M by Dy
    gradb2 = np.sum(DZ2, axis=0, keepdims=True) / m  # 1 by Dy
    gradW2 = np.matmul(np.transpose(A1), DZ2) / m    # H by Dy
    DA1 = np.dot(DZ2, np.transpose(W2))
    DZ1 = DA1 * sigmoid_grad(A1)  # sigmoid_grad takes the result of sigmoid
    gradb1 = np.sum(DZ1, axis=0, keepdims=True) / m
    gradW1 = np.matmul(np.transpose(X), DZ1) / m
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h1 = data.dot(W1) + b1  # (M, H)
    a1 = sigmoid(h1)
    h2 = a1.dot(W2) + b2    # (M, Dy)
    scores = softmax(h2)
    cost = -np.sum(np.log(scores) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradh2 = scores - labels
    gradW2 = a1.T.dot(gradh2)
    gradb2 = np.sum(gradh2, axis=0)
    grada1 = gradh2.dot(W2.T)
    gradh1 = grada1 * sigmoid_grad(a1)
    gradW1 = data.T.dot(gradh1)
    gradb1 = np.sum(gradh1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    y = softmax(np.dot(h, W2) + b2)
    cost = -np.sum(np.multiply(np.log(y), labels))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta1 = y - labels
    gradW2 = np.dot(h.transpose(), delta1)
    gradb2 = np.sum(delta1, axis=0)
    delta2 = np.multiply(np.dot(delta1, W2.transpose()), sigmoid_grad(h))
    gradW1 = np.dot(data.transpose(), delta2)
    gradb1 = np.sum(delta2, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    fc_out = np.dot(data, W1) + b1            # shape (M, H)
    fc_sigmoid_out = sigmoid(fc_out)          # shape (M, H)
    scores = np.dot(fc_sigmoid_out, W2) + b2  # shape (M, Dy)
    y_hat = softmax(scores)                   # shape (M, Dy)
    cost = -np.sum(labels * np.log(y_hat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = y_hat - labels                    # shape (M, Dy)
    gradW2 = np.dot(fc_sigmoid_out.T, dscores)  # shape (H, Dy)
    gradb2 = np.sum(dscores, axis=0)            # shape (Dy,)
    dfc_sigmoid_out = np.dot(dscores, W2.T)     # shape (M, H)
    dfc_out = dfc_sigmoid_out * sigmoid_grad(fc_sigmoid_out)  # shape (M, H)
    gradW1 = np.dot(data.T, dfc_out)            # shape (Dx, H)
    gradb1 = np.sum(dfc_out, axis=0)            # shape (H,)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad

def sigmoid_backward(dout, cache):
    """
    Computes the backward pass for a sigmoid layer.

    Inputs:
    - dout: Upstream derivative, same shape as the input to the sigmoid layer (x)
    - cache: sigmoid(x)

    Returns:
    - dx: back-propagated gradient with respect to x
    """
    s = cache  # the cached forward output, sigmoid(x)
    return sigmoid_grad(s) * dout
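For context, a matching forward function under the same cache convention; `sigmoid_forward` is a hypothetical name introduced here, not part of the original code:

def sigmoid_forward(x):
    """
    Computes the forward pass for a sigmoid layer.

    Caches the activation itself rather than the input, since
    sigmoid'(x) = s * (1 - s) depends only on s = sigmoid(x).
    """
    s = sigmoid(x)
    return s, s  # (output, cache)

This cache convention is why sigmoid_backward can apply sigmoid_grad directly to the cache instead of recomputing the activation from x.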
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yhat = softmax(np.dot(h, W2) + b2)
    cost = np.sum(-np.log(yhat[labels == 1])) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d3 = (yhat - labels) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, axis=0, keepdims=True)
    dh = np.dot(d3, W2.T)
    grad_h = sigmoid_grad(h) * dh
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    N = data.shape[0]
    l1 = data.dot(W1) + b1
    h = sigmoid(l1)
    l2 = h.dot(W2) + b2
    y_hat = softmax(l2)
    cost = -np.sum(labels * np.log(y_hat)) / N  # mean cross entropy

    ### backward propagation
    dl2 = y_hat - labels
    dW2 = np.dot(h.T, dl2)
    db2 = np.sum(dl2, axis=0)
    dh = np.dot(dl2, W2.T)
    dl1 = dh * sigmoid_grad(h)
    dW1 = np.dot(data.T, dl1)
    db1 = np.sum(dl1, axis=0)

    gradW2 = dW2 / N
    gradb2 = db2 / N
    gradW1 = dW1 / N
    gradb1 = db1 / N

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
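This version averages the cost over the batch, while several others in this collection sum it. Either convention passes a gradient check, provided the cost and every gradient carry the same 1/N factor; a toy illustration of that rule, with invented values:

import numpy as np

# Invented toy values: two examples, two classes.
p = np.array([[0.7, 0.3], [0.2, 0.8]])   # softmax outputs
y = np.array([[1.0, 0.0], [0.0, 1.0]])   # one-hot labels
N = p.shape[0]

summed_cost = -np.sum(y * np.log(p))     # summed convention
mean_cost = summed_cost / N              # averaged convention (as above)

# For softmax cross entropy, d(cost)/d(scores) = p - y under the summed
# convention; averaging the cost scales every downstream gradient by 1/N.
summed_dscores = p - y
mean_dscores = summed_dscores / N
assert np.allclose(mean_dscores * N, summed_dscores)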
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    Z1 = data.dot(W1) + b1            # (N, H)
    A1 = sigmoid(Z1)                  # (N, H)
    scores = A1.dot(W2) + b2          # (N, Dy)
    probs = softmax(scores)           # (N, Dy)
    cost = -np.sum(np.log(probs[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = (probs - labels) / N
    gradW2 = A1.T.dot(dscores)
    gradb2 = np.sum(dscores, axis=0)
    dA1 = dscores.dot(W2.T)
    dZ1 = sigmoid_grad(A1) * dA1
    gradW1 = data.T.dot(dZ1)
    gradb1 = np.sum(dZ1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = data.dot(W1) + b1
    h = sigmoid(z1)
    z2 = h.dot(W2) + b2
    y = softmax(z2)
    cost = -np.sum(np.log(y) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dEdz2 = y - labels
    gradW2 = h.T.dot(dEdz2)
    gradb2 = np.sum(dEdz2, axis=0)
    dEdh = dEdz2.dot(W2.T)
    dEdz1 = dEdh * sigmoid_grad(h)
    gradW1 = data.T.dot(dEdz1)
    gradb1 = np.sum(dEdz1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N, D = data.shape
    h = sigmoid(data.dot(W1) + b1)
    scores = softmax(h.dot(W2) + b2)
    cost = -np.sum(np.log(scores[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = (scores - labels) / N
    gradb2 = np.sum(dscores, axis=0)
    gradW2 = np.dot(h.T, dscores)
    grad_h = np.dot(dscores, W2.T) * sigmoid_grad(h)
    gradb1 = np.sum(grad_h, axis=0)
    gradW1 = np.dot(data.T, grad_h)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z = np.dot(data, W1) + b1
    a = sigmoid(z)
    z2 = np.dot(a, W2) + b2
    a2 = softmax(z2)
    cost = -np.sum(np.log(a2) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d1 = a2 - labels                        # (M, Dy)
    d2 = np.dot(d1, W2.T)                   # (M, H)
    d3 = np.multiply(d2, sigmoid_grad(a))   # (M, H)
    gradW2 = np.dot(a.T, d1)
    gradb2 = np.sum(d1, axis=0)
    gradW1 = np.dot(data.T, d3)
    gradb1 = np.sum(d3, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # Dimensions: data is N x Dx, W1 is Dx x H, W2 is H x Dy
    hidden = sigmoid(data.dot(W1) + b1)           # (N x Dx) . (Dx x H) -> N x H
    prediction = softmax(hidden.dot(W2) + b2)     # (N x H) . (H x Dy) -> N x Dy
    cost = -np.sum(np.log(prediction) * labels)   # scalar
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = prediction - labels                   # Dimensions: N x Dy
    # first backpropagate into parameters W2 and b2
    gradW2 = np.dot(hidden.T, delta)              # Dimensions: H x Dy
    gradb2 = np.sum(delta, axis=0, keepdims=True)
    # next backprop into hidden layer
    gradHidden = np.dot(delta, W2.T) * sigmoid_grad(hidden)  # (N x Dy) . (Dy x H) -> N x H
    # finally into W1, b1
    gradW1 = np.dot(data.T, gradHidden)           # Dimensions: Dx x H
    gradb1 = np.sum(gradHidden, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # forward pass: compute the predictions (y) and the cost
    hidden = sigmoid(np.dot(data, W1) + b1)
    prediction = softmax(np.dot(hidden, W2) + b2)
    cost = -np.sum(labels * np.log(prediction))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # backward pass: compute the gradient of every parameter
    delta1 = prediction - labels
    gradW2 = np.dot(hidden.T, delta1)
    gradb2 = np.sum(delta1, axis=0)
    delta2 = np.dot(delta1, W2.T)
    delta3 = delta2 * sigmoid_grad(hidden)
    gradW1 = np.dot(data.T, delta3)
    gradb1 = np.sum(delta3, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    a1 = sigmoid(data.dot(W1) + b1)
    a2 = softmax(a1.dot(W2) + b2)
    # mean cross entropy: log the predictions, not the labels, and
    # normalize by the batch size to match the 1/N in the gradients below
    cost = -np.sum(labels * np.log(a2)) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_a2 = (a2 - labels) / data.shape[0]
    grad_W2 = a1.T.dot(grad_a2)
    grad_b2 = np.sum(grad_a2, axis=0, keepdims=True)
    grad_a1 = grad_a2.dot(W2.T) * sigmoid_grad(a1)
    grad_W1 = data.T.dot(grad_a1)
    grad_b1 = np.sum(grad_a1, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((grad_W1.flatten(), grad_b1.flatten(),
                           grad_W2.flatten(), grad_b2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    f1 = sigmoid(z1)
    z2 = np.dot(f1, W2) + b2
    output = softmax(z2)
    cost = -np.sum(np.log(np.sum(output * labels, axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = output - labels
    gradW2 = np.dot(f1.T, delta)
    gradb2 = np.sum(delta, axis=0)
    delta1 = delta.dot(W2.T) * sigmoid_grad(f1)
    gradb1 = np.sum(delta1, axis=0)
    gradW1 = np.dot(data.T, delta1)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    h1 = np.dot(data, W1) + b1
    a1 = sigmoid(h1)
    scores = np.dot(a1, W2) + b2
    probs = softmax(scores)
    # cross entropy written directly in score / log-sum-exp form
    cost = np.sum(-scores * labels
                  + labels * np.log(np.sum(np.exp(scores), axis=1, keepdims=True)))

    gradscores = probs - labels
    gradb2 = np.sum(gradscores, axis=0)
    gradW2 = np.dot(a1.T, gradscores)
    grada1 = np.dot(gradscores, W2.T)
    gradh1 = grada1 * sigmoid_grad(a1)
    gradb1 = np.sum(gradh1, axis=0)
    gradW1 = np.dot(data.T, gradh1)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
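The cost above expands the log-softmax into its score and log-sum-exp terms, but np.exp on raw scores can overflow once the scores grow large. A sketch of the usual stabilized form, with an invented helper name; subtracting the row maximum leaves the softmax, and hence the cost, unchanged:

def stable_cross_entropy(scores, labels):
    """Cross entropy via a shifted log-sum-exp: subtracting the row max
    does not change the softmax but keeps np.exp well within range."""
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    return -np.sum(labels * log_probs)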
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    out1 = data.dot(W1) + b1          # (N, Dx) . (Dx, H) + (1, H) -> (N, H)
    out1_act = sigmoid(out1)
    out2 = out1_act.dot(W2) + b2      # (N, Dy)
    score = softmax(out2)
    cost = -np.sum(labels * np.log(score))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscore = score - labels
    gradW2 = out1_act.T.dot(dscore)
    gradb2 = np.sum(dscore, axis=0)
    dout1_act = dscore.dot(W2.T)      # (N, Dy) . (Dy, H) -> (N, H)
    dout1 = sigmoid_grad(out1_act) * dout1_act
    gradW1 = data.T.dot(dout1)
    gradb1 = np.sum(dout1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1        # N x H
    h1 = sigmoid(z1)
    z2 = np.dot(h1, W2) + b2          # N x Dy
    y_hat = softmax(z2)
    cost = -np.sum(np.multiply(np.log(y_hat), labels))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    phi = y_hat - labels                            # N x Dy
    gradW2 = np.dot(h1.T, phi)                      # (H x N) . (N x Dy) -> H x Dy
    gradb2 = np.sum(phi, axis=0, keepdims=True)     # 1 x Dy
    dhidden = np.dot(phi, W2.T) * sigmoid_grad(h1)  # N x H
    gradW1 = np.dot(data.T, dhidden)                # (Dx x N) . (N x H) -> Dx x H
    gradb1 = np.sum(dhidden, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z_1 = data.dot(W1) + b1
    h = sigmoid(z_1)
    z_2 = h.dot(W2) + b2
    y_hat = softmax(z_2)
    cost = -np.sum(np.log(y_hat) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_3 = y_hat - labels
    gradW2 = h.T.dot(delta_3)
    gradb2 = np.sum(delta_3, axis=0)
    delta_2 = delta_3.dot(W2.T) * sigmoid_grad(h)
    gradW1 = data.T.dot(delta_2)
    gradb1 = np.sum(delta_2, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    n_sample, _ = data.shape

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward pass
    hiddens = sigmoid(data.dot(W1) + b1)
    probs = softmax(hiddens.dot(W2) + b2)
    true_labels = np.argmax(labels, axis=1)
    cost = -np.sum(np.log(probs[range(n_sample), true_labels])) / n_sample

    ### backward pass
    dscores = probs.copy()   # copy, so the probabilities are not mutated in place
    dscores[range(n_sample), true_labels] -= 1
    dscores /= n_sample
    gradW2 = np.dot(hiddens.T, dscores)
    gradb2 = np.sum(dscores, axis=0)
    gradHiddens = np.dot(dscores, W2.T) * sigmoid_grad(hiddens)
    gradW1 = np.dot(data.T, gradHiddens)
    gradb1 = np.sum(gradHiddens, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
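This solution selects the true-class probabilities by integer indexing, where other versions in this collection use a boolean mask or a masked sum; a quick check that the three formulations agree, on invented values:

import numpy as np

probs = np.array([[0.6, 0.4], [0.1, 0.9]])
labels = np.array([[1, 0], [0, 1]])
N = probs.shape[0]

a = -np.sum(labels * np.log(probs))                                  # masked sum
b = -np.sum(np.log(probs[labels == 1]))                              # boolean mask
c = -np.sum(np.log(probs[np.arange(N), np.argmax(labels, axis=1)]))  # integer indexing
assert np.allclose(a, b) and np.allclose(b, c)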
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    h1 = sigmoid(np.dot(data, W1) + b1)
    out = softmax(np.dot(h1, W2) + b2)
    cost = -np.sum(np.log(out) * labels)

    dout = out - labels
    gradW2 = h1.T.dot(dout)
    gradb2 = np.sum(dout, axis=0)
    dh1 = dout.dot(W2.T) * sigmoid_grad(h1)
    gradW1 = data.T.dot(dh1)
    gradb1 = np.sum(dh1, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_hat = softmax(z2)
    cost = -np.sum(labels * np.log(y_hat))

    delta2 = y_hat - labels
    gradW2 = np.dot(h.T, delta2)
    gradb2 = np.sum(delta2, axis=0)
    delta1 = np.dot(delta2, W2.T) * sigmoid_grad(h)
    gradW1 = np.dot(data.T, delta1)
    gradb1 = np.sum(delta1, axis=0)

    ### Stack gradients
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    labels = labels.astype("int64")
    a_1_0 = data.dot(W1) + b1
    a_1 = sigmoid(a_1_0)
    a_2_0 = a_1.dot(W2) + b2
    # softmax_loss is an external helper that returns the cross entropy
    # loss and its gradient with respect to the scores
    loss, dx = softmax_loss(a_2_0, labels)

    gradb2 = np.sum(dx, axis=0, keepdims=True)
    gradW2 = a_1.T.dot(dx)
    da_1 = sigmoid_grad(a_1) * dx.dot(W2.T)
    gradb1 = np.sum(da_1, axis=0, keepdims=True)
    gradW1 = data.T.dot(da_1)

    assert gradb2.shape == b2.shape
    assert gradW2.shape == W2.shape
    assert gradb1.shape == b1.shape
    assert gradW1.shape == W1.shape

    cost = loss
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
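softmax_loss is never defined in this document, so the version below is a hypothetical reconstruction consistent only with how it is called here (scores and one-hot labels in, (loss, dscores) out). Whether the real helper averages over the batch cannot be determined from this call site; this sketch sums, matching most of the surrounding implementations:

def softmax_loss(scores, labels):
    """Hypothetical: softmax cross entropy and its gradient w.r.t. the
    scores, for one-hot labels. The real helper may differ in details
    such as batch averaging."""
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    probs = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
    loss = -np.sum(labels * np.log(probs))
    dscores = probs - labels
    return loss, dscores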
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    N = data.shape[0]

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden = np.dot(data, W1) + b1
    layer1_a = sigmoid(hidden)
    layer2 = np.dot(layer1_a, W2) + b2
    # softmax loss over the scores
    probs = softmax(layer2)
    cost = -np.sum(np.log(probs[np.arange(N), np.argmax(labels, axis=1)]))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # Note: there is no regularization term.
    # chain: data -> W1 * data + b1 -> sigmoid -> W2 * layer1_a + b2 -> softmax
    dx = probs.copy()
    dx -= labels
    gradW2 = np.dot(layer1_a.T, dx)
    gradb2 = np.sum(dx, axis=0)
    dlayer2 = np.dot(dx, W2.T)
    dlayer1 = sigmoid_grad(layer1_a) * dlayer2
    gradW1 = np.dot(data.T, dlayer1)
    gradb1 = np.sum(dlayer1, axis=0)

    # An equivalent modular version using affine and sigmoid
    # forward/backward helpers (see sigmoid_backward above):
    # scores, cache_1 = affine_forward(data, W1, b1)
    # scores, cache_s1 = sigmoid_forward(scores)
    # scores, cache_2 = affine_forward(scores, W2, b2)
    # probs = softmax(scores)
    # cost = -np.sum(np.log(probs[np.arange(N), np.argmax(labels, axis=1)] + 1e-12)) / N
    # softmax_dx = probs.copy()
    # softmax_dx[np.arange(N), np.argmax(labels, axis=1)] -= 1
    # softmax_dx /= N
    # grads = {}
    # dlayer2, grads['W2'], grads['b2'] = affine_backward(softmax_dx, cache_2)
    # dlayer1s = sigmoid_backward(dlayer2, cache_s1)
    # dlayer1, grads['W1'], grads['b1'] = affine_backward(dlayer1s, cache_1)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
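All of these implementations can be validated the same way. Below is a self-contained central-difference gradient check over the packed parameter vector; the toy dimensions, sample count, and tolerance are invented, and any of the forward_backward_prop variants above can be plugged in (the assignment's own checker, where available, serves the same purpose):

import numpy as np

def check_gradient(f, x, h=1e-5, tol=1e-6, n_checks=20):
    """f maps a flat parameter vector to (cost, grad); compares grad
    against central differences at a few random coordinates."""
    _, grad = f(x)
    for i in np.random.choice(x.size, size=min(n_checks, x.size), replace=False):
        old = x[i]
        x[i] = old + h
        fp, _ = f(x)
        x[i] = old - h
        fm, _ = f(x)
        x[i] = old                      # restore the parameter
        numeric = (fp - fm) / (2 * h)
        rel = abs(numeric - grad[i]) / max(1.0, abs(numeric), abs(grad[i]))
        assert rel <= tol, (i, numeric, grad[i])

np.random.seed(0)
Dx, H, Dy, M = 4, 3, 4, 8
dims = (Dx, H, Dy)
params = np.random.randn(Dx * H + H + H * Dy + Dy)
data = np.random.randn(M, Dx)
labels = np.zeros((M, Dy))
labels[np.arange(M), np.random.randint(Dy, size=M)] = 1

check_gradient(lambda p: forward_backward_prop(data, labels, p, dims), params)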