def negSamplingCostAndGradient(dataset, predicted, target, outputVectors, K=10):
    """ Negative sampling cost function for word2vec models """
    ###################################################################
    # Implement the cost and gradients for one predicted word vector  #
    # and one target word vector as a building block for word2vec     #
    # models, using the negative sampling technique. K is the sample  #
    # size. You might want to use dataset.sampleTokenIdx() to sample  #
    # a random word index.                                            #
    # Input/Output Specifications: same as softmaxCostAndGradient     #
    # We will not provide starter code for this function, but feel    #
    # free to reference the code you previously wrote for this        #
    # assignment!                                                     #
    ###################################################################

    ### YOUR CODE HERE
    N, D = outputVectors.shape

    # Draw K negative sample indices and gather their output vectors.
    sample_indices = [dataset.sampleTokenIdx() for _ in range(K)]
    samples = outputVectors[sample_indices]                       # (K, D)

    # Negative-sampling objective:
    #   J = -log(sigmoid(u_o . v_c)) - sum_{k=1}^{K} log(sigmoid(-u_k . v_c))
    # where v_c is `predicted` and u_o is `outputVectors[target]`.
    predicted_dot_target = sigmoid(predicted.dot(outputVectors[target]))
    neg_samples_dot_predicted = sigmoid(-samples.dot(predicted))  # (K,)
    cost = -np.log(predicted_dot_target) \
           - np.sum(np.log(neg_samples_dot_predicted))

    # Gradient w.r.t. the predicted (center) vector v_c
    sig = predicted_dot_target - 1.0
    gradPred = sig * outputVectors[target] \
               + (1.0 - neg_samples_dot_predicted).dot(samples)

    # Gradient w.r.t. the output vectors; negative samples accumulate
    # because the same index can be drawn more than once.
    grad = np.zeros(outputVectors.shape)
    grad[target, :] = sig * predicted
    for k, coeff in zip(sample_indices, 1.0 - neg_samples_dot_predicted):
        grad[k, :] += coeff * predicted
    ### END YOUR CODE

    assert grad.shape == outputVectors.shape
    assert gradPred.shape == predicted.shape

    return cost, gradPred, grad
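# A minimal usage sketch for the function above. The ToyDataset class and all shapes
# below are illustrative assumptions (not part of the assignment); sigmoid is assumed
# to be the NumPy helper sketched after the unit test further down.
import numpy as np

class ToyDataset:
    # hypothetical stand-in that samples token indices uniformly over the vocabulary
    def __init__(self, vocab_size, seed=0):
        self._rng = np.random.RandomState(seed)
        self._vocab_size = vocab_size

    def sampleTokenIdx(self):
        return self._rng.randint(0, self._vocab_size)

rng = np.random.RandomState(0)
outputVectors = rng.randn(5, 3)          # vocabulary of 5 words, 3-dim vectors
predicted = rng.randn(3)                 # center word vector v_c
cost, gradPred, grad = negSamplingCostAndGradient(
    ToyDataset(vocab_size=5), predicted, target=2,
    outputVectors=outputVectors, K=10)
print(cost, gradPred.shape, grad.shape)  # scalar, (3,), (5, 3)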
def predict(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']

    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = softmax(a3)

    return y
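# A minimal usage sketch for predict(): the layer sizes and random weights are
# assumptions chosen only to make the call runnable. sigmoid is assumed to be the
# helper sketched after the unit test below; softmax is sketched here.
import numpy as np

def softmax(a):
    # assumed helper: numerically stable softmax over the last axis
    a = a - np.max(a, axis=-1, keepdims=True)
    exp_a = np.exp(a)
    return exp_a / np.sum(exp_a, axis=-1, keepdims=True)

rng = np.random.RandomState(0)
network = {
    'W1': rng.randn(4, 5), 'b1': np.zeros(5),
    'W2': rng.randn(5, 3), 'b2': np.zeros(3),
    'W3': rng.randn(3, 2), 'b3': np.zeros(2),
}
x = rng.randn(4)
y = predict(network, x)   # probabilities over the 2 output classes, summing to 1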
def test_sigmoid(self):
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    np.testing.assert_array_almost_equal(
        f, np.array([[0.73105858, 0.88079708], [0.26894142, 0.11920292]]))
    np.testing.assert_array_almost_equal(
        g, np.array([[0.19661193, 0.10499359], [0.19661193, 0.10499359]]))
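# The test above passes f = sigmoid(x) into sigmoid_grad, so sigmoid_grad is expected
# to take the sigmoid *output*, not the raw input. A minimal sketch of both helpers
# under that assumption:
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(f):
    # f is sigmoid(x); d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x))
    return f * (1.0 - f)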
def compare_functions():
    x = np.arange(-5.0, 5.0, 0.1)
    y1 = sigmoid(x)
    y2 = step_function(x)
    y3 = relu(x)
    plt.plot(x, y1, label="sigmoid")
    plt.plot(x, y2, label="step", linestyle="--")
    plt.plot(x, y3, label="ReLU", linestyle=":")
    plt.ylim(-0.1, 1.1)  # note: ReLU values above 1 are clipped by this y-range
    plt.title("sigmoid, step & ReLU")
    plt.legend()
    plt.show()
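# compare_functions() relies on step_function and relu in addition to sigmoid (and
# on matplotlib imported as plt). A minimal sketch of the two activation helpers,
# assuming the NumPy conventions used above:
import numpy as np

def step_function(x):
    # 1 where x > 0, else 0
    return np.array(x > 0, dtype=int)

def relu(x):
    return np.maximum(0, x)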
def forward(network, X):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    B1, B2, B3 = network['B1'], network['B2'], network['B3']

    A1 = np.dot(X, W1) + B1
    Z1 = sigmoid(A1)
    A2 = np.dot(Z1, W2) + B2
    Z2 = sigmoid(A2)
    A3 = np.dot(Z2, W3) + B3
    Y = softmax(A3)

    return Y
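# A usage sketch for forward(): this variant of the network dict uses uppercase
# 'B1'..'B3' bias keys, so the initialisation below follows that convention. Layer
# sizes and weights are illustrative assumptions; sigmoid and softmax are the
# helpers sketched elsewhere in this file.
import numpy as np

rng = np.random.RandomState(0)
network = {
    'W1': rng.randn(2, 3), 'B1': np.zeros(3),
    'W2': rng.randn(3, 2), 'B2': np.zeros(2),
    'W3': rng.randn(2, 2), 'B3': np.zeros(2),
}
X = np.array([1.0, 0.5])
Y = forward(network, X)   # softmax probabilities over the final layer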
def forward_backward_prop(dimensions, data, labels, params):
    """ Forward and backward propagation for a two-layer sigmoidal network """
    ###################################################################
    # Compute the forward propagation and the cross entropy cost,     #
    # and backward propagation for the gradients for all parameters.  #
    ###################################################################

    ### Unpack network parameters (do not modify)
    t = 0
    W1 = np.reshape(params[t:t + dimensions[0] * dimensions[1]],
                    (dimensions[0], dimensions[1]))
    t += dimensions[0] * dimensions[1]
    b1 = np.reshape(params[t:t + dimensions[1]], (1, dimensions[1]))
    t += dimensions[1]
    W2 = np.reshape(params[t:t + dimensions[1] * dimensions[2]],
                    (dimensions[1], dimensions[2]))
    t += dimensions[1] * dimensions[2]
    b2 = np.reshape(params[t:t + dimensions[2]], (1, dimensions[2]))

    ### YOUR CODE HERE: forward propagation
    # data   is (N, Dx) - the inputs x
    # labels is (N, Dy) - one-hot target vectors y
    # W1 is (Dx, H), b1 is (1, H)
    # W2 is (H, Dy), b2 is (1, Dy)
    N, Dx = data.shape
    H = W1.shape[1]
    Dy = W2.shape[1]

    a = data.dot(W1) + b1
    h = sigmoid(a)                       # hidden layer activations
    y_hat = softmax(h.dot(W2) + b2)      # top classifier layer

    # Cross-entropy cost, summed over the rows (one row per data point)
    cost_per_datapoint = -np.sum(labels * np.log(y_hat), axis=1)
    cost = np.sum(cost_per_datapoint)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # delta at the output layer: dJ/d(scores) = y_hat - y
    delta_out = y_hat - labels
    # delta at the hidden layer: backprop through W2 and the sigmoid
    delta_hidden = delta_out.dot(W2.T) * h * (1.0 - h)

    gradW2 = h.T.dot(delta_out)
    gradb2 = np.sum(delta_out, axis=0).reshape((1, Dy))
    gradW1 = data.T.dot(delta_hidden)
    gradb1 = np.sum(delta_hidden, axis=0).reshape((1, H))

    assert gradW1.shape == W1.shape
    assert gradb1.shape == b1.shape
    assert gradW2.shape == W2.shape
    assert gradb2.shape == b2.shape
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
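# A quick sanity check for forward_backward_prop() is a centred-difference numerical
# gradient on a small random problem. Everything below (shapes, epsilon, the check
# itself) is an illustrative assumption, not the assignment's own gradcheck code;
# sigmoid and softmax are assumed available as sketched elsewhere in this file.
import numpy as np

rng = np.random.RandomState(0)
dimensions = [10, 5, 10]
N = 20
data = rng.randn(N, dimensions[0])
labels = np.zeros((N, dimensions[2]))
labels[np.arange(N), rng.randint(0, dimensions[2], N)] = 1.0
params = rng.randn(dimensions[0] * dimensions[1] + dimensions[1] +
                   dimensions[1] * dimensions[2] + dimensions[2])

cost, grad = forward_backward_prop(dimensions, data, labels, params)

eps = 1e-5
numgrad = np.zeros_like(params)
for i in range(len(params)):
    bump = np.zeros_like(params)
    bump[i] = eps
    c_plus, _ = forward_backward_prop(dimensions, data, labels, params + bump)
    c_minus, _ = forward_backward_prop(dimensions, data, labels, params - bump)
    numgrad[i] = (c_plus - c_minus) / (2.0 * eps)

print(np.max(np.abs(numgrad - grad)))   # analytic and numerical gradients should agree closely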
def draw_sigmoid():
    x = np.arange(-5.0, 5.0, 0.1)
    y = sigmoid(x)
    plt.plot(x, y)
    plt.ylim(-0.1, 1.1)
    plt.show()