def nplm_cost_gradient(parameters, input, output):
    """
    Cost function for NPLM
    :param parameters: tuple of (W, U, H, C)
    :param input: indices of context words
    :param output: index of current word
    :return: cost and gradient
    """
    W, U, H, C = parameters
    context_size = len(input)

    # Concatenate the context word embeddings into a single input vector
    x = np.concatenate([C[input[i]] for i in range(context_size)])
    x = np.append(x, 1.)  # Append bias term
    x = x.reshape(-1, 1)

    # Forward pass: direct connection (W) plus tanh hidden layer (H, U)
    hidden_layer = np.tanh(H.dot(x))
    y = W.dot(x) + U.dot(hidden_layer)
    prediction = softmax(y.reshape(-1)).reshape(-1, 1)
    cost = -np.sum(np.log(prediction[output]))

    # Backward pass: cross-entropy gradient w.r.t. the output scores
    one_hot = np.zeros_like(prediction)
    one_hot[output] = 1
    delta = prediction - one_hot
    gradient_W = delta.dot(x.T)
    gradient_U = delta.dot(hidden_layer.T)
    gradient_H = tanh_gradient(hidden_layer) * U.T.dot(delta).dot(x.T)

    # Gradient w.r.t. the input embeddings: backpropagate through x and
    # accumulate into the rows of C used as context
    gradient_C = np.zeros_like(C)
    gradient_y_x = W + U.dot(tanh_gradient(hidden_layer) * H)
    gradient_x = gradient_y_x.T.dot(delta)
    gradient_x = gradient_x[:-1, :]  # Drop the bias term
    gradient_x_split = np.split(gradient_x, context_size)
    for i in range(context_size):
        gradient_C[input[i]] += gradient_x_split[i].flatten()

    gradient = [gradient_W, gradient_U, gradient_H, gradient_C]
    return cost, gradient
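# A minimal usage sketch for nplm_cost_gradient, not part of the original code.
# The toy shapes below are assumptions inferred from the function body; softmax
# and tanh_gradient are assumed to be the helpers defined elsewhere in this module.
import numpy as np

vocabulary_size = 10
vector_size = 3
context_size = 2
hidden_size = 4
input_size = context_size * vector_size + 1  # +1 for the appended bias term

C = np.random.normal(size=(vocabulary_size, vector_size))   # word embeddings
H = np.random.normal(size=(hidden_size, input_size))        # input -> hidden
W = np.random.normal(size=(vocabulary_size, input_size))    # input -> output (direct)
U = np.random.normal(size=(vocabulary_size, hidden_size))   # hidden -> output

context = [1, 4]   # indices of the context words
current = 7        # index of the word to predict
cost, (gradient_W, gradient_U, gradient_H, gradient_C) = nplm_cost_gradient(
    (W, U, H, C), context, current)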
def neural_network_cost_gradient(parameters, input, output):
    """
    3-layer network cost and gradient function
    :param parameters: pair of (W1, W2)
    :param input: input vector
    :param output: index to correct label
    :return: cross entropy cost and gradient
    """
    W1, W2 = parameters
    input = input.reshape(-1, 1)

    # Forward pass: sigmoid hidden layer followed by softmax output
    hidden_layer = expit(W1.dot(input))
    inside_softmax = W2.dot(hidden_layer)
    # TODO: allow softmax to normalize column vector
    prediction = softmax(inside_softmax.reshape(-1)).reshape(-1, 1)
    cost = -np.sum(np.log(prediction[output]))

    # Backward pass: the softmax/cross-entropy gradient is prediction - one_hot
    one_hot = np.zeros_like(prediction)
    one_hot[output] = 1
    delta = prediction - one_hot
    gradient_W2 = delta.dot(hidden_layer.T)
    gradient_W1 = sigmoid_gradient(hidden_layer) * W2.T.dot(delta).dot(input.T)
    gradient = [gradient_W1, gradient_W2]
    return cost, gradient
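# A small sketch of calling neural_network_cost_gradient on one example, not from
# the original source; numpy (np) and the module's softmax, expit and
# sigmoid_gradient helpers are assumed to be in scope.
import numpy as np

input_size, hidden_size, output_size = 3, 4, 2
W1 = np.random.normal(size=(hidden_size, input_size))
W2 = np.random.normal(size=(output_size, hidden_size))
x = np.random.normal(size=input_size)
label = 1
cost, (gradient_W1, gradient_W2) = neural_network_cost_gradient((W1, W2), x, label)
assert gradient_W1.shape == W1.shape and gradient_W2.shape == W2.shape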
def assertMultinomialLogisticRegression(self, sampler):
    data_size = 3
    input_size = 5
    output_size = 4
    inputs = np.random.uniform(-10.0, 10.0, size=(data_size, input_size))
    outputs = np.random.randint(0, output_size, size=data_size)
    initial_parameters = np.random.normal(size=(input_size, output_size))

    # Create cost and gradient function for gradient descent and check its gradient
    cost_gradient = bind_cost_gradient(multinomial_logistic_regression_cost_gradient,
                                       inputs, outputs, sampler=sampler)
    result = gradient_check(cost_gradient, initial_parameters)
    self.assertEqual([], result)

    # Train multinomial logistic regression and see if it predicts correct labels
    final_parameters, cost_history = gradient_descent(cost_gradient, initial_parameters, 100)
    predictions = np.argmax(softmax(np.dot(final_parameters.T, inputs.T)), axis=0)
    for output, prediction in zip(outputs, predictions):
        self.assertEqual(output, prediction)
def test_softmax(self):
    # softmax should receive numpy array and return normalized vector
    expect = np.array([exp(1) / (exp(1) + exp(2)), exp(2) / (exp(1) + exp(2))])
    actual = softmax(np.array([1, 2]))
    self.assertDistribution(actual)
    self.assertNumpyEqual(expect, actual)

    # softmax should be invariant to constant offsets in the input
    # softmax should be able to handle very large or small values
    actual = softmax(np.array([1001, 1002]))
    self.assertNumpyEqual(expect, actual)
    actual = softmax(np.array([-1002, -1001]))
    self.assertNumpyEqual(expect, actual)

    # softmax should receive matrix and return matrix of same size
    expect = np.array([[exp(1) / (exp(1) + exp(2)), exp(2) / (exp(1) + exp(2))],
                       [exp(1) / (exp(1) + exp(2)), exp(2) / (exp(1) + exp(2))]])
    actual = softmax(np.array([[1, 2], [3, 4]]))
    self.assertNumpyEqual(expect, actual)
def test_neural_network(self):
    np.random.seed(0)
    input_size = 2
    hidden_size = 2
    output_size = 2

    # Classic XOR test data
    inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    outputs = np.array([0, 1, 1, 0])

    # Create cost and gradient function for gradient descent
    shapes = [(hidden_size, input_size), (output_size, hidden_size)]
    flatten_neural_network_cost_gradient = flatten_cost_gradient(neural_network_cost_gradient, shapes)
    cost_gradient = bind_cost_gradient(flatten_neural_network_cost_gradient,
                                       inputs, outputs, sampler=batch_sampler)

    # Check gradient with initial parameters
    parameters_size = sum(np.prod(shape) for shape in shapes)
    initial_parameters = np.random.normal(size=parameters_size)
    result = gradient_check(cost_gradient, initial_parameters)
    self.assertEqual([], result)

    # Train neural network (this is slow even for such a simple task!)
    final_parameters, cost_history = gradient_descent(cost_gradient, initial_parameters, 1000)

    # Check if cost decreases monotonically (no guarantee in theory, but works in practice)
    previous_cost = None
    for cost in cost_history:
        if previous_cost is not None:
            self.assertLessEqual(cost, previous_cost)
        previous_cost = cost

    # TODO: extract duplicated code for prediction to reusable component
    split_index = hidden_size * input_size
    W1, W2 = np.split(final_parameters, [split_index])
    W1 = W1.reshape((hidden_size, input_size))
    W2 = W2.reshape((output_size, hidden_size))
    for input, output in zip(inputs, outputs):
        input = input.reshape(-1, 1)
        hidden_layer = expit(W1.dot(input))
        inside_softmax = W2.dot(hidden_layer)
        prediction = softmax(inside_softmax.reshape(-1)).reshape(-1, 1)
        label = np.argmax(prediction)

        # Check if output is correctly predicted
        self.assertEqual(output, label)
def multinomial_logistic_regression_cost_gradient(parameters, input, output):
    """
    Cost and gradient for multinomial logistic regression
    :param parameters: weight matrix of shape (input_size, output_size)
    :param input: feature vector
    :param output: integer label
    :return: cost and gradient for the input and output
    """
    prediction = softmax(np.dot(parameters.T, input))
    cost = -np.log(prediction[output])

    # Create one-hot vector for the correct label; the cross-entropy gradient
    # through softmax is input * (prediction - one_hot)^T
    one_hot = np.zeros_like(prediction)
    one_hot[output] = 1
    gradient = np.dot(input.reshape(-1, 1), (prediction - one_hot).reshape(-1, 1).T)
    return cost, gradient
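# A small usage sketch, not from the original source: call the cost/gradient
# function on one random example and confirm the gradient matches the parameter
# shape; numpy (np) and the module's softmax helper are assumed to be in scope.
import numpy as np

input_size, output_size = 4, 3
parameters = np.random.normal(size=(input_size, output_size))
feature_vector = np.random.normal(size=input_size)
label = 2
cost, gradient = multinomial_logistic_regression_cost_gradient(parameters, feature_vector, label)
assert gradient.shape == parameters.shape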
def softmax_cost_gradient(parameters, input, output):
    """
    Softmax cost and gradient function for word2vec models
    :param parameters: word vectors for input and output (shape: (2, vocabulary_size, vector_size))
    :param input: index to input word vectors
    :param output: index to output word vectors
    :return: cross entropy cost and gradient
    """
    input_vectors, output_vectors = parameters
    input_vector = input_vectors[input]
    prediction = softmax(output_vectors.dot(input_vector))
    one_hot_vector = np.zeros_like(prediction)
    one_hot_vector[output] = 1
    gradient_input = np.zeros_like(input_vectors)
    gradient_input[input] = output_vectors.T.dot(prediction - one_hot_vector)
    gradient_output = (prediction - one_hot_vector).reshape(-1, 1).dot(input_vector.reshape(-1, 1).T)
    gradient = np.array([gradient_input, gradient_output])
    cost = -np.log(prediction[output])
    return cost, gradient
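# A usage sketch with assumed toy shapes, not from the original source: the
# parameters stack the input ("center") and output ("context") word vectors,
# and each call scores one (input word, output word) pair; numpy (np) and the
# module's softmax helper are assumed to be in scope.
import numpy as np

vocabulary_size, vector_size = 6, 3
input_vectors = np.random.normal(size=(vocabulary_size, vector_size))
output_vectors = np.random.normal(size=(vocabulary_size, vector_size))
parameters = np.array([input_vectors, output_vectors])
cost, gradient = softmax_cost_gradient(parameters, 0, 3)  # e.g. center word 0, context word 3
assert gradient.shape == parameters.shape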
def test_multinomial_logistic_regression(self):
    input_size = 10
    output_size = 5
    input = np.random.normal(size=(input_size,))
    output = np.random.randint(0, output_size)

    def multinomial_logistic_regression_wrapper(parameters):
        return multinomial_logistic_regression_cost_gradient(parameters, input, output)

    initial_parameters = np.random.normal(size=(input_size, output_size))
    result = gradient_check(multinomial_logistic_regression_wrapper, initial_parameters)
    self.assertEqual([], result)

    # Train multinomial logistic regression and see if it predicts correct label
    final_parameters, cost_history = gradient_descent(
        multinomial_logistic_regression_wrapper, initial_parameters, 100)
    prediction = softmax(np.dot(final_parameters.T, input)) > 0.5
    for i in range(len(prediction)):
        if output == i:
            self.assertEqual(1, prediction[i])
        else:
            self.assertEqual(0, prediction[i])