def softmax_cross_entropy_loss(Z, Y=np.array([])):
    '''
    Computes the softmax activation of the inputs Z
    Estimates the cross entropy loss

    Inputs:
        Z - numpy.ndarray (n, m)
        Y - numpy.ndarray (1, m) of labels
            when Y=[] loss is set to []

    Returns:
        A - numpy.ndarray (n, m) of softmax activations
        cache - a dictionary to store the activations
                later used to estimate derivatives
        loss - cost of prediction
    '''
    ### CODE HERE
    # Numerically stable softmax: subtract the column-wise max before exponentiating
    maxz = np.max(Z, axis=0, keepdims=True)
    A = np.exp(Z - maxz)
    A = A / np.sum(A, axis=0, keepdims=True)

    cache_Activation = {}
    cache_Activation["Activation"] = A

    # When no labels are given, skip the loss (as documented above)
    if Y.size == 0:
        return A, cache_Activation, []

    # Cross entropy loss averaged over the m samples
    one_hot_vector = one_hot(Y.astype(int), 10)
    loss = -np.sum(one_hot_vector.T * np.log(A)) / Y.shape[1]
    return A, cache_Activation, loss
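# one_hot() is called throughout this file but is not defined in this section.
# The sketch below is only an assumed implementation, consistent with how it is
# used here (labels of shape (1, m) in, an (m, n_classes) indicator matrix out,
# so that one_hot(Y, 10).T lines up with the (10, m) activations); the project's
# actual helper may differ. Assumes numpy is already imported as np.
def one_hot(Y, n_classes):
    Y = np.asarray(Y).astype(int).reshape(-1)      # flatten (1, m) -> (m,)
    out = np.zeros((Y.shape[0], n_classes))
    out[np.arange(Y.shape[0]), Y] = 1.0            # mark each sample's class
    return out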
def neuralNetwork(X, Y, lDim, nEpoch, alpha, Xdev, Ydev, k, miniBatchSize):
    parameters = initParameter(lDim)
    v, s = initParameterAdam(parameters)
    costTrain = []
    costDev = []
    tAdam = 0
    for i in range(nEpoch):
        # Reshuffle the training set into minibatches every epoch
        miniBatch = initRandomMiniBatch(X, Y, miniBatchSize)
        for (miniBatchX, miniBatchY) in miniBatch:
            AL, cache = forwardPropagation(miniBatchX, parameters)
            costTr = costNN(AL, one_hot(miniBatchY.astype(int), 10))
            gradients = backPropagation(AL, one_hot(miniBatchY.astype(int), 10),
                                        parameters, cache, k)
            tAdam = tAdam + 1
            parameters, v, s = optimizerAdam(parameters, gradients, alpha, v, s, tAdam)
        # Record training cost (last minibatch) and dev-set cost once per epoch
        costTrain.append(costTr)
        costD = errorDev(Xdev, one_hot(Ydev.astype(int), 10), parameters)
        costDev.append(costD)
        Ypred, _ = predictClass(Xdev, parameters)
        print(accuracy(Ydev, Ypred))
    return parameters, costTrain, costDev
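# initRandomMiniBatch() is used above but not defined in this section. The
# sketch below is only an assumed implementation: shuffle the columns of X and
# Y together, then cut them into (miniBatchX, miniBatchY) tuples of the given
# size. The project's own helper may differ (e.g. in how it handles a final
# partial batch).
def initRandomMiniBatch(X, Y, miniBatchSize):
    m = X.shape[1]
    perm = np.random.permutation(m)
    Xs, Ys = X[:, perm], Y[:, perm]
    return [(Xs[:, i:i + miniBatchSize], Ys[:, i:i + miniBatchSize])
            for i in range(0, m, miniBatchSize)]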
def softmax_cross_entropy_loss_der(Y, cache):
    '''
    Computes the derivative of softmax activation and cross entropy loss

    Inputs:
        Y - numpy.ndarray (1, m) of labels
        cache - a dictionary with cached activations A of size (n,m)

    Returns:
        dZ - numpy.ndarray (n, m) derivative for the previous layer
    '''
    ### CODE HERE
    # For softmax + cross entropy the derivative simplifies to (A - Y_one_hot) / m
    one_hot_vector = one_hot(Y.astype(int), 10)
    dZ = (cache["Activation"] - one_hot_vector.T) / Y.shape[1]
    return dZ
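# A quick finite-difference check of the derivative above: perturb each entry
# of Z, re-evaluate the cross entropy loss, and compare against dZ from
# softmax_cross_entropy_loss_der. A minimal sketch assuming the two functions
# above (and numpy as np) are available; the seed, sizes, and printout are
# illustrative only.
def check_softmax_grad(n_classes=10, m=4, eps=1e-6):
    np.random.seed(0)
    Z = np.random.randn(n_classes, m)
    Y = np.random.randint(0, n_classes, size=(1, m))
    _, cache, _ = softmax_cross_entropy_loss(Z, Y)
    dZ = softmax_cross_entropy_loss_der(Y, cache)
    num = np.zeros_like(Z)
    for i in range(n_classes):
        for j in range(m):
            Zp, Zm = Z.copy(), Z.copy()
            Zp[i, j] += eps
            Zm[i, j] -= eps
            _, _, lp = softmax_cross_entropy_loss(Zp, Y)
            _, _, lm = softmax_cross_entropy_loss(Zm, Y)
            num[i, j] = (lp - lm) / (2 * eps)   # centered difference
    print("max |numerical - analytic| =", np.max(np.abs(num - dZ)))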
def main():
    trainDevX, trainDevY, testX, testY = \
        mnist(noTrSamples=50000, noTsSamples=5000,
              digit_range=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
              noTrPerClass=5000, noTsPerClass=500)

    # Shuffle the 50000 train+dev samples, then split 45000 / 5000
    perm = np.random.permutation(trainDevX.shape[1])
    trainDevX = trainDevX[:, perm]
    trainDevY = trainDevY[:, perm]
    devX = trainDevX[:, 45000:50000]
    devY = trainDevY[:, 45000:50000]
    trainX = trainDevX[:, 0:45000]
    trainY = trainDevY[:, 0:45000]

    lDim = [784, 500, 10]
    miniBatchSize = 100
    k = [[50], [100]]
    nEpoch = 200
    alpha = 0.001
    costHL = []
    accDev = []
    accTest = []
    errDev = []
    errTest = []
    for i in range(len(k)):
        print(k[i])
        parameters, costTrain, costDev = neuralNetwork(
            trainX, trainY, lDim, nEpoch, alpha, devX, devY, k[i], miniBatchSize)
        costHL.append((costTrain, costDev))
        errDev.append(costDev[nEpoch - 1])
        Ypred, _ = predictClass(devX, parameters)
        accDev.append(accuracy(devY, Ypred))
        Ypred, AL = predictClass(testX, parameters)
        accTest.append(accuracy(testY, Ypred))
        errTest.append(costNN(AL, one_hot(testY.astype(int), 10)))
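# accuracy() and predictClass() are referenced in main() above but not defined
# in this section. The sketch below is only an assumed accuracy helper: it
# presumes predictClass returns predicted class indices with the same (1, m)
# shape as the label vector, which may not match the project's actual helpers.
def accuracy(Ytrue, Ypred):
    Ytrue = np.asarray(Ytrue).astype(int).reshape(-1)
    Ypred = np.asarray(Ypred).astype(int).reshape(-1)
    return np.mean(Ytrue == Ypred) * 100.0   # percentage of correct predictions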
def main():
    '''
    Trains a multilayer network for MNIST digit classification (all 10 digits)

    To create a network with 1 hidden layer of dimension 800
    run the program as:
        python deepMultiClassNetwork_starter.py "[784,800]"
    The network will have the dimensions [784,800,10]
    784 is the input size of digit images (28pix x 28pix = 784)
    10 is the number of digits

    To create a network with 2 hidden layers of dimensions 800 and 500
    run the program as:
        python deepMultiClassNetwork_starter.py "[784,800,500]"
    The network will have the dimensions [784,800,500,10]
    784 is the input size of digit images (28pix x 28pix = 784)
    10 is the number of digits
    '''
    net_dims = ast.literal_eval(sys.argv[1])
    net_dims.append(10)  # Adding the digits layer with dimensionality = 10
    print("Network dimensions are:" + str(net_dims))

    # getting the subset dataset from MNIST
    train_data, train_label, test_data, test_label = \
        mnist(noTrSamples=6000, noTsSamples=1000,
              digit_range=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
              noTrPerClass=600, noTsPerClass=100)
    # hold out the last 1000 samples for validation; train on the first 5000
    valid_data, valid_label = train_data[:, 5000:6000], train_label[:, 5000:6000]
    train_data, train_label = train_data[:, 0:5000], train_label[:, 0:5000]

    # initialize learning rate and num_iterations
    learning_rate = 10.0
    num_iterations = 1000
    num_train_samples = 5000
    num_validate_samples = 1000
    num_test_samples = 1000

    costs, parameters, costs_valid = multi_layer_network(
        train_data, train_label, valid_data, valid_label, net_dims,
        num_iterations=num_iterations, learning_rate=learning_rate, decay_rate=0.0)

    # compute the accuracy for training set and testing set
    train_Pred = classify(train_data, parameters)
    test_Pred = classify(test_data, parameters)

    train_label_new = one_hot(train_label.astype(int), 10)
    test_label_new = one_hot(test_label.astype(int), 10)

    # each misclassified sample differs from its one-hot label in two positions,
    # so the number of errors is half the absolute-difference sum
    result_train = np.sum(np.abs(train_Pred - train_label_new.T)) / 2
    print("Train error is : %f" % (result_train))
    result_test = np.sum(np.abs(test_Pred - test_label_new.T)) / 2
    print("Test error is : %f" % (result_test))

    trAcc = (num_train_samples - result_train) / num_train_samples * 100
    teAcc = (num_test_samples - result_test) / num_test_samples * 100
    print("Accuracy for training set is {0:0.3f} %".format(trAcc))
    print("Accuracy for testing set is {0:0.3f} %".format(teAcc))

    ### CODE HERE to plot costs
    x = range(0, num_iterations, 10)  # costs are recorded every 10 iterations
    plt.plot(x, costs, label='training')
    plt.plot(x, costs_valid, label='validation')
    plt.xlabel('iterations')
    plt.ylabel('cost')
    plt.title('Training and validation cost')
    plt.legend()
    plt.show()
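# classify() is used in main() above but not defined in this section. The
# sketch below is only an assumed implementation, chosen to match how the
# predictions are compared against one-hot labels here: it returns a (10, m)
# indicator matrix with a 1 at the arg-max class of each column. It leans on
# multi_layer_forward and softmax_cross_entropy_loss from this file; the
# project's own classify may differ.
def classify(X, parameters):
    AL, _ = multi_layer_forward(X, parameters)   # final-layer scores, shape (10, m)
    A, _, _ = softmax_cross_entropy_loss(AL)     # softmax probabilities, no labels
    pred = np.zeros_like(A)
    pred[np.argmax(A, axis=0), np.arange(A.shape[1])] = 1.0
    return pred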
def multi_layer_network(X, Y, valid_data, valid_label, net_dims,
                        num_iterations, learning_rate, decay_rate=0.01):
    # note: default decay_rate changed from the starter value
    '''
    Creates the multilayer network and trains the network

    Inputs:
        X - numpy.ndarray (n,m) of training data
        Y - numpy.ndarray (1,m) of training data labels
        net_dims - tuple of layer dimensions
        num_iterations - num of epochs to train
        learning_rate - step size for gradient descent

    Returns:
        costs - list of training costs over training
        parameters - dictionary of trained network parameters
        costs_valid - list of validation costs over training
    '''
    parameters = initialize_multilayer_weights(net_dims)
    A0 = X
    costs = []
    costs_valid = []
    for ii in range(num_iterations):
        ### CODE HERE
        # Forward prop: multi_layer_forward gives the final-layer scores,
        # then softmax + cross entropy gives the training cost
        AL, caches = multi_layer_forward(X, parameters)
        A, cache_Activation, cost = softmax_cross_entropy_loss(AL, Y)

        # Backward prop: derivative of softmax/cross entropy, then the
        # layer-by-layer gradients, then the parameter update
        dZ = softmax_cross_entropy_loss_der(Y, cache_Activation)
        grad = multi_layer_backward(dZ, caches, parameters)
        # pass the current iteration and the function's own decay_rate
        parameters, alpha = update_parameters(parameters, grad, ii,
                                              learning_rate, decay_rate=decay_rate)

        if ii % 10 == 0:
            costs.append(cost)
            # Track the validation cost and error on the same schedule
            AL_valid, caches_valid = multi_layer_forward(valid_data, parameters)
            valid_pred = classify(valid_data, parameters)
            _, _, valid_loss = softmax_cross_entropy_loss(AL_valid, valid_label)
            costs_valid.append(valid_loss)
            valid_label_new = one_hot(valid_label.astype(int), 10)
            result_valid = np.sum(np.abs(valid_pred - valid_label_new.T))
            print("validation error is : %f" % (result_valid))
            print("Cost at iteration %i is: %.05f, learning rate: %.05f" %
                  (ii, cost, learning_rate))
    return costs, parameters, costs_valid
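# update_parameters() is called inside multi_layer_network above but not shown
# in this section. The sketch below is only an assumed implementation of plain
# gradient descent with the inverse-time decay its arguments suggest; it
# presumes the parameter/gradient dictionaries use keys like "W1"/"b1" and
# "dW1"/"db1", which may not match the project's actual code.
def update_parameters(parameters, gradients, epoch, learning_rate, decay_rate=0.0):
    alpha = learning_rate / (1.0 + decay_rate * epoch)   # decayed step size
    L = len(parameters) // 2                             # number of weight/bias pairs
    for l in range(1, L + 1):
        parameters["W" + str(l)] -= alpha * gradients["dW" + str(l)]
        parameters["b" + str(l)] -= alpha * gradients["db" + str(l)]
    return parameters, alpha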