# To check that you have implemented the gradient correctly, you can
# numerically estimate the gradient of the loss function and compare the
# numeric estimate to the gradient that you computed. We have provided code
# that does this for you:

# In[12]:

# Once you've implemented the gradient, recompute it with the code below
# and gradient check it with the function we provided for you.

# Compute the loss and its gradient at W.
# loss, grad = svm_loss_naive(W, X_train, y_train, 0.0)

# Numerically compute the gradient along several randomly chosen dimensions, and
# compare them with your analytically computed gradient. The numbers should match
# almost exactly along all dimensions.
from cs231n.gradient_check import grad_check_sparse
f = lambda w: svm_loss_naive(w, X_train, y_train, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad, 10)

# ### Inline Question 1:
# It is possible that once in a while a dimension in the gradcheck will not match exactly.
# What could such a discrepancy be caused by? Is it a reason for concern?
# What is a simple example in one dimension where a gradient check could fail?
# *Hint: the SVM loss function is not, strictly speaking, differentiable.*
#
# **Your Answer:** *fill this in.*

# In[ ]:

# Next implement the function svm_loss_vectorized; for now only compute the loss;
# we will implement the gradient in a moment.
tic = time.time()
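# For reference, a minimal sketch of what a sparse numeric gradient check such
# as grad_check_sparse does: perturb W at a few random coordinates with a
# centered difference and compare against the analytic gradient. This is only
# an illustration; the provided cs231n.gradient_check.grad_check_sparse may
# differ in step size, sampling, and error reporting.
import numpy as np

def sparse_numeric_grad_check(f, W, analytic_grad, num_checks=10, h=1e-5):
    for _ in range(num_checks):
        ix = tuple(np.random.randint(d) for d in W.shape)  # random coordinate of W
        old = W[ix]
        W[ix] = old + h
        fxph = f(W)                                  # f(W + h) along this coordinate
        W[ix] = old - h
        fxmh = f(W)                                  # f(W - h) along this coordinate
        W[ix] = old                                  # restore W
        grad_numeric = (fxph - fxmh) / (2.0 * h)     # centered difference
        grad_analytic = analytic_grad[ix]
        rel_error = abs(grad_numeric - grad_analytic) / max(
            1e-12, abs(grad_numeric) + abs(grad_analytic))
        print('numerical: %f analytic: %f, relative error: %e'
              % (grad_numeric, grad_analytic, rel_error))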
loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.000005)
print('loss: %f' % (loss, ))

# Once you've implemented the gradient, recompute it with the code below
# and gradient check it with the function we provided for you.

# Compute the loss and its gradient at W.
loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.0)

# Numerically compute the gradient along several randomly chosen dimensions, and
# compare them with your analytically computed gradient. The numbers should match
# almost exactly along all dimensions.
from cs231n.gradient_check import grad_check_sparse
f = lambda w: svm_loss_naive(w, X_dev, y_dev, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad)

# Do the gradient check once again with regularization turned on.
# You didn't forget the regularization gradient, did you?
loss, grad = svm_loss_naive(W, X_dev, y_dev, 5e1)
f = lambda w: svm_loss_naive(w, X_dev, y_dev, 5e1)[0]
grad_numerical = grad_check_sparse(f, W, grad)

## -------- vectorized ---------

# Next implement the function svm_loss_vectorized; for now only compute the loss;
# we will implement the gradient in a moment.
tic = time.time()
loss_naive, grad_naive = svm_loss_naive(W, X_dev, y_dev, 0.000005)
toc = time.time()
print('Naive loss: %e computed in %fs' % (loss_naive, toc - tic))

tic = time.time()
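# A hedged sketch (not the assignment's reference code) of how L2 regularization
# is typically folded into the loss and gradient that svm_loss_naive returns.
# Forgetting the gradient term below is the usual reason the regularized
# gradient check above fails while the unregularized one passes; whether a
# factor of 0.5 is used depends on the implementation's convention.
import numpy as np

def add_l2_regularization(loss, dW, W, reg):
    """Return (loss, dW) with an L2 penalty reg * sum(W**2) folded in."""
    loss = loss + reg * np.sum(W * W)
    dW = dW + 2.0 * reg * W
    return loss, dW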
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('dev data shape: ', X_dev.shape)
print('dev labels shape: ', y_dev.shape)
print('\n')

'''
Generate a weight matrix and compute the softmax loss with the naive version.
'''
W = np.random.randn(3073, 10) * 1e-4
loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)

# Since W is initialized to very small values, the loss should come out to ~(-log(0.1)).
print(f"naive loss computation: {loss} -log(0.1): {-np.log(0.1)}")

# Check the gradient calculation for accuracy.
f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.0)[0]
grad_check_sparse(f, W, grad, 10)

# Another check, with regularization this time.
loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.000005)
f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.000005)[0]
grad_check_sparse(f, W, grad_naive, 10)

'''
Repeat for the vectorized implementation and compare.
'''
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, 0.000005)
print(f'vectorized loss: {loss_vectorized} -log(0.1): {-np.log(0.1)}')

# Use the Frobenius norm to compare the two versions of the gradient.
grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: ', np.abs(loss_naive - loss_vectorized))
print('Gradient difference: ', grad_difference)

'''
Use the validation set to tune hyperparameters: regularization strength and learning rate.
'''
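# Before moving on to hyperparameter tuning, a quick standalone illustration
# (hedged) of the -log(0.1) sanity check used above: with W initialized near
# zero, all class scores are approximately equal, the softmax output is roughly
# uniform over the 10 classes, and the cross-entropy of a uniform prediction is
# -log(1/10) ~ 2.3026.
import numpy as np

num_classes_demo = 10
demo_scores = np.zeros(num_classes_demo)                       # near-zero scores
demo_probs = np.exp(demo_scores) / np.exp(demo_scores).sum()   # uniform, each ~ 0.1
print(-np.log(demo_probs[0]), -np.log(1.0 / num_classes_demo))  # both ~ 2.302585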
loss_n, grad_n = softmax_loss_naive(X_dev, y_dev, W_dev, 0.005)
print('loss_n: %f' % (loss_n, ))
print('grad: {0}'.format(grad_n[:2, :5]))

loss_v, grad_v = softmax_loss_vectorized(X_dev, y_dev, W_dev, 0.005)
print('loss_v: %f' % (loss_v, ))
print('grad: {0}'.format(grad_v[:2, :5]))

# In[ ]:

from cs231n.gradient_check import grad_check_sparse

# In[68]:

# Gradient-check the naive analytic gradient (use the same reg as above so the
# numeric and analytic gradients come from the same function).
f = lambda w: softmax_loss_naive(X_dev, y_dev, w, 0.005)[0]
grad_numerical = grad_check_sparse(f, W_dev, grad_n)

# In[65]:

f = lambda w: softmax_loss_vectorized(X_dev, y_dev, w, 0.005)[0]
grad_numerical = grad_check_sparse(f, W_dev, grad_v)

# In[70]:

import time

tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(X_dev, y_dev, W_dev, 0.005)
toc = time.time()
print('Naive loss: %e computed in %fs' % (loss_naive, toc - tic))

tic = time.time()
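# Hedged sketch of a numerically stable, fully vectorized softmax loss of the
# kind softmax_loss_vectorized is expected to compute. The argument order
# follows this file's (X, y, W, reg) convention; the actual reference
# implementation may differ in details such as the regularization constant.
import numpy as np

def softmax_loss_vectorized_sketch(X, y, W, reg):
    num_train = X.shape[0]
    scores = X.dot(W)                                # (N, C) class scores
    scores -= scores.max(axis=1, keepdims=True)      # shift rows for numerical stability
    exp_scores = np.exp(scores)
    probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    data_loss = -np.log(probs[np.arange(num_train), y]).mean()
    loss = data_loss + reg * np.sum(W * W)
    dscores = probs.copy()
    dscores[np.arange(num_train), y] -= 1.0          # dL/dscores = probs - one_hot
    dW = X.T.dot(dscores) / num_train + 2.0 * reg * W
    return loss, dW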
X_test = X_test[mask]
y_test = y_test[mask]

# Train the model
X_train = np.reshape(X_train, (X_train.shape[0], -1))  # flatten each image to a 1-D row
X_train = np.hstack([X_train, np.ones([X_train.shape[0], 1])])
X_test = np.reshape(X_test, (X_test.shape[0], -1))  # flatten each image to a 1-D row
X_test = np.hstack([X_test, np.ones([X_test.shape[0], 1])])

num_class = 10
W = np.random.randn(X_train.shape[1], num_class) * 0.001

# Compare the numeric gradient with the analytic gradient
from cs231n.classifiers import softmax_loss_naive, softmax_loss_vectorized
loss, grad = softmax_loss_naive(W, X_train, y_train, 0.5)

from cs231n.gradient_check import grad_check_sparse
f = lambda w: softmax_loss_vectorized(w, X_train, y_train, 0.5)[0]
grad_check_sparse(f, W, grad)

from cs231n.classifiers import Softmax
classifier = Softmax()
loss_hist = classifier.train(X_train, y_train, verbose=True, num_iters=5000, batch_size=100)

plt.plot(loss_hist)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.show()

# Test (generalization) accuracy
y_pred = classifier.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print("Test accuracy:", accuracy)
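# Hedged sketch of the kind of minibatch SGD loop that a classifier like
# Softmax typically runs inside .train(); the actual cs231n LinearClassifier
# differs in structure, defaults, and how it initializes W. Shown only to
# clarify what num_iters and batch_size control in the call above.
import numpy as np

def sgd_train_sketch(loss_fn, W, X, y, learning_rate=1e-3, reg=1e-5,
                     num_iters=100, batch_size=200):
    num_train = X.shape[0]
    loss_history = []
    for it in range(num_iters):
        # Sample a minibatch (with replacement, for speed).
        batch_idx = np.random.choice(num_train, batch_size, replace=True)
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        loss, grad = loss_fn(W, X_batch, y_batch, reg)   # e.g. softmax_loss_vectorized
        loss_history.append(loss)
        W -= learning_rate * grad                        # vanilla gradient descent step
    return W, loss_history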
def main():
    X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()

    num_training = 49000
    num_dev = 500
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev = X_train[mask]
    y_dev = y_train[mask]

    # Preprocessing: reshape the image data into rows
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    X_dev -= mean_image

    # Add bias dimension and transform into columns
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])

    # Generate a random softmax weight matrix and use it to compute the loss.
    W = np.random.randn(3073, 10) * 0.0001
    loss, grad = softmax_loss_naive(W, X_dev, y_dev, 0.0)

    # As a rough sanity check, our loss should be close to -log(0.1): since W is
    # initialized with small random values, the predicted probability of each
    # class is approximately uniform, i.e. about 1/10, where 10 is the number of classes.
    print('loss: %f' % loss)
    print('sanity check: %f' % (-np.log(0.1)))

    f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 0.0)[0]
    grad_numerical = grad_check_sparse(f, W, grad, 10)

    # Similar to the SVM case, do another gradient check with regularization.
    loss, grad = softmax_loss_naive(W, X_dev, y_dev, 1e2)
    f = lambda w: softmax_loss_naive(w, X_dev, y_dev, 1e2)[0]
    grad_numerical = grad_check_sparse(f, W, grad, 10)

    # Implement a vectorized version in softmax_loss_vectorized.
    tic = time.time()
    loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.00001)
    toc = time.time()
    print('Naive loss: {} computed in {}'.format(loss_naive, toc - tic))

    tic = time.time()
    loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, 0.00001)
    toc = time.time()
    print('Vectorized loss: {} computed in {}'.format(loss_vectorized, toc - tic))

    grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
    print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
    print('Gradient difference: %f' % grad_difference)

    # Use the validation set to tune hyperparameters (regularization strength and
    # learning rate). You should experiment with different ranges for the learning
    # rates and regularization strengths; if you are careful you should be able to
    # get a classification accuracy of over 0.35 on the validation set.
    results = {}
    best_val = -1
    best_softmax = None
    learning_rates = [1e-7, 2e-7, 5e-7]
    # regularization_strengths = [5e4, 1e8]
    regularization_strengths = [(1 + 0.1 * i) * 1e4 for i in range(-3, 4)] \
        + [(5 + 0.1 * i) * 1e4 for i in range(-3, 4)]

    for lr in learning_rates:
        for rs in regularization_strengths:
            print('Training Softmax with rs {} and lr {}'.format(rs, lr))
            softmax = Softmax()
            softmax.train(X_train, y_train, lr, rs, num_iters=2000)
            y_train_pred = softmax.predict(X_train)
            train_accuracy = np.mean(y_train == y_train_pred)
            y_val_pred = softmax.predict(X_val)
            val_accuracy = np.mean(y_val == y_val_pred)
            if val_accuracy > best_val:
                best_val = val_accuracy
                best_softmax = softmax
            results[(lr, rs)] = train_accuracy, val_accuracy

    # Print out results.
    for lr, reg in sorted(results):
        train_accuracy, val_accuracy = results[(lr, reg)]
        print('lr %e reg %e train accuracy: %f val accuracy: %f' %
              (lr, reg, train_accuracy, val_accuracy))
    print('best validation accuracy achieved during cross-validation: %f' % best_val)

    # Evaluate the best softmax on the test set
    y_test_pred = best_softmax.predict(X_test)
    test_accuracy = np.mean(y_test == y_test_pred)
    print('softmax on raw pixels final test set accuracy: %f' % (test_accuracy, ))

    # Visualize the learned weights for each class
    w = best_softmax.W[:-1, :]  # strip out the bias
    w = w.reshape(32, 32, 3, 10)
    w_min, w_max = np.min(w), np.max(w)
    classes = [
        'plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship',
        'truck'
    ]
    for i in range(10):
        plt.subplot(2, 5, i + 1)
        # Rescale the weights to be between 0 and 255
        wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)
        plt.imshow(wimg.astype('uint8'))
        plt.axis('off')
        plt.title(classes[i])
    plt.show()


if __name__ == '__main__':
    main()