import numpy as np
from cs231n.optim import sgd_momentum


def test_sgd_momentum():
    N, D = 4, 5
    w = np.linspace(-0.4, 0.6, num=N * D).reshape(N, D)
    dw = np.linspace(-0.6, 0.4, num=N * D).reshape(N, D)
    v = np.linspace(0.6, 0.9, num=N * D).reshape(N, D)

    config = {'learning_rate': 1e-3, 'velocity': v}
    next_w, _ = sgd_momentum(w, dw, config=config)

    expected_next_w = np.asarray(
        [[0.1406,     0.20738947, 0.27417895, 0.34096842, 0.40775789],
         [0.47454737, 0.54133684, 0.60812632, 0.67491579, 0.74170526],
         [0.80849474, 0.87528421, 0.94207368, 1.00886316, 1.07565263],
         [1.14244211, 1.20923158, 1.27602105, 1.34281053, 1.4096]])
    expected_velocity = np.asarray(
        [[0.5406,     0.55475789, 0.56891579, 0.58307368, 0.59723158],
         [0.61138947, 0.62554737, 0.63970526, 0.65386316, 0.66802105],
         [0.68217895, 0.69633684, 0.71049474, 0.72465263, 0.73881053],
         [0.75296842, 0.76712632, 0.78128421, 0.79544211, 0.8096]])

    assert_close(next_w, expected_next_w)
    assert_close(expected_velocity, config['velocity'])
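# The test above depends on an `assert_close` helper that is not shown in this
# snippet. A minimal sketch, assuming it enforces the same relative-error bound
# (< 1e-8) that the check cell below prints; the name, signature, and tolerance
# are assumptions, not taken from the source:

def assert_close(x, y, tol=1e-8):
    # Hypothetical helper: relative error in the same form as the course's
    # rel_error, asserted against a tolerance.
    err = np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))
    assert err < tol, 'relative error %.2e exceeds %.2e' % (err, tol)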
plt.plot(solver.loss_history, 'o')
plt.title('Training loss history')
plt.xlabel('Iteration')
plt.ylabel('Training loss')
plt.show()
# # SGD+Momentum
# Stochastic gradient descent with momentum is a widely used update rule that tends to make deep networks converge faster than vanilla stochastic gradient descent.
#
# Open the file `cs231n/optim.py` and read the documentation at the top of the file to make sure you understand the API. Implement the SGD+momentum update rule in the function `sgd_momentum` and run the following to check your implementation. You should see errors less than 1e-8.

# In[ ]:

from cs231n.optim import sgd_momentum

N, D = 4, 5
w = np.linspace(-0.4, 0.6, num=N * D).reshape(N, D)
dw = np.linspace(-0.6, 0.4, num=N * D).reshape(N, D)
v = np.linspace(0.6, 0.9, num=N * D).reshape(N, D)

config = {'learning_rate': 1e-3, 'velocity': v}
next_w, _ = sgd_momentum(w, dw, config=config)

expected_next_w = np.asarray(
    [[0.1406,     0.20738947, 0.27417895, 0.34096842, 0.40775789],
     [0.47454737, 0.54133684, 0.60812632, 0.67491579, 0.74170526],
     [0.80849474, 0.87528421, 0.94207368, 1.00886316, 1.07565263],
     [1.14244211, 1.20923158, 1.27602105, 1.34281053, 1.4096]])
expected_velocity = np.asarray(
    [[0.5406,     0.55475789, 0.56891579, 0.58307368, 0.59723158],
     [0.61138947, 0.62554737, 0.63970526, 0.65386316, 0.66802105],
     [0.68217895, 0.69633684, 0.71049474, 0.72465263, 0.73881053],
     [0.75296842, 0.76712632, 0.78128421, 0.79544211, 0.8096]])

print('next_w error: ', rel_error(next_w, expected_next_w))
print('velocity error: ', rel_error(expected_velocity, config['velocity']))
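# Writing `sgd_momentum` is the assignment; for orientation, here is a minimal
# sketch consistent with the expected values above. It assumes the classic
# (non-Nesterov) momentum formulation, and the default momentum of 0.9 is
# inferred from the expected output rather than stated in this snippet.

# In[ ]:

def sgd_momentum_sketch(w, dw, config=None):
    """Sketch of classic momentum: v = mu * v - lr * dw; next_w = w + v."""
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)  # mu = 0.9 inferred from expected values
    v = config.get('velocity', np.zeros_like(w))

    v = config['momentum'] * v - config['learning_rate'] * dw
    next_w = w + v
    config['velocity'] = v  # persist velocity so the caller's config sees it
    return next_w, config

# Spot-checking the first entry: v = 0.9 * 0.6 - 1e-3 * (-0.6) = 0.5406 and
# next_w = -0.4 + 0.5406 = 0.1406, matching expected_velocity[0, 0] and
# expected_next_w[0, 0].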