import numpy as np

# Run with batch size 50, learning rate 1e-3, no dropout; the loader is given
# the `permuted` flag.
batch_size = 50
learning_rate = 1e-3
niterations = 25000
momentum = 0.90
dropout = 0.00
gradient_clip = (-1.0, 1.0)
save_every = 1000
plot_every = 100
logs = {}

# `loader`, `model`, and `permuted` are assumed to be defined by the surrounding script.
data = loader(batch_size=batch_size, permuted=permuted)

def dW(W):
    """One forward/backward pass at weights W; returns clipped gradients and loss."""
    load_weights(model, W)
    forget(model)  # reset any recurrent state before the new batch
    inputs, labels = data.fetch()
    preds = forward(model, inputs)
    backward(model, labels)
    gradients = extract_grads(model)
    clipped_gradients = np.clip(gradients, gradient_clip[0], gradient_clip[1])
    loss = -np.sum(labels * np.log(preds)) / batch_size  # mean cross-entropy over the batch
    return clipped_gradients, loss  # return statement assumed; absent in the original listing
# Same setup with batch size 16; additionally computes a gradient statistic.
batch_size = 16
learning_rate = 1e-3
niterations = 25000
momentum = 0.90
dropout = 0.00
gradient_clip = (-1.0, 1.0)
save_every = 1000
plot_every = 100
logs = {}

data = loader(batch_size=batch_size, permuted=permuted)

def dW(W):
    load_weights(model, W)
    forget(model)
    inputs, labels = data.fetch()
    preds = forward(model, inputs)
    backward(model, labels)
    gradients = extract_grads(model)
    clipped_gradients = np.clip(gradients, gradient_clip[0], gradient_clip[1])
    loss = -np.sum(labels * np.log(preds)) / batch_size
    # Despite the name, this is the mean squared gradient entry, not a true L2 norm.
    gradient_norm = (gradients ** 2).sum() / gradients.size
    return clipped_gradients, loss, gradient_norm  # return statement assumed; absent in the original listing
# Non-permuted variant: learning rate 1e-2 and dropout 0.10.
batch_size = 50
learning_rate = 1e-2
niterations = 25000
momentum = 0.9
dropout = 0.10
gradient_clip = (-1.0, 1.0)
save_every = 1000
plot_every = 100
logs = {}

data = loader(batch_size=batch_size)

def dW(W):
    load_weights(model, W)
    forget(model)
    inputs, labels = data.fetch()
    preds = forward(model, inputs)
    backward(model, labels)
    gradients = extract_grads(model)
    clipped_gradients = np.clip(gradients, gradient_clip[0], gradient_clip[1])
    loss = -np.sum(labels * np.log(preds)) / batch_size
    gradient_norm = (gradients ** 2).sum() / gradients.size  # mean squared gradient entry
    return clipped_gradients, loss, gradient_norm  # return statement assumed; absent in the original listing
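# The listings above define dW but never show the outer loop that consumes it.
# Below is a minimal sketch of how these hyperparameters might drive a
# momentum-SGD loop, assuming dW returns (clipped_gradients, loss,
# gradient_norm) as reconstructed above; W0 (the initial weight vector) and
# save_weights are hypothetical helpers, not part of the original listings.
velocity = np.zeros_like(W0)
W = W0.copy()
for it in range(1, niterations + 1):
    grads, loss, grad_norm = dW(W)
    # Classical momentum update applied to the clipped gradients.
    velocity = momentum * velocity - learning_rate * grads
    W = W + velocity
    if it % plot_every == 0:
        logs.setdefault('loss', []).append(loss)
        logs.setdefault('gradient_norm', []).append(grad_norm)
    if it % save_every == 0:
        save_weights(model, W)  # hypothetical checkpointing helper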