import numpy as np

# `c` (for c.eps), `util` (exp_running_avg, accuracy), and `get_minibatch`
# are repo-local helpers imported/defined elsewhere in this project.


def adam(nn, X_train, y_train, val_set=None, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    # First (M) and second (R) moment estimates, one per parameter tensor
    M = {k: np.zeros_like(v) for k, v in nn.model.items()}
    R = {k: np.zeros_like(v) for k, v in nn.model.items()}
    beta1 = .9
    beta2 = .999

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        t = iter
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            # Exponential running averages of the gradient and its square
            M[k] = util.exp_running_avg(M[k], grad[k], beta1)
            R[k] = util.exp_running_avg(R[k], grad[k]**2, beta2)

            # Bias correction for the zero-initialized moment estimates
            m_k_hat = M[k] / (1. - beta1**t)
            r_k_hat = R[k] / (1. - beta2**t)

            nn.model[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn
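# Minimal reference sketches of the two helpers the optimizers here rely on.
# `util.exp_running_avg` and `get_minibatch` live elsewhere in the repo; the
# versions below are plausible implementations inferred from their call sites
# (array inputs assumed), not the repo's own definitions.

def exp_running_avg_sketch(running, new, gamma=.9):
    # Exponential moving average: keep `gamma` of the old value and blend in
    # (1 - gamma) of the new observation.
    return gamma * running + (1. - gamma) * new


def get_minibatch_sketch(X, y, mb_size, shuffle=True):
    # Split (X, y) into a list of (X_mini, y_mini) chunks of size mb_size,
    # optionally shuffling first; the RNN path passes shuffle=False so that
    # consecutive minibatches stay contiguous in the sequence.
    if shuffle:
        perm = np.random.permutation(X.shape[0])
        X, y = X[perm], y[perm]
    return [(X[i:i + mb_size], y[i:i + mb_size])
            for i in range(0, X.shape[0], mb_size)]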
def adam_rnn(nn, X_train, y_train, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    M = {k: np.zeros_like(v) for k, v in nn.model.items()}
    R = {k: np.zeros_like(v) for k, v in nn.model.items()}
    beta1 = .9
    beta2 = .999

    # Sequences must stay in order, so the minibatches are not shuffled
    minibatches = get_minibatch(X_train, y_train, mb_size, shuffle=False)

    idx = 0
    state = nn.initial_state()
    # Start the smoothed loss at the cross-entropy of a uniform prediction
    # over the vocabulary
    smooth_loss = -np.log(1.0 / len(set(X_train)))

    for iter in range(1, n_iter + 1):
        t = iter

        # Wrap around at the end of an epoch and reset the hidden state
        if idx >= len(minibatches):
            idx = 0
            state = nn.initial_state()

        X_mini, y_mini = minibatches[idx]
        idx += 1

        if iter % print_after == 0:
            print("=========================================================================")
            print('Iter-{} loss: {:.4f}'.format(iter, smooth_loss))
            print("=========================================================================")

            # Sample 100 steps from the model, seeded with the minibatch's first input
            sample = nn.sample(X_mini[0], state, 100)
            print(sample)

            print("=========================================================================")
            print()
            print()

        grad, loss, state = nn.train_step(X_mini, y_mini, state)
        # Exponentially smoothed loss for less noisy logging
        smooth_loss = 0.999 * smooth_loss + 0.001 * loss

        for k in grad:
            M[k] = util.exp_running_avg(M[k], grad[k], beta1)
            R[k] = util.exp_running_avg(R[k], grad[k]**2, beta2)

            m_k_hat = M[k] / (1. - beta1**t)
            r_k_hat = R[k] / (1. - beta2**t)

            nn.model[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn
def rmsprop(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    # Running average of squared gradients, one entry per parameter tensor
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            cache[k] = util.exp_running_avg(cache[k], grad[k]**2, gamma)
            # Scale each weight's step by the inverse RMS of its recent gradients
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)

    return nn
def bn_forward(X, gamma, beta, cache, momentum=.9, train=True):
    running_mean, running_var = cache

    if train:
        # Normalize with the statistics of the current minibatch
        mu = np.mean(X, axis=0)
        var = np.var(X, axis=0)

        X_norm = (X - mu) / np.sqrt(var + c.eps)
        out = gamma * X_norm + beta

        cache = (X, X_norm, mu, var, gamma, beta)

        # Track population statistics for use at inference time
        running_mean = util.exp_running_avg(running_mean, mu, momentum)
        running_var = util.exp_running_avg(running_var, var, momentum)
    else:
        # At inference, normalize with the accumulated running statistics
        X_norm = (X - running_mean) / np.sqrt(running_var + c.eps)
        out = gamma * X_norm + beta
        cache = None

    return out, cache, running_mean, running_var
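# The cache that `bn_forward` returns in train mode carries everything the
# backward pass needs. Below is a minimal sketch of a matching backward pass
# using the standard batchnorm gradient derivation; the name
# `bn_backward_sketch` is illustrative, not the repo's own function.

def bn_backward_sketch(dout, cache):
    X, X_norm, mu, var, gamma, beta = cache
    N = X.shape[0]

    X_mu = X - mu
    std_inv = 1. / np.sqrt(var + c.eps)

    # Gradients of the learned scale and shift
    dgamma = np.sum(dout * X_norm, axis=0)
    dbeta = np.sum(dout, axis=0)

    # Backprop through the normalization: var and mu both depend on X,
    # so their gradients feed back into dX
    dX_norm = dout * gamma
    dvar = np.sum(dX_norm * X_mu, axis=0) * -.5 * std_inv**3
    dmu = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. * X_mu, axis=0)

    dX = dX_norm * std_inv + dvar * 2. * X_mu / N + dmu / N
    return dX, dgamma, dbeta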