import numpy as np
from sklearn.utils import shuffle  # assumed source of shuffle(X, Y)

# transform_data, generate_T, generate_Y, relu, cross_entropy,
# classification_rate, and the derivative_* helpers are assumed to be
# defined elsewhere in this module/package.


def nesterov_momentum(learning_rate):
    # Split the shuffled data 50/50 into train and test sets.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100                    # hidden units
    K = len(set(Y))            # number of classes
    iterations = 50
    batch_N = 250              # batch size
    batches = N // batch_N

    # Initialize weights with scaled Gaussian noise.
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    mu = .9                    # momentum coefficient
    dv = db_1 = dw = db_0 = 0  # velocities

    nesterov_cost = []
    nesterov_cr = []
    best_nesterov = 0
    best_iteration = 0

    for i in range(iterations):
        for b in range(batches):
            Xb = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            Tb = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Yb, Z = generate_Y('tanh', Xb, w, b_0, v, b_1)

            # Track test cost/accuracy once per epoch (first batch only).
            if b == 0:
                Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
                P_test = np.argmax(Y_test, axis=1)
                nesterov_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                nesterov_cr.append(cr)
                if cr > best_nesterov:
                    best_nesterov = cr
                    best_iteration = i

            # Evaluate every gradient once, at the current parameters.
            g_v = derivative_v('tanh', Z, Yb, Tb)
            g_b1 = derivative_b1('tanh', Yb, Tb)
            g_w = derivative_w('tanh', Xb, Yb, Z, Tb, v)
            g_b0 = derivative_b0('tanh', Yb, Z, Tb, v)

            # Nesterov momentum: update the velocity, then step with the
            # look-ahead correction mu * (new velocity) - lr * gradient.
            dv = mu * dv - learning_rate * g_v
            db_1 = mu * db_1 - learning_rate * g_b1
            dw = mu * dw - learning_rate * g_w
            db_0 = mu * db_0 - learning_rate * g_b0
            v += mu * dv - learning_rate * g_v
            b_1 += mu * db_1 - learning_rate * g_b1
            w += mu * dw - learning_rate * g_w
            b_0 += mu * db_0 - learning_rate * g_b0

        if i % 10 == 0:
            print('Nesterov Cost: ', nesterov_cost[i],
                  'Nesterov Classification: ', nesterov_cr[i])
    return nesterov_cost, nesterov_cr, best_nesterov, best_iteration
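
# Illustrative sketch (not part of the training code): the Nesterov-style
# update used above, isolated on the 1-D quadratic f(x) = x**2 so the
# two-step structure is easy to see. All names here are local to the example.
def _nesterov_demo():
    x, velocity = 5.0, 0.0
    lr, mu = 0.1, 0.9          # illustrative values
    for _ in range(100):
        grad = 2 * x                        # f'(x) for f(x) = x**2
        velocity = mu * velocity - lr * grad
        x += mu * velocity - lr * grad      # look-ahead step, as above
    return x                                # approaches the minimum at 0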
def exp_decay(learning_rate):
    # Split the shuffled data 50/50 into train and test sets.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100                      # hidden units
    K = len(set(Y))              # number of classes
    iterations = 50
    batch_N = 250                # batch size
    batches = N // batch_N

    dv = d_b1 = dw = d_b0 = 0    # momentum velocities
    mu = .9

    # Initialize weights with scaled Gaussian noise.
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    lr_0 = learning_rate  # base rate for the schedule
    k = 1e-2              # decay constant (assumed small positive value)

    exp_cost = []
    exp_cr = []
    exp_lr = []
    best_exp = 0
    best_iteration = 0

    for i in range(iterations):
        # Exponential schedule: lr_i = lr_0 * exp(-k * i).
        learning_rate = lr_0 * np.exp(-k * i)
        exp_lr.append(learning_rate)
        for b in range(batches):
            Xb = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            Tb = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Yb, Z = generate_Y('tanh', Xb, w, b_0, v, b_1)

            # Track test cost/accuracy once per epoch (first batch only).
            if b == 0:
                Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
                P_test = np.argmax(Y_test, axis=1)
                exp_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                exp_cr.append(cr)
                if cr > best_exp:
                    best_exp = cr
                    best_iteration = i

            # Standard momentum with the decayed learning rate.
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Yb, Tb)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Yb, Tb)
            dw = mu * dw - learning_rate * derivative_w('tanh', Xb, Yb, Z, Tb, v)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0('tanh', Yb, Z, Tb, v)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0

        if i % 10 == 0:
            print('Exp Cost: ', exp_cost[i],
                  'Exp Classification: ', exp_cr[i])
    return exp_cost, exp_cr, exp_lr, best_exp, best_iteration
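
# Illustrative sketch: the rates the schedule above produces, with an
# assumed base rate lr_0 = 1e-3 and decay constant k = 1e-2 (both
# illustrative values, not taken from the training code).
def _exp_decay_demo(lr_0=1e-3, k=1e-2, iterations=50):
    # lr_i = lr_0 * exp(-k * i): smooth, monotone decay toward zero,
    # e.g. i = 0 -> 1.000e-3, i = 10 -> 9.048e-4, i = 49 -> 6.126e-4.
    return [lr_0 * np.exp(-k * i) for i in range(iterations)]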
def grid():
    # Split the shuffled data 50/50 into train and test sets.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    K = len(set(Y))
    w0 = np.random.randn(D, K) / np.sqrt(D + K)
    b0 = np.random.randn(K) / np.sqrt(K)

    # Hyperparameter grid: lr in [1e-07, 1e-06, 1e-05, 1e-04],
    # mu in [0.9, 0.99, 0.999, 0.9999].
    learning_rates = [10**i for i in range(-7, -3)]
    momentums = [1 - 10**i for i in range(-1, -5, -1)]
    iterations = 2000
    best_lr = 0
    best_mu = 0
    best_cr = 0
    cost = {}
    cr = {}

    for lr in learning_rates:
        for mu in momentums:
            dw = 0
            db = 0
            # Each (lr, mu) pair starts from the same initial weights.
            w = w0.copy()
            b = b0.copy()
            cost[(lr, mu)] = []
            cr[(lr, mu)] = []
            for i in range(iterations):
                A_train = relu(Xtrain.dot(w) + b)
                A_test = relu(Xtest.dot(w) + b)
                # Softmax, shifted by the row max for numerical stability.
                expA_train = np.exp(A_train - A_train.max(axis=1, keepdims=True))
                Y_train = expA_train / expA_train.sum(axis=1, keepdims=True)
                expA_test = np.exp(A_test - A_test.max(axis=1, keepdims=True))
                Y_test = expA_test / expA_test.sum(axis=1, keepdims=True)
                P_test = np.argmax(Y_test, axis=1)

                cost[(lr, mu)].append(cross_entropy(Y_test, Ttest))
                current_cr = classification_rate(P_test, Ytest)
                cr[(lr, mu)].append(current_cr)
                if current_cr > best_cr:
                    best_cr = current_cr
                    best_lr = lr
                    best_mu = mu

                # Momentum update with (1 - mu) gradient scaling.
                dw = mu * dw - (1 - mu) * lr * derivative_w(Xtrain, Y_train, Ttrain)
                db = mu * db - (1 - mu) * lr * derivative_b(Y_train, Ttrain)
                w += dw
                b += db

                if i % 100 == 0:
                    print('Learning Rate: ', lr, 'Momentum: ', mu,
                          'Cost: ', cost[(lr, mu)][i],
                          'Classification Rate: ', cr[(lr, mu)][i])
                if i == (iterations - 1):
                    print('')
    return cost, cr, best_lr, best_mu, best_cr
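
# Illustrative sketch: the concrete grid the comprehensions above expand to.
# Useful when sanity-checking which (lr, mu) pair best_lr/best_mu refer to.
def _grid_values():
    learning_rates = [10**i for i in range(-7, -3)]      # [1e-07, 1e-06, 1e-05, 1e-04]
    momentums = [1 - 10**i for i in range(-1, -5, -1)]   # [0.9, 0.99, 0.999, 0.9999]
    return [(lr, mu) for lr in learning_rates for mu in momentums]  # 16 pairs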
def batch(learning_rate):
    # Split the shuffled data 50/50 into train and test sets.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100                  # hidden units
    K = len(set(Y))          # number of classes
    iterations = 50
    batch_N = 250            # batch size
    batches = N // batch_N   # batches in the training set

    # Initialize weights with scaled Gaussian noise.
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    batch_cost = []
    batch_cr = []
    best_batch = 0
    best_iteration = 0

    for i in range(iterations):
        for b in range(batches):
            Xb = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            Tb = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Yb, Z = generate_Y('tanh', Xb, w, b_0, v, b_1)

            # Track test cost/accuracy once per epoch (first batch only).
            if b == 0:
                Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
                P_test = np.argmax(Y_test, axis=1)
                batch_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                batch_cr.append(cr)
                if cr > best_batch:
                    best_batch = cr
                    best_iteration = i

            # Plain mini-batch gradient descent: evaluate all gradients at
            # the current parameters, then step.
            g_v = derivative_v('tanh', Z, Yb, Tb)
            g_b1 = derivative_b1('tanh', Yb, Tb)
            g_w = derivative_w('tanh', Xb, Yb, Z, Tb, v)
            g_b0 = derivative_b0('tanh', Yb, Z, Tb, v)
            v -= learning_rate * g_v
            b_1 -= learning_rate * g_b1
            w -= learning_rate * g_w
            b_0 -= learning_rate * g_b0

        if i % 10 == 0:
            print('Batch Cost: ', batch_cost[i],
                  'Batch Classification: ', batch_cr[i])
    return batch_cost, batch_cr, best_batch, best_iteration
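
# Illustrative sketch: the batch indexing used above, shown standalone.
# With N = 1000 rows and batch_N = 250, batches = 4 and the slices tile the
# training set exactly: rows 0:250, 250:500, 500:750, 750:1000.
def _batch_slices(N=1000, batch_N=250):
    return [(b * batch_N, (b + 1) * batch_N) for b in range(N // batch_N)]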
def rmsprop(learning_rate):
    # Split the shuffled data 50/50 into train and test sets.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100                    # hidden units
    K = len(set(Y))            # number of classes
    iterations = 50
    batch_N = 250              # batch size
    batches = N // batch_N

    dv = d_b1 = dw = d_b0 = 0  # momentum velocities
    mu = .9

    # Initialize weights with scaled Gaussian noise.
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    # Per-parameter caches of squared gradients.
    cache_v = np.ones((M, K))
    cache_b1 = np.ones(K)
    cache_w = np.ones((D, M))
    cache_b0 = np.ones(M)
    epsilon = 1e-9
    decay = .9

    rmsprop_cost = []
    rmsprop_cr = []
    best_rms = 0
    best_iteration = 0

    for i in range(iterations):
        for b in range(batches):
            Xb = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            Tb = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Yb, Z = generate_Y('tanh', Xb, w, b_0, v, b_1)

            # Track test cost/accuracy once per epoch (first batch only).
            if b == 0:
                Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
                P_test = np.argmax(Y_test, axis=1)
                rmsprop_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                rmsprop_cr.append(cr)
                if cr > best_rms:
                    best_rms = cr
                    best_iteration = i

            # Evaluate every gradient once, at the current parameters.
            g_v = derivative_v('tanh', Z, Yb, Tb)
            g_b1 = derivative_b1('tanh', Yb, Tb)
            g_w = derivative_w('tanh', Xb, Yb, Z, Tb, v)
            g_b0 = derivative_b0('tanh', Yb, Z, Tb, v)

            # RMSProp: leaky average of squared gradients ...
            cache_v = decay * cache_v + (1 - decay) * g_v**2
            cache_b1 = decay * cache_b1 + (1 - decay) * g_b1**2
            cache_w = decay * cache_w + (1 - decay) * g_w**2
            cache_b0 = decay * cache_b0 + (1 - decay) * g_b0**2

            # ... scales a momentum step per parameter.
            dv = mu * dv - learning_rate * g_v / np.sqrt(cache_v + epsilon)
            d_b1 = mu * d_b1 - learning_rate * g_b1 / np.sqrt(cache_b1 + epsilon)
            dw = mu * dw - learning_rate * g_w / np.sqrt(cache_w + epsilon)
            d_b0 = mu * d_b0 - learning_rate * g_b0 / np.sqrt(cache_b0 + epsilon)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0

        if i % 10 == 0:
            print('RMSProp Cost: ', rmsprop_cost[i],
                  'RMSProp Classification: ', rmsprop_cr[i])
    return rmsprop_cost, rmsprop_cr, best_rms, best_iteration
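
# Illustrative sketch: one RMSProp-with-momentum step as applied above,
# isolated on a single parameter. All names and defaults are local to the
# example; the defaults mirror the values used in rmsprop().
def _rmsprop_step(param, grad, cache, velocity,
                  lr=1e-3, mu=.9, decay=.9, epsilon=1e-9):
    # The leaky average of squared gradients scales each step per-parameter.
    cache = decay * cache + (1 - decay) * grad**2
    velocity = mu * velocity - lr * grad / np.sqrt(cache + epsilon)
    return param + velocity, cache, velocity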
def full(self):
    for i in range(self.iterations):
        # Forward pass on the full training and test sets.
        Y_train, Z = generate_Y(self.activation, self.Xtrain, self.w,
                                self.b_0, self.v, self.b_1)
        P_train = np.argmax(Y_train, axis=1)
        Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                               self.b_0, self.v, self.b_1)
        P_test = np.argmax(Y_test, axis=1)

        # Track costs, classification rates, and best iterations.
        self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
        self.test_cost.append(cross_entropy(Y_test, self.Ttest))
        train_cr = classification_rate(P_train, self.Ytrain)
        self.train_cr.append(train_cr)
        test_cr = classification_rate(P_test, self.Ytest)
        self.test_cr.append(test_cr)
        if train_cr > self.best_train:
            self.best_train = train_cr
            self.train_iteration = i
        if test_cr > self.best_test:
            self.best_test = test_cr
            self.test_iteration = i

        # Evaluate every gradient once, at the current parameters.
        g_v = derivative_v(self.activation, Z, Y_train, self.Ttrain)
        g_b1 = derivative_b1(self.activation, Y_train, self.Ttrain)
        g_w = derivative_w(self.activation, self.Xtrain, Y_train, Z,
                           self.Ttrain, self.v)
        g_b0 = derivative_b0(self.activation, Y_train, Z, self.Ttrain, self.v)

        # Adam: update first (m_*) and second (v_*) moment estimates, then
        # bias-correct them (dm_*, dv_*) with timestep t before stepping.
        t = i + 1
        self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
        self.dm_v = self.m_v / (1 - self.decay_0**t)
        self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
        self.dv_v = self.v_v / (1 - self.decay_1**t)

        self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
        self.dm_b1 = self.m_b1 / (1 - self.decay_0**t)
        self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
        self.dv_b1 = self.v_b1 / (1 - self.decay_1**t)

        self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
        self.dm_w = self.m_w / (1 - self.decay_0**t)
        self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
        self.dv_w = self.v_w / (1 - self.decay_1**t)

        self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
        self.dm_b0 = self.m_b0 / (1 - self.decay_0**t)
        self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
        self.dv_b0 = self.v_b0 / (1 - self.decay_1**t)

        # Parameter updates.
        self.v -= self.learning_rate * self.dm_v / np.sqrt(self.dv_v + self.epsilon)
        self.b_1 -= self.learning_rate * self.dm_b1 / np.sqrt(self.dv_b1 + self.epsilon)
        self.w -= self.learning_rate * self.dm_w / np.sqrt(self.dv_w + self.epsilon)
        self.b_0 -= self.learning_rate * self.dm_b0 / np.sqrt(self.dv_b0 + self.epsilon)

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[i],
                  'Train Classification Rate: ', self.train_cr[i])
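
# Illustrative sketch: one bias-corrected Adam step, matching the update
# applied to each parameter above. All names are local to the example;
# decay_0/decay_1/epsilon mirror the attribute names above, with typical
# default values assumed.
def _adam_step(param, grad, m, v, t,
               lr=1e-3, decay_0=.9, decay_1=.999, epsilon=1e-8):
    # t is the 1-based step count, so the corrections are well defined at t = 1.
    m = decay_0 * m + (1 - decay_0) * grad        # first-moment estimate
    v = decay_1 * v + (1 - decay_1) * grad**2     # second-moment estimate
    m_hat = m / (1 - decay_0**t)                  # bias corrections
    v_hat = v / (1 - decay_1**t)
    return param - lr * m_hat / np.sqrt(v_hat + epsilon), m, v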
def stochastic(self, samples):
    for i in range(self.iterations):
        # Reshuffle the training set each epoch, then update one sample at
        # a time.
        current_X, current_T = shuffle(self.Xtrain, self.Ttrain)
        for s in range(samples):
            # Keep each sample 2-D (a batch of size 1).
            X = current_X[s, :].reshape(1, current_X.shape[1])
            T = current_T[s, :].reshape(1, current_T.shape[1])
            Y, Z = generate_Y(self.activation, X, self.w, self.b_0,
                              self.v, self.b_1)

            # Track full train/test metrics after every sample update.
            Y_train, _ = generate_Y(self.activation, self.Xtrain, self.w,
                                    self.b_0, self.v, self.b_1)
            P_train = np.argmax(Y_train, axis=1)
            Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                   self.b_0, self.v, self.b_1)
            P_test = np.argmax(Y_test, axis=1)
            self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
            self.test_cost.append(cross_entropy(Y_test, self.Ttest))
            train_cr = classification_rate(P_train, self.Ytrain)
            self.train_cr.append(train_cr)
            test_cr = classification_rate(P_test, self.Ytest)
            self.test_cr.append(test_cr)
            if train_cr > self.best_train:
                self.best_train = train_cr
                self.train_iteration = i
            if test_cr > self.best_test:
                self.best_test = test_cr
                self.test_iteration = i

            # Evaluate every gradient once, at the current parameters.
            g_v = derivative_v(self.activation, Z, Y, T)
            g_b1 = derivative_b1(self.activation, Y, T)
            g_w = derivative_w(self.activation, X, Y, Z, T, self.v)
            g_b0 = derivative_b0(self.activation, Y, Z, T, self.v)

            # Adam moment estimates with bias correction, as in full().
            # Note the epoch index serves as the Adam timestep here.
            t = i + 1
            self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
            self.dm_v = self.m_v / (1 - self.decay_0**t)
            self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
            self.dv_v = self.v_v / (1 - self.decay_1**t)

            self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
            self.dm_b1 = self.m_b1 / (1 - self.decay_0**t)
            self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
            self.dv_b1 = self.v_b1 / (1 - self.decay_1**t)

            self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
            self.dm_w = self.m_w / (1 - self.decay_0**t)
            self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
            self.dv_w = self.v_w / (1 - self.decay_1**t)

            self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
            self.dm_b0 = self.m_b0 / (1 - self.decay_0**t)
            self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
            self.dv_b0 = self.v_b0 / (1 - self.decay_1**t)

            # Parameter updates.
            self.v -= self.learning_rate * self.dm_v / np.sqrt(self.dv_v + self.epsilon)
            self.b_1 -= self.learning_rate * self.dm_b1 / np.sqrt(self.dv_b1 + self.epsilon)
            self.w -= self.learning_rate * self.dm_w / np.sqrt(self.dv_w + self.epsilon)
            self.b_0 -= self.learning_rate * self.dm_b0 / np.sqrt(self.dv_b0 + self.epsilon)

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[-1],
                  'Train Classification Rate: ', self.train_cr[-1])
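
# Illustrative sketch: the per-sample update pattern stochastic() follows,
# reduced to its skeleton. update_fn is a hypothetical callback standing in
# for the Adam step above; reshaping keeps each sample 2-D so the matrix
# code works unchanged on a batch of size 1.
def _per_sample_epoch(Xtrain, Ttrain, samples, update_fn):
    X_shuf, T_shuf = shuffle(Xtrain, Ttrain)   # reshuffle each epoch
    for s in range(samples):
        x = X_shuf[s, :].reshape(1, -1)        # (1, D) input row
        t = T_shuf[s, :].reshape(1, -1)        # (1, K) target row
        update_fn(x, t)                        # one optimizer step per sample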