def nesterov_momentum(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100              # hidden units
    K = len(set(Y))      # number of classes
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    mu = .9              # momentum coefficient
    dv = 0
    db_1 = 0
    dw = 0
    db_0 = 0
    nesterov_cost = []
    nesterov_cr = []
    best_nesterov = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b == 0:  # record test cost/accuracy once per epoch
                nesterov_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                nesterov_cr.append(cr)
                if cr > best_nesterov:
                    best_nesterov = cr
                    best_iteration = i
            # compute each gradient once at the current weights
            # (the original recomputed them in the parameter updates, partly with
            #  already-updated weights, and added the learning-rate term for w
            #  instead of subtracting it)
            g_v = derivative_v('tanh', Z, Y, T)
            g_b1 = derivative_b1('tanh', Y, T)
            g_w = derivative_w('tanh', X, Y, Z, T, v)
            g_b0 = derivative_b0('tanh', Y, Z, T, v)
            # Nesterov momentum: update the velocities, then take the look-ahead step
            dv = mu * dv - learning_rate * g_v
            db_1 = mu * db_1 - learning_rate * g_b1
            dw = mu * dw - learning_rate * g_w
            db_0 = mu * db_0 - learning_rate * g_b0
            v += mu * dv - learning_rate * g_v
            b_1 += mu * db_1 - learning_rate * g_b1
            w += mu * dw - learning_rate * g_w
            b_0 += mu * db_0 - learning_rate * g_b0
        if i % 10 == 0:  # iterations is only 50, so report every 10 epochs
            print('Nesterov Cost: ', nesterov_cost[i],
                  'Nesterov Classification: ', nesterov_cr[i])
    return nesterov_cost, nesterov_cr, best_nesterov, best_iteration
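# A minimal, self-contained sketch of the Nesterov-style momentum step used above,
# shown on a 1-D quadratic so it runs without the helpers (transform_data,
# generate_Y, derivative_*). The names nesterov_step, grad_fn, theta and velocity
# are illustrative only, not part of the code above.
def nesterov_step(theta, velocity, grad_fn, lr=0.1, mu=0.9):
    g = grad_fn(theta)
    velocity = mu * velocity - lr * g        # velocity update
    theta = theta + mu * velocity - lr * g   # look-ahead step in the same direction
    return theta, velocity

theta, velocity = 5.0, 0.0
for _ in range(50):
    theta, velocity = nesterov_step(theta, velocity, grad_fn=lambda t: 2.0 * t)
print(round(theta, 4))  # converges toward the minimum of f(t) = t**2 at 0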
def exp_decay(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    initial_lr = learning_rate
    decay = 1e-3  # decay constant for the schedule (the original reused K, the class count)
    exp_cost = []
    exp_cr = []
    exp_lr = []
    best_exp = 0
    best_iteration = 0
    for i in range(iterations):
        # exponential decay from the initial rate: lr_i = lr_0 * exp(-decay * i)
        # (the original compounded exp(-K * i) onto the previous rate, which drives
        #  the learning rate to ~0 after the first epoch)
        learning_rate = initial_lr * np.exp(-decay * i)
        exp_lr.append(learning_rate)
        for b in range(batches):
            # slice by batch size (the original sliced by the batch count)
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b == 0:  # record test cost/accuracy once per epoch
                exp_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                exp_cr.append(cr)
                if cr > best_exp:
                    best_exp = cr
                    best_iteration = i
            # momentum updates with the decayed learning rate
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Y, T)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0('tanh', Y, Z, T, v)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('Exp Cost: ', exp_cost[i], 'Exp Classification: ', exp_cr[i])
    return exp_cost, exp_cr, exp_lr, best_exp, best_iteration
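# Standalone sketch of the schedule above: lr_i = lr_0 * exp(-decay * i).
# lr_0 and decay here are example values, not the ones used elsewhere in this file.
import numpy as np

lr_0, decay = 1e-3, 1e-3
schedule = [lr_0 * np.exp(-decay * i) for i in range(50)]
print(schedule[0], schedule[-1])  # 0.001 -> roughly 0.00095 after 49 epochs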
def train(self, X, Y, activation=1, lr=10e-7, reg=10e-7, epoch=10):
    N, D = X.shape  # dimensionality of our data
    batch_size = 500
    n_batches = int(N / batch_size)
    # convert the target array into an indicator matrix (one-hot encoding)
    ind = tar2ind(Y)
    _, K = ind.shape
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)        # input-to-hidden weights
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)   # hidden-to-output weights
    self.b1 = np.random.randn(self.M)
    self.b2 = np.random.randn(K)
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    mu = 0.9            # momentum
    decay_rate = 0.99   # not used in this momentum-only version
    cost = []
    for n in range(epoch):  # the epoch argument was previously ignored (hard-coded 200)
        #tempx, tempy = shuffle(X, ind)
        for i in range(n_batches):
            X_tr = X[i * batch_size:(i * batch_size + batch_size), :]
            Y_tr = Y[i * batch_size:(i * batch_size + batch_size), ]
            ind = tar2ind(Y_tr)
            output, hidden = forward(X_tr, activation, self.W1, self.b1, self.W2, self.b2)

            # backpropagation with momentum
            dW2 = mu * dW2 + lr * derivative_W2(ind, output, hidden, reg, self.W2)
            self.W2 = self.W2 + dW2
            db2 = mu * db2 + lr * derivative_b2(ind, output, reg, self.b2)
            self.b2 = self.b2 + db2
            dW1 = mu * dW1 + lr * derivative_W1(ind, output, hidden, self.W2, X_tr,
                                                activation, reg, self.W1)
            self.W1 = self.W1 + dW1
            db1 = mu * db1 + lr * derivative_b1(ind, output, hidden, self.W2,
                                                activation, reg, self.b1)
            self.b1 = self.b1 + db1

            c = cross_entropy(ind, output)
            cost.append(c)
            if i % 10 == 0:
                result = np.argmax(output, axis=1)
                r = classification_rate(Y_tr, result)
                print("iteration:- ", i, "cost:- ", c, "classification rate:- ", r)
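# For contrast with the Nesterov step sketched earlier, this is the plain momentum
# rule train() uses: the velocity accumulates the gradient and the parameters move by
# the velocity. A hypothetical 1-D example; momentum_step, grad_fn and theta are
# illustrative names. It is written with the descent convention (subtracting lr * g);
# train() adds its lr term, which assumes its derivative_* helpers already return the
# descent direction.
def momentum_step(theta, velocity, grad_fn, lr=0.1, mu=0.9):
    velocity = mu * velocity - lr * grad_fn(theta)
    theta = theta + velocity
    return theta, velocity

theta, velocity = 5.0, 0.0
for _ in range(100):
    theta, velocity = momentum_step(theta, velocity, grad_fn=lambda t: 2.0 * t)
print(round(theta, 4))  # approaches the minimum of f(t) = t**2 at 0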
def batch(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N  # number of batches in the training set (the original divided the full data set)
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    batch_cost = []
    batch_cr = []
    best_batch = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b == 0:  # record test cost/accuracy once per epoch
                batch_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                batch_cr.append(cr)
                if cr > best_batch:
                    best_batch = cr
                    best_iteration = i
            # plain mini-batch gradient descent
            v -= learning_rate * derivative_v('tanh', Z, Y, T)
            b_1 -= learning_rate * derivative_b1('tanh', Y, T)
            w -= learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            b_0 -= learning_rate * derivative_b0('tanh', Y, Z, T, v)
        if i % 10 == 0:  # iterations is only 50, so report every 10 epochs
            print('Batch Cost: ', batch_cost[i],
                  'Batch Classification: ', batch_cr[i])
    return batch_cost, batch_cr, best_batch, best_iteration
def batch_grad():
    # get data and build indicator targets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)
    batchSz = 500

    # initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 0.001
    reg = 0.01
    cache_w2 = 0
    cache_b2 = 0
    cache_w1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 10e-10

    no_batches = int(N / batchSz)
    print("No of batches: ", no_batches)

    for i in range(300):
        for n in range(no_batches):
            # get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]

            # forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            # backprop with RMSProp: each parameter keeps a running average of its
            # squared gradients and its step is scaled by the square root of that cache
            gW2 = derivative_w2(pY, YBatch_ind, Z) + reg * W2
            cache_w2 = decay_rate * cache_w2 + (1 - decay_rate) * gW2 * gW2
            W2 += learning_rate * gW2 / (np.sqrt(cache_w2) + eps)

            gb2 = derivative_b2(pY, YBatch_ind) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 += learning_rate * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(pY, YBatch_ind, W2, Z, XBatch) + reg * W1
            # the original repeated the cache_b2/b2 update here (copy-paste error);
            # W1 gets its own cache and update
            cache_w1 = decay_rate * cache_w1 + (1 - decay_rate) * gW1 * gW1
            W1 += learning_rate * gW1 / (np.sqrt(cache_w1) + eps)

            gb1 = derivative_b1(pY, YBatch_ind, W2, Z) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 += learning_rate * gb1 / (np.sqrt(cache_b1) + eps)

            if n % 100 == 0:
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)
                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final training error rate: ", error_rate(p, Y))

    # write Kaggle-style predictions for the test set
    XTest = get_test_data()
    pY, ZTest = forward_relu(XTest, W1, b1, W2, b2)
    YTest = np.argmax(pY, axis=1)

    f = open("test_rms.csv", "w")
    f.write("ImageId,Label\n")
    n = YTest.shape[0]
    for i in range(n):
        f.write(str(i + 1) + "," + str(YTest[i]) + "\n")
    f.close()
def rmsprop(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain = X[:N]
    Ytrain = Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest = X[N:]
    Ytest = Y[N:]
    Ttest = generate_T(Ytest)
    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N
    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    cache_v = np.ones((M, K))
    cache_b1 = np.ones(K)
    cache_w = np.ones((D, M))
    cache_b0 = np.ones(M)
    epsilon = 10e-10
    decay = .9
    rmsprop_cost = []
    rmsprop_cr = []
    best_rms = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            # slice by batch size (the original sliced by the batch count)
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b == 0:  # record test cost/accuracy once per epoch
                rmsprop_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                rmsprop_cr.append(cr)
                if cr > best_rms:
                    best_rms = cr
                    best_iteration = i
            # RMSProp caches: exponential moving averages of the squared gradients
            cache_v = decay * cache_v + (1 - decay) * derivative_v('tanh', Z, Y, T)**2
            cache_b1 = decay * cache_b1 + (1 - decay) * derivative_b1('tanh', Y, T)**2
            cache_w = decay * cache_w + (1 - decay) * derivative_w('tanh', X, Y, Z, T, v)**2
            cache_b0 = decay * cache_b0 + (1 - decay) * derivative_b0('tanh', Y, Z, T, v)**2
            # momentum on top of the RMSProp-scaled gradients
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T) / np.sqrt(cache_v + epsilon)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Y, T) / np.sqrt(cache_b1 + epsilon)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v) / np.sqrt(cache_w + epsilon)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0('tanh', Y, Z, T, v) / np.sqrt(cache_b0 + epsilon)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('RMSProp Cost: ', rmsprop_cost[i],
                  'RMSProp Classification: ', rmsprop_cr[i])
    return rmsprop_cost, rmsprop_cr, best_rms, best_iteration
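# Core of the RMSProp step used above, stripped of the network plumbing: keep an
# exponential moving average of squared gradients and divide each step by its square
# root. A minimal sketch; rmsprop_step, grad_fn, theta and cache are illustrative
# names, and the cache starts at 1 to mirror the np.ones initialisation above.
import numpy as np

def rmsprop_step(theta, cache, grad_fn, lr=0.05, decay=0.9, eps=1e-8):
    g = grad_fn(theta)
    cache = decay * cache + (1 - decay) * g * g
    theta = theta - lr * g / (np.sqrt(cache) + eps)
    return theta, cache

theta, cache = 5.0, 1.0
for _ in range(200):
    theta, cache = rmsprop_step(theta, cache, grad_fn=lambda t: 2.0 * t)
print(round(theta, 4))  # ends up near the minimum of f(t) = t**2 at 0 (within about lr)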
def batch_grad():
    # get data and build indicator targets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)
    batchSz = 500

    # initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)
    learning_rate = 10e-5

    no_batches = int(N / batchSz)
    print("No of batches: ", no_batches)

    for i in range(300):
        for n in range(no_batches):
            # get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            #YBatch = Y[n*batchSz:n*batchSz + batchSz]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]

            # forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            # backprop: plain mini-batch gradient steps, no regularization or caching
            W2 += learning_rate * derivative_w2(pY, YBatch_ind, Z)
            b2 += learning_rate * derivative_b2(pY, YBatch_ind)
            W1 += learning_rate * derivative_w1(pY, YBatch_ind, W2, Z, XBatch)
            b1 += learning_rate * derivative_b1(pY, YBatch_ind, W2, Z)

            if n % 100 == 0:
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)
                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    # pY, Z = forward_prop(XTrain, W1, b1, W2, b2)
    # P = np.argmax(pY, axis=1)
    # print("Final training error rate: ", error_rate(P, YTrain))
    #
    # pY, Z = forward_prop(XTest, W1, b1, W2, b2)
    # P = np.argmax(pY, axis=1)
    # print("Final testing error rate: ", error_rate(P, YTest))

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final training error rate: ", error_rate(p, Y))
def full(self):
    for i in range(self.iterations):
        Y_train, Z = generate_Y(self.activation, self.Xtrain, self.w, self.b_0, self.v, self.b_1)
        P_train = np.argmax(Y_train, axis=1)
        Y_test, _ = generate_Y(self.activation, self.Xtest, self.w, self.b_0, self.v, self.b_1)
        P_test = np.argmax(Y_test, axis=1)

        self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
        self.test_cost.append(cross_entropy(Y_test, self.Ttest))
        train_cr = classification_rate(P_train, self.Ytrain)
        self.train_cr.append(train_cr)
        test_cr = classification_rate(P_test, self.Ytest)
        self.test_cr.append(test_cr)
        if train_cr > self.best_train:
            self.best_train = train_cr
            self.train_iteration = i
        if test_cr > self.best_test:
            self.best_test = test_cr
            self.test_iteration = i

        # gradients at the current weights (computed once and reused below)
        g_v = derivative_v(self.activation, Z, Y_train, self.Ttrain)
        g_b1 = derivative_b1(self.activation, Y_train, self.Ttrain)
        g_w = derivative_w(self.activation, self.Xtrain, Y_train, Z, self.Ttrain, self.v)
        g_b0 = derivative_b0(self.activation, Y_train, Z, self.Ttrain, self.v)

        # Adam: first and second moment estimates with bias correction
        self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
        self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
        self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
        self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))

        self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
        self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
        self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
        self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))

        self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
        self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
        self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
        self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))

        self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
        self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
        self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
        self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))

        # parameter updates
        self.v -= self.learning_rate * self.dm_v / np.sqrt(self.dv_v + self.epsilon)
        self.b_1 -= self.learning_rate * self.dm_b1 / np.sqrt(self.dv_b1 + self.epsilon)
        self.w -= self.learning_rate * self.dm_w / np.sqrt(self.dv_w + self.epsilon)
        self.b_0 -= self.learning_rate * self.dm_b0 / np.sqrt(self.dv_b0 + self.epsilon)

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[i],
                  'Train Classification Rate: ', self.train_cr[i])
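# The same Adam update as full(), reduced to a single scalar parameter so the
# bias-corrected moment estimates are easy to see in isolation. decay_0, decay_1 and
# epsilon mirror the attribute names above; adam_step, grad_fn and theta are
# illustrative names only, so treat this as a sketch rather than the class's API.
import numpy as np

def adam_step(theta, m, v, t, grad_fn, lr=0.01, decay_0=0.9, decay_1=0.999, epsilon=1e-8):
    g = grad_fn(theta)
    m = decay_0 * m + (1 - decay_0) * g        # first moment (mean of gradients)
    v = decay_1 * v + (1 - decay_1) * g * g    # second moment (mean of squared gradients)
    m_hat = m / (1 - decay_0**t)               # bias correction for the zero initialisation
    v_hat = v / (1 - decay_1**t)
    theta = theta - lr * m_hat / np.sqrt(v_hat + epsilon)
    return theta, m, v

theta, m, v = 5.0, 0.0, 0.0
for t in range(1, 1001):
    theta, m, v = adam_step(theta, m, v, t, grad_fn=lambda x: 2.0 * x)
print(round(theta, 4))  # converges toward the minimum of f(x) = x**2 at 0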
def stochastic(self, samples):
    for i in range(self.iterations):
        current_X, current_T = shuffle(self.Xtrain, self.Ttrain)
        for s in range(samples):
            # one training sample per update
            X = current_X[s, :].reshape(1, current_X.shape[1])
            T = current_T[s, :].reshape(1, current_T.shape[1])
            Y, Z = generate_Y(self.activation, X, self.w, self.b_0, self.v, self.b_1)

            Y_train, _ = generate_Y(self.activation, self.Xtrain, self.w, self.b_0, self.v, self.b_1)
            P_train = np.argmax(Y_train, axis=1)
            Y_test, _ = generate_Y(self.activation, self.Xtest, self.w, self.b_0, self.v, self.b_1)
            P_test = np.argmax(Y_test, axis=1)

            self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
            self.test_cost.append(cross_entropy(Y_test, self.Ttest))
            train_cr = classification_rate(P_train, self.Ytrain)
            self.train_cr.append(train_cr)
            test_cr = classification_rate(P_test, self.Ytest)
            self.test_cr.append(test_cr)
            if train_cr > self.best_train:
                self.best_train = train_cr
                self.train_iteration = i
            if test_cr > self.best_test:
                self.best_test = test_cr
                self.test_iteration = i

            # gradients for this single sample
            g_v = derivative_v(self.activation, Z, Y, T)
            g_b1 = derivative_b1(self.activation, Y, T)
            g_w = derivative_w(self.activation, X, Y, Z, T, self.v)
            g_b0 = derivative_b0(self.activation, Y, Z, T, self.v)

            # Adam bias correction should track the number of updates taken so far,
            # not the epoch index (the original used i + 1 for every sample in epoch i)
            t = i * samples + s + 1

            self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
            self.dm_v = self.m_v / (1 - self.decay_0**t)
            self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
            self.dv_v = self.v_v / (1 - self.decay_1**t)

            self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
            self.dm_b1 = self.m_b1 / (1 - self.decay_0**t)
            self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
            self.dv_b1 = self.v_b1 / (1 - self.decay_1**t)

            self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
            self.dm_w = self.m_w / (1 - self.decay_0**t)
            self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
            self.dv_w = self.v_w / (1 - self.decay_1**t)

            self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
            self.dm_b0 = self.m_b0 / (1 - self.decay_0**t)
            self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
            self.dv_b0 = self.v_b0 / (1 - self.decay_1**t)

            self.v -= self.learning_rate * self.dm_v / np.sqrt(self.dv_v + self.epsilon)
            self.b_1 -= self.learning_rate * self.dm_b1 / np.sqrt(self.dv_b1 + self.epsilon)
            self.w -= self.learning_rate * self.dm_w / np.sqrt(self.dv_w + self.epsilon)
            self.b_0 -= self.learning_rate * self.dm_b0 / np.sqrt(self.dv_b0 + self.epsilon)

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[i],
                  'Train Classification Rate: ', self.train_cr[i])