def grid():
    # Grid search over learning rate and momentum for a single-layer softmax classifier.
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    K = len(set(Y))
    w0 = np.random.randn(D, K) / np.sqrt(D + K)
    b0 = np.random.randn(K) / np.sqrt(K)

    learning_rates = [10**i for i in range(-7, -3)]       # 1e-7 ... 1e-4
    momentums = [1 - 10**i for i in range(-1, -5, -1)]    # 0.9, 0.99, 0.999, 0.9999
    iterations = 2000

    best_lr = 0
    best_mu = 0
    best_cr = 0
    cost = {}
    cr = {}
    for lr in learning_rates:
        for mu in momentums:
            dw = 0
            db = 0
            cost[(lr, mu)] = []
            cr[(lr, mu)] = []
            for i in range(iterations):
                if i == 0:
                    A_train = relu(Xtrain.dot(w0) + b0)
                    A_test = relu(Xtest.dot(w0) + b0)
                else:
                    A_train = relu(Xtrain.dot(w) + b)
                    A_test = relu(Xtest.dot(w) + b)
                # softmax outputs
                Y_train = np.exp(A_train) / np.exp(A_train).sum(axis=1, keepdims=True)
                Y_test = np.exp(A_test) / np.exp(A_test).sum(axis=1, keepdims=True)
                P_test = np.argmax(Y_test, axis=1)

                cost[(lr, mu)].append(cross_entropy(Y_test, Ttest))
                current_cr = classification_rate(P_test, Ytest)
                cr[(lr, mu)].append(current_cr)
                if current_cr > best_cr:
                    best_cr = current_cr
                    best_lr = lr
                    best_mu = mu

                # momentum updates
                dw = mu * dw - (1 - mu) * lr * derivative_w(Xtrain, Y_train, Ttrain)
                db = mu * db - (1 - mu) * lr * derivative_b(Y_train, Ttrain)
                if i == 0:
                    w = w0 + dw
                    b = b0 + db
                else:
                    w += dw
                    b += db

                if i % 100 == 0:
                    print('Learning Rate: ', lr, 'Momentum: ', mu,
                          'Cost: ', cost[(lr, mu)][i],
                          'Classification Rate: ', cr[(lr, mu)][i])
                if i == iterations - 1:
                    print('')
    return cost, cr, best_lr, best_mu, best_cr
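# A minimal usage sketch for grid() above; the names below are illustrative assumptions,
# not part of the original code, and matplotlib is assumed to be available. It shows how
# the returned cost history (a dict keyed by (learning rate, momentum)) and the best
# settings might be inspected.
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    cost, cr, best_lr, best_mu, best_cr = grid()
    print('best lr:', best_lr, 'best momentum:', best_mu,
          'best classification rate:', best_cr)
    plt.plot(cost[(best_lr, best_mu)], label='lr=%s, mu=%s' % (best_lr, best_mu))
    plt.xlabel('iteration')
    plt.ylabel('test cross-entropy')
    plt.legend()
    plt.show()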
def exp_decay(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100              # hidden layer size
    K = len(set(Y))      # number of classes
    iterations = 50
    batch_N = 250        # batch size
    batches = N // batch_N

    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9              # momentum
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    lr_0 = learning_rate
    exp_cost = []
    exp_cr = []
    exp_lr = []
    best_exp = 0
    best_iteration = 0
    for i in range(iterations):
        # exponentially decay the learning rate from its initial value
        learning_rate = lr_0 * np.exp(-K * i)
        exp_lr.append(learning_rate)
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:  # record once per pass over the training set
                exp_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                exp_cr.append(cr)
                if cr > best_exp:
                    best_exp = cr
                    best_iteration = i
            # momentum updates
            dv = mu * dv - learning_rate * derivative_v('tanh', Z, Y, T)
            d_b1 = mu * d_b1 - learning_rate * derivative_b1('tanh', Y, T)
            dw = mu * dw - learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            d_b0 = mu * d_b0 - learning_rate * derivative_b0('tanh', Y, Z, T, v)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('Exp Cost: ', exp_cost[i], 'Exp Classification: ', exp_cr[i])
    return exp_cost, exp_cr, exp_lr, best_exp, best_iteration
def nesterov_momentum(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N

    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)
    mu = .9
    dv = 0
    db_1 = 0
    dw = 0
    db_0 = 0

    nesterov_cost = []
    nesterov_cr = []
    best_nesterov = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:  # record once per pass over the training set
                nesterov_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                nesterov_cr.append(cr)
                if cr > best_nesterov:
                    best_nesterov = cr
                    best_iteration = i
            # gradients at the current parameters
            g_v = derivative_v('tanh', Z, Y, T)
            g_b1 = derivative_b1('tanh', Y, T)
            g_w = derivative_w('tanh', X, Y, Z, T, v)
            g_b0 = derivative_b0('tanh', Y, Z, T, v)
            # Nesterov momentum: update the velocities, then take a look-ahead step
            dv = mu * dv - learning_rate * g_v
            db_1 = mu * db_1 - learning_rate * g_b1
            dw = mu * dw - learning_rate * g_w
            db_0 = mu * db_0 - learning_rate * g_b0
            v += mu * dv - learning_rate * g_v
            b_1 += mu * db_1 - learning_rate * g_b1
            w += mu * dw - learning_rate * g_w
            b_0 += mu * db_0 - learning_rate * g_b0
        if i % 10 == 0:
            print('Nesterov Cost: ', nesterov_cost[i],
                  'Nesterov Classification: ', nesterov_cr[i])
    return nesterov_cost, nesterov_cr, best_nesterov, best_iteration
def train(self, X, Y, activation=1, lr=10e-7, reg=10e-7, epoch=10):
    N, D = X.shape  # dimensionality of our data
    batch_size = 500
    n_batches = N // batch_size
    ind = tar2ind(Y)  # convert the target array into an indicator matrix (one-hot encoding)
    _, K = ind.shape
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)        # input-to-hidden weights
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)   # hidden-to-output weights
    self.b1 = np.random.randn(self.M)
    self.b2 = np.random.randn(K)
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    mu = 0.9  # momentum
    decay_rate = 0.99
    cost = []
    for n in range(200):
        # tempx, tempy = shuffle(X, ind)
        for i in range(n_batches):
            X_tr = X[i * batch_size:(i + 1) * batch_size, :]
            Y_tr = Y[i * batch_size:(i + 1) * batch_size]
            ind = tar2ind(Y_tr)
            output, hidden = forward(X_tr, activation, self.W1, self.b1, self.W2, self.b2)

            # backpropagation with momentum
            dW2 = mu * dW2 + lr * derivative_W2(ind, output, hidden, reg, self.W2)
            self.W2 = self.W2 + dW2
            db2 = mu * db2 + lr * derivative_b2(ind, output, reg, self.b2)
            self.b2 = self.b2 + db2
            dW1 = mu * dW1 + lr * derivative_W1(
                ind, output, hidden, self.W2, X_tr, activation, reg, self.W1)
            self.W1 = self.W1 + dW1
            db1 = mu * db1 + lr * derivative_b1(
                ind, output, hidden, self.W2, activation, reg, self.b1)
            self.b1 = self.b1 + db1

            c = cross_entropy(ind, output)
            cost.append(c)
            if i % 10 == 0:
                result = np.argmax(output, axis=1)
                r = classification_rate(Y_tr, result)
                print("iteration:- ", i, "cost:- ", c, "classification rate:- ", r)
def compute(self, pred, label, seq_mask=None):
    label = np.expand_dims(label, axis=2)
    ce = cross_entropy(
        softmax=pred, label=label, soft_label=False, axis=-1, ignore_index=-100)
    ce = np.squeeze(ce, axis=2)
    if seq_mask is not None:
        # mask out padded positions and return the number of real tokens as well
        ce = ce * seq_mask
        word_num = np.sum(seq_mask)
        return ce, word_num
    return ce
def batch(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N   # number of mini-batches in the training set

    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    batch_cost = []
    batch_cr = []
    best_batch = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:  # record once per pass over the training set
                batch_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                batch_cr.append(cr)
                if cr > best_batch:
                    best_batch = cr
                    best_iteration = i
            # plain mini-batch gradient descent
            v -= learning_rate * derivative_v('tanh', Z, Y, T)
            b_1 -= learning_rate * derivative_b1('tanh', Y, T)
            w -= learning_rate * derivative_w('tanh', X, Y, Z, T, v)
            b_0 -= learning_rate * derivative_b0('tanh', Y, Z, T, v)
        if i % 10 == 0:
            print('Batch Cost: ', batch_cost[i], 'Batch Classification: ', batch_cr[i])
    return batch_cost, batch_cr, best_batch, best_iteration
def update(self, x, t):
    self.model.zerograd()
    y = self.model.forward(x)
    pred = np.argmax(y, axis=1)
    # divide the number of correct answers in the batch by the batch size
    acc = 1.0 * np.where(pred == t)[0].size / y.shape[0]
    prob = util.softmax(y)               # turn the outputs into probabilities (normalization)
    loss = util.cross_entropy(prob, t)   # loss function
    dout = prob
    dout[np.arange(dout.shape[0]), t] -= 1  # derivative of the loss with respect to y
    # backpropagate the partial derivatives of each layer's parameters,
    # which the subsequent update() call uses to update those parameters
    self.model.backward(dout / dout.shape[0])
    self.model.update()  # update the parameters based on the partial derivatives
    return loss, acc
def train(device, model, train_mode, examples, num_steps):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    milestones = list(map(int, num_steps * np.array([1 / 2, 3 / 4, 7 / 8])))
    logger.info('lr milestones: {}'.format(milestones))
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones)

    model.train()
    for i, example in zip(range(num_steps), examples):
        scheduler.step()
        optimizer.zero_grad()
        if train_mode == 'pair':
            im0, im1, target = example
            im0, im1, target = im0.to(device), im1.to(device), target.to(device)
            output = model(im0, im1)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(output, target)
        elif train_mode == 'softmax':
            train_ims, test_ims, _ = example
            # train_ims: [b, k, n, ...]
            # test_ims: [b, k, n', ...]
            test_ims, gt = util.flatten_few_shot_examples(test_ims, shuffle=True)
            train_ims, test_ims, gt = train_ims.to(device), test_ims.to(device), gt.to(device)
            # test_ims: [b, m, ...]
            # gt: [b, m]
            scores = model(train_ims, test_ims)
            # scores: [b, m, k]
            loss = util.cross_entropy(scores, gt, dim=-1)
            # Besides the loss, we can obtain the accuracy.
            _, pred = torch.max(scores, -1, keepdim=False)
            is_correct = torch.eq(pred, gt).cpu().numpy()
            acc = np.sum(is_correct) / is_correct.size
        else:
            raise ValueError('unknown train mode: "{}"'.format(train_mode))
        loss.backward()
        optimizer.step()
        logger.info('step %d, loss %.4f', i, loss.item())
def build_graphs(self, input, pasts):
    """Component content h, selective focus s, memory m.

    Returns the generated thought u and the generated content v.
    v -> input: only when receiving an external input.
    m -> h:     only when receiving an external input.
    s -> h:     only when receiving an external input.
    s -> m:     when performing thinking.
    """
    s = self.selective_focus(pasts)
    G = self.generative_focus(pasts)
    h = self.forward(input, G)
    v = self.backward(h, G)
    m = self.retrieve_memory(s)
    u = self.backward(m, G)
    grads, delta = self.Mw.gradients(h)
    self.memorize_operation = util.apply_gradients(grads, delta, 1.0)
    self.improve_focus_operation = util.cross_entropy(
        s, h, self.get_selective_focus_variables())
    self.reset_memory_operation = self.Mw.get_reset_operation()
    self.reseed_memory_operation = self.Mw.get_reseed_operation()
    return u, v
# Calculate cross-entropy error for random weights, and for the closed-form solution
# to the Bayes classifier.
import numpy as np
from util import sigmoid, cross_entropy

N = 100
D = 2
means = np.array(((-2, -2), (2, 2)))
covar = np.eye(2)

# Artificially create 2 classes: center the first 50 points at (-2,-2), the last 50 at (2,2)
X = np.random.randn(N, D)
X[:N//2, :] = X[:N//2, :] + means[0] * np.ones((N//2, D))
X[N//2:, :] = X[N//2:, :] + means[1] * np.ones((N//2, D))
Xb = np.concatenate((np.ones((N, 1)), X), axis=1)

# Class labels: the first 50 are 0, the last 50 are 1
T = np.concatenate((np.zeros((N//2,)), np.ones((N//2,))))

# Random weights
w = np.random.randn(D + 1)
Y = sigmoid(Xb @ w)
print('Random weights:', cross_entropy(T, Y))

# Closed-form Bayes solution
w = ((means[1, None] - means[0, None]) @ np.linalg.inv(covar)).T
w = np.concatenate(((0,), w.reshape(D)))  # add the weight for the bias term
Y = sigmoid(Xb @ w)
print('Closed form solution:', cross_entropy(T, Y))
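# A small reference sketch of the general bias term for the equal-prior, shared-covariance
# Bayes classifier: b = -0.5 * (mu1' S^-1 mu1 - mu0' S^-1 mu0). With the symmetric means
# used above it evaluates to 0, which is why the script can simply prepend a zero weight.
# The names below reuse the script's variables; nothing beyond numpy is assumed.
Sinv = np.linalg.inv(covar)
b = -0.5 * (means[1] @ Sinv @ means[1] - means[0] @ Sinv @ means[0])
print('closed-form bias term:', b)  # 0.0 for these symmetric means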
def rmsprop(learning_rate):
    X, Y = transform_data()
    X, Y = shuffle(X, Y)
    N = len(X) // 2
    Xtrain, Ytrain = X[:N], Y[:N]
    Ttrain = generate_T(Ytrain)
    Xtest, Ytest = X[N:], Y[N:]
    Ttest = generate_T(Ytest)

    N, D = Xtrain.shape
    M = 100
    K = len(set(Y))
    iterations = 50
    batch_N = 250
    batches = N // batch_N

    dv = 0
    d_b1 = 0
    dw = 0
    d_b0 = 0
    mu = .9
    v = np.random.randn(M, K) / np.sqrt(M + K)
    b_1 = np.random.randn(K) / np.sqrt(K)
    w = np.random.randn(D, M) / np.sqrt(D + M)
    b_0 = np.random.randn(M) / np.sqrt(M)

    # RMSProp caches (running averages of squared gradients)
    cache_v = np.ones((M, K))
    cache_b1 = np.ones(K)
    cache_w = np.ones((D, M))
    cache_b0 = np.ones(M)
    epsilon = 10e-10
    decay = .9

    rmsprop_cost = []
    rmsprop_cr = []
    best_rms = 0
    best_iteration = 0
    for i in range(iterations):
        for b in range(batches):
            X = Xtrain[b * batch_N:(b + 1) * batch_N, :]
            T = Ttrain[b * batch_N:(b + 1) * batch_N, :]
            Y, Z = generate_Y('tanh', X, w, b_0, v, b_1)
            Y_test, _ = generate_Y('tanh', Xtest, w, b_0, v, b_1)
            P_test = np.argmax(Y_test, axis=1)
            if b % batches == 0:  # record once per pass over the training set
                rmsprop_cost.append(cross_entropy(Y_test, Ttest))
                cr = classification_rate(P_test, Ytest)
                rmsprop_cr.append(cr)
                if cr > best_rms:
                    best_rms = cr
                    best_iteration = i
            # gradients at the current parameters
            g_v = derivative_v('tanh', Z, Y, T)
            g_b1 = derivative_b1('tanh', Y, T)
            g_w = derivative_w('tanh', X, Y, Z, T, v)
            g_b0 = derivative_b0('tanh', Y, Z, T, v)
            # update the caches, then take momentum + RMSProp steps
            cache_v = decay * cache_v + (1 - decay) * g_v**2
            cache_b1 = decay * cache_b1 + (1 - decay) * g_b1**2
            cache_w = decay * cache_w + (1 - decay) * g_w**2
            cache_b0 = decay * cache_b0 + (1 - decay) * g_b0**2
            dv = mu * dv - learning_rate * g_v / np.sqrt(cache_v + epsilon)
            d_b1 = mu * d_b1 - learning_rate * g_b1 / np.sqrt(cache_b1 + epsilon)
            dw = mu * dw - learning_rate * g_w / np.sqrt(cache_w + epsilon)
            d_b0 = mu * d_b0 - learning_rate * g_b0 / np.sqrt(cache_b0 + epsilon)
            v += dv
            b_1 += d_b1
            w += dw
            b_0 += d_b0
        if i % 10 == 0:
            print('RMSProp Cost: ', rmsprop_cost[i],
                  'RMSProp Classification: ', rmsprop_cr[i])
    return rmsprop_cost, rmsprop_cr, best_rms, best_iteration
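# A self-contained numpy sketch of the RMSProp normalization used in rmsprop() above
# (running cache of squared gradients, epsilon inside the square root), isolated from the
# momentum term and applied to a toy quadratic loss. The toy loss and variable names are
# illustrative assumptions, not from the original code.
import numpy as np

w_toy = np.array([5.0, -3.0])        # parameter being optimized
cache = np.ones_like(w_toy)          # running average of squared gradients
decay, lr, eps = 0.9, 0.1, 1e-9

for _ in range(200):
    grad = 2 * w_toy                 # gradient of the toy loss ||w||^2
    cache = decay * cache + (1 - decay) * grad**2
    w_toy -= lr * grad / np.sqrt(cache + eps)

print(w_toy)  # ends up close to the minimum at the origin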
def get_loss(self):
    return util.cross_entropy(self.O, util.one_hot(self.labels_active))
def full(self):
    for i in range(self.iterations):
        Y_train, Z = generate_Y(self.activation, self.Xtrain, self.w,
                                self.b_0, self.v, self.b_1)
        P_train = np.argmax(Y_train, axis=1)
        Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                               self.b_0, self.v, self.b_1)
        P_test = np.argmax(Y_test, axis=1)

        self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
        self.test_cost.append(cross_entropy(Y_test, self.Ttest))
        train_cr = classification_rate(P_train, self.Ytrain)
        self.train_cr.append(train_cr)
        test_cr = classification_rate(P_test, self.Ytest)
        self.test_cr.append(test_cr)
        if train_cr > self.best_train:
            self.best_train = train_cr
            self.train_iteration = i
        if test_cr > self.best_test:
            self.best_test = test_cr
            self.test_iteration = i

        # full-batch gradients
        g_v = derivative_v(self.activation, Z, Y_train, self.Ttrain)
        g_b1 = derivative_b1(self.activation, Y_train, self.Ttrain)
        g_w = derivative_w(self.activation, self.Xtrain, Y_train, Z,
                           self.Ttrain, self.v)
        g_b0 = derivative_b0(self.activation, Y_train, Z, self.Ttrain, self.v)

        # Adam: first/second moment estimates with bias correction
        self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
        self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
        self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
        self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))

        self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
        self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
        self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
        self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))

        self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
        self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
        self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
        self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))

        self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
        self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
        self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
        self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))

        # parameter updates
        self.v -= self.learning_rate * self.dm_v / np.sqrt(self.dv_v + self.epsilon)
        self.b_1 -= self.learning_rate * self.dm_b1 / np.sqrt(self.dv_b1 + self.epsilon)
        self.w -= self.learning_rate * self.dm_w / np.sqrt(self.dv_w + self.epsilon)
        self.b_0 -= self.learning_rate * self.dm_b0 / np.sqrt(self.dv_b0 + self.epsilon)

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[i],
                  'Train Classification Rate: ', self.train_cr[i])
def stochastic(self, samples):
    for i in range(self.iterations):
        current_X, current_T = shuffle(self.Xtrain, self.Ttrain)
        for s in range(samples):
            # single training example
            X = current_X[s, :].reshape(1, current_X.shape[1])
            T = current_T[s, :].reshape(1, current_T.shape[1])
            Y, Z = generate_Y(self.activation, X, self.w, self.b_0, self.v, self.b_1)

            Y_train, _ = generate_Y(self.activation, self.Xtrain, self.w,
                                    self.b_0, self.v, self.b_1)
            P_train = np.argmax(Y_train, axis=1)
            Y_test, _ = generate_Y(self.activation, self.Xtest, self.w,
                                   self.b_0, self.v, self.b_1)
            P_test = np.argmax(Y_test, axis=1)

            self.train_cost.append(cross_entropy(Y_train, self.Ttrain))
            self.test_cost.append(cross_entropy(Y_test, self.Ttest))
            train_cr = classification_rate(P_train, self.Ytrain)
            self.train_cr.append(train_cr)
            test_cr = classification_rate(P_test, self.Ytest)
            self.test_cr.append(test_cr)
            if train_cr > self.best_train:
                self.best_train = train_cr
                self.train_iteration = i
            if test_cr > self.best_test:
                self.best_test = test_cr
                self.test_iteration = i

            # gradients from the single example
            g_v = derivative_v(self.activation, Z, Y, T)
            g_b1 = derivative_b1(self.activation, Y, T)
            g_w = derivative_w(self.activation, X, Y, Z, T, self.v)
            g_b0 = derivative_b0(self.activation, Y, Z, T, self.v)

            # Adam: first/second moment estimates with bias correction
            self.m_v = self.decay_0 * self.m_v + (1 - self.decay_0) * g_v
            self.dm_v = self.m_v / (1 - self.decay_0**(i + 1))
            self.v_v = self.decay_1 * self.v_v + (1 - self.decay_1) * g_v**2
            self.dv_v = self.v_v / (1 - self.decay_1**(i + 1))

            self.m_b1 = self.decay_0 * self.m_b1 + (1 - self.decay_0) * g_b1
            self.dm_b1 = self.m_b1 / (1 - self.decay_0**(i + 1))
            self.v_b1 = self.decay_1 * self.v_b1 + (1 - self.decay_1) * g_b1**2
            self.dv_b1 = self.v_b1 / (1 - self.decay_1**(i + 1))

            self.m_w = self.decay_0 * self.m_w + (1 - self.decay_0) * g_w
            self.dm_w = self.m_w / (1 - self.decay_0**(i + 1))
            self.v_w = self.decay_1 * self.v_w + (1 - self.decay_1) * g_w**2
            self.dv_w = self.v_w / (1 - self.decay_1**(i + 1))

            self.m_b0 = self.decay_0 * self.m_b0 + (1 - self.decay_0) * g_b0
            self.dm_b0 = self.m_b0 / (1 - self.decay_0**(i + 1))
            self.v_b0 = self.decay_1 * self.v_b0 + (1 - self.decay_1) * g_b0**2
            self.dv_b0 = self.v_b0 / (1 - self.decay_1**(i + 1))

            # parameter updates
            self.v -= self.learning_rate * self.dm_v / np.sqrt(self.dv_v + self.epsilon)
            self.b_1 -= self.learning_rate * self.dm_b1 / np.sqrt(self.dv_b1 + self.epsilon)
            self.w -= self.learning_rate * self.dm_w / np.sqrt(self.dv_w + self.epsilon)
            self.b_0 -= self.learning_rate * self.dm_b0 / np.sqrt(self.dv_b0 + self.epsilon)

        if i % 100 == 0:
            print(i, 'Train Cost: ', self.train_cost[-1],
                  'Train Classification Rate: ', self.train_cr[-1])
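# A self-contained numpy sketch of the bias-corrected Adam update performed per parameter
# in full() and stochastic() above (epsilon kept inside the square root, as in those
# methods), applied to a toy quadratic loss. The toy loss and variable names are
# illustrative assumptions, not from the original class.
import numpy as np

w_toy = np.array([5.0, -3.0])
m = np.zeros_like(w_toy)
v2 = np.zeros_like(w_toy)
beta1, beta2, lr, eps = 0.9, 0.999, 0.1, 1e-8

for t in range(1, 201):
    grad = 2 * w_toy                     # gradient of the toy loss ||w||^2
    m = beta1 * m + (1 - beta1) * grad   # first moment
    v2 = beta2 * v2 + (1 - beta2) * grad**2  # second moment
    m_hat = m / (1 - beta1**t)           # bias correction
    v_hat = v2 / (1 - beta2**t)
    w_toy -= lr * m_hat / np.sqrt(v_hat + eps)

print(w_toy)  # ends up near the minimum at the origin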
def main():
    # load the data:
    (Xtrain, Ytrain), (Xtest, Ytest) = mnist.load_data()
    # print(Xtrain.shape)
    N, d, _ = Xtrain.shape
    D = d * d
    Ntest = len(Xtest)

    # normalize the data:
    Xtrain = Xtrain / 255.0
    Xtest = Xtest / 255.0

    # display:
    # n = np.random.choice(N)
    # plt.imshow(Xtrain[n], cmap='gray')
    # plt.title(str(Ytrain[n]))
    # plt.show()

    # reshape the data:
    Xtrain = Xtrain.reshape(N, D)
    Xtest = Xtest.reshape(Ntest, D)
    # print('Xtrain.max():', Xtrain.max())
    # print('Xtrain.shape:', Xtrain.shape)

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # define hyperparameters:
    epochs = 30
    print_period = 10
    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    M = 300  # the hidden layer size
    K = len(set(Ytrain))

    # randomly initialize the weights:
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # 1. mini-batch SGD:
    losses_batch = []
    errors_batch = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()

    print('\nmini-batch SGD')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
            Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # update the params:
            W2 -= lr*(derivative_W2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_batch.append(l)
                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
                # print('error_rate:', e)
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)

    # 2. mini-batch SGD with momentum - version 1:
    losses_momentum1 = []
    errors_momentum1 = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()
    mu = 0.9  # momentum term
    # initial values for the 'velocities':
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    print('\nmini-batch SGD with momentum - version 1')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
            Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # calculate the gradients:
            gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update the 'velocities':
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # update the params:
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_momentum1.append(l)
                e = error_rate(pY, Ytest)
                errors_momentum1.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
                # print('error_rate:', e)
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)

    '''
    # 3. mini-batch SGD with momentum - version 2:
    losses_momentum2 = []
    errors_momentum2 = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()
    mu = 0.9  # momentum term
    # initial values for the 'velocities':
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    # lr = 0.0004

    print('\nmini-batch SGD with momentum - version 2')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
            Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # calculate the gradients:
            gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update the 'velocities':
            dW2 = mu*dW2 + (1-mu)*gW2
            db2 = mu*db2 + (1-mu)*gb2
            dW1 = mu*dW1 + (1-mu)*gW1
            db1 = mu*db1 + (1-mu)*gb1
            # update the 'velocities':
            # dW2 = mu*dW2 + gW2
            # db2 = mu*db2 + gb2
            # dW1 = mu*dW1 + gW1
            # db1 = mu*db1 + gb1

            # update the params:
            W2 -= lr*dW2
            b2 -= lr*db2
            W1 -= lr*dW1
            b1 -= lr*db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_momentum2.append(l)
                e = error_rate(pY, Ytest)
                errors_momentum2.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
    # best result: epochs = 25, final_error = 0.0179
    '''

    # 4. mini-batch SGD with Nesterov momentum:
    losses_nesterov_momentum = []
    errors_nesterov_momentum = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()
    mu = 0.9  # momentum term
    # initial values for the 'velocities':
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    print('\nmini-batch SGD with Nesterov momentum')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
            Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # calculate the gradients:
            gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update the 'velocities':
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # update the params (look-ahead step):
            W2 += mu*dW2 - lr*gW2
            b2 += mu*db2 - lr*gb2
            W1 += mu*dW1 - lr*gW1
            b1 += mu*db1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_nesterov_momentum.append(l)
                e = error_rate(pY, Ytest)
                errors_nesterov_momentum.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                sys.stdout.flush()
                # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
                # print('error_rate:', e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)

    # plot the losses:
    plt.plot(losses_batch, label='mini-batch SGD')
    plt.plot(losses_momentum1, label='+ momentum')
    plt.plot(losses_nesterov_momentum, label='+ Nesterov momentum')
    plt.xlabel('iterations')
    plt.ylabel('loss')
    plt.legend()
    plt.show()