import sys

import numpy as np

import DataModel
import NN_1HL
import SimpleNN2


def trainSGD(netConfig, x, y, lmb):
    th1, th2 = SimpleNN2.initRandomThetas(netConfig)
    alpha = 0.1
    costs = []
    numSamples = x.shape[0]
    miniBatchSize = 200
    # Iterate once over all full mini-batches
    for i in range(numSamples // miniBatchSize):
        fr = i * miniBatchSize
        to = (i + 1) * miniBatchSize
        xi = x[fr:to, :]
        yi = y[fr:to]
        if len(costs) > 0:
            costBefore = costs[-1]
        else:
            costBefore = SimpleNN2.computeCost(netConfig, th1, th2, xi, yi, lmb)
        grad1, grad2 = SimpleNN2.computeGrad(netConfig, th1, th2, xi, yi, lmb)
        # Line-search for a step size in a band around the previous alpha
        alpha = findOptimalAlpha(netConfig, th1, th2, xi, yi, lmb,
                                 grad1, grad2, alpha / 2, alpha * 2)
        th1p = th1 - alpha * grad1
        th2p = th2 - alpha * grad2
        costAfter = SimpleNN2.computeCost(netConfig, th1p, th2p, xi, yi, lmb)
        # Accept the step only if it did not increase the mini-batch cost
        if costAfter <= costBefore:
            costs.append(costAfter)
            th1 = th1p
            th2 = th2p
        # else:
        #     # Find optimal alpha in a wider range
        #     alpha = findOptimalAlpha(netConfig, th1, th2, xi, yi, lmb,
        #                              grad1, grad2, alpha / 50, alpha)
        #     th1p = th1 - alpha * grad1
        #     th2p = th2 - alpha * grad2
        #     costAfter = SimpleNN2.computeCost(netConfig, th1p, th2p, xi, yi, lmb)
        #     if costAfter <= costBefore:
        #         costs.append(costAfter)
        #         th1 = th1p
        #         th2 = th2p
        if len(costs) > 0 and len(costs) % 10 == 0:
            print('Step', len(costs), 'with cost', costs[-1], 'and alpha', alpha)
    return th1, th2
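

# A minimal usage sketch for trainSGD (illustrative, not part of the original
# module). It borrows the DataModel helpers, the "..\\train.csv" path and the
# 784/70/10 network sizes from compareImplementations2 below; the lmb value
# of 10 is an assumption taken from the same function.
def demoTrainSGD():
    (x, y) = DataModel.loadData("..\\train.csv")
    y = y.astype(int)
    (x_train, x_cv, y_train, y_cv) = DataModel.splitData(x, y)
    netConfig = SimpleNN2.NeuralNetConfig(784, 70, 10)
    th1, th2 = trainSGD(netConfig, x_train, y_train, 10)
    # Report the unregularised cost on the held-out split
    print('CV cost:', SimpleNN2.computeCost(netConfig, th1, th2, x_cv, y_cv, 0))
    return th1, th2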


def compareImplementations2():
    # Compare the SimpleNN2 implementation against the reference NN_1HL
    # network on a small subset: costs and gradient sums should match closely.
    (x, y) = DataModel.loadData("..\\train.csv")
    y = y.astype(int)
    (x_train, x_cv, y_train, y_cv) = DataModel.splitData(x, y)
    x_sub = x_train[:500, :]
    y_sub = y_train[:500]

    s_my = SimpleNN2.NeuralNetConfig(784, 70, 10)
    s_t = NN_1HL.NN_1HL(reg_lambda=10, opti_method='CG')

    # Use a fixed seed so both implementations see identical initial thetas
    np.random.seed(123)
    thetas = [s_t.rand_init(784, 70), s_t.rand_init(70, 10)]

    # Check costs
    cost_t = s_t.function(s_t.pack_thetas(thetas[0].copy(), thetas[1].copy()),
                          784, 70, 10, x_sub, y_sub, 10)
    print("Cost test: ", cost_t)
    cost_my = SimpleNN2.computeCost(s_my, thetas[0], thetas[1], x_sub, y_sub, 10)
    print("Cost my: ", cost_my)

    # Check gradients
    grad_t = s_t.function_prime(s_t.pack_thetas(thetas[0].copy(), thetas[1].copy()),
                                784, 70, 10, x_sub, y_sub, 10)
    print("Grad sum test: ", np.sum(grad_t))
    grad_my1, grad_my2 = SimpleNN2.computeGrad(s_my, thetas[0], thetas[1], x_sub, y_sub, 10)
    print("Grad sum my: ", np.sum(grad_my1) + np.sum(grad_my2))
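

# Sketch of a finite-difference cross-check that complements the comparison
# above (a hypothetical helper, not in the original code): perturb a few
# random entries of theta1 and compare the central-difference estimate of
# the cost gradient against the analytic one from SimpleNN2.computeGrad.
def checkGradNumerically(netConfig, th1, th2, x, y, lmb, eps=1e-4, nChecks=5):
    grad1, _ = SimpleNN2.computeGrad(netConfig, th1, th2, x, y, lmb)
    for _ in range(nChecks):
        idx = np.unravel_index(np.random.randint(th1.size), th1.shape)
        thPlus, thMinus = th1.copy(), th1.copy()
        thPlus[idx] += eps
        thMinus[idx] -= eps
        numGrad = (SimpleNN2.computeCost(netConfig, thPlus, th2, x, y, lmb)
                   - SimpleNN2.computeCost(netConfig, thMinus, th2, x, y, lmb)) / (2 * eps)
        print(idx, 'analytic:', grad1[idx], 'numeric:', numGrad)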


def trainGradientDescent2(netConfig, x, y, lmb):
    th1, th2 = SimpleNN2.initRandomThetas(netConfig)
    alpha = 2.0
    costs = []
    while True:
        costBefore = SimpleNN2.computeCost(netConfig, th1, th2, x, y, lmb)
        grad1, grad2 = SimpleNN2.computeGrad(netConfig, th1, th2, x, y, lmb)
        th1p = th1 - alpha * grad1
        th2p = th2 - alpha * grad2
        costAfter = SimpleNN2.computeCost(netConfig, th1p, th2p, x, y, lmb)
        skipUpdate = False
        if costAfter > costBefore:
            # The step overshot and the cost oscillates: shrink alpha and retry
            alpha = alpha / 1.01
            skipUpdate = True
            print("Decreasing alpha due to cyclic behaviour")
        if not skipUpdate:
            costs.append(costAfter)
            th1 = th1p
            th2 = th2p
        if len(costs) > 0 and len(costs) % 10 == 0:
            print('Epoch', len(costs), 'with cost', costs[-1], 'and alpha', alpha)
        # Converged when the cost has plateaued and alpha is already small;
        # otherwise shrink alpha further and keep going
        if len(costs) > 2 and abs(costs[-2] - costs[-1]) < 0.00001:
            if alpha < 0.02:
                break
            print("Decreasing alpha due to close costs")
            alpha = alpha / 1.5
    return th1, th2
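

# The alpha handling above is essentially a crude backtracking scheme: take a
# step, and shrink alpha whenever the cost goes up. A self-contained, purely
# didactic illustration on f(t) = t**2 (not from the original code); it also
# shows the oscillation that the "cyclic behaviour" message refers to:
def backtrackingDemo(iterations=500):
    t, alpha = 5.0, 2.0
    for _ in range(iterations):
        grad = 2 * t                # f'(t) for f(t) = t**2
        tNew = t - alpha * grad
        if tNew ** 2 > t ** 2:      # overshoot: the cost increased
            alpha = alpha / 1.01    # shrink the step and retry
        else:
            t = tNew                # accept the step
    print('t =', t, 'alpha =', alpha)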


def findOptimalAlpha(netConfig, theta1, theta2, x, y, lmb, grad1, grad2, alphaFrom, alphaTo):
    # Grid-search the step size: evaluate the cost at 15 evenly spaced
    # alphas in [alphaFrom, alphaTo] and return the one with the lowest cost.
    alphas = np.linspace(alphaFrom, alphaTo, 15)
    bestAlpha = 0
    bestCost = sys.float_info.max
    for a in alphas:
        theta1p = theta1 - a * grad1
        theta2p = theta2 - a * grad2
        cost = SimpleNN2.computeCost(netConfig, theta1p, theta2p, x, y, lmb)
        if cost < bestCost:
            bestCost = cost
            bestAlpha = a
    return bestAlpha
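

# findOptimalAlpha is a single coarse grid search. A possible refinement (a
# sketch, not part of the original code) is to search the wide range first
# and then re-search a narrow band around the winner:
def findOptimalAlphaRefined(netConfig, theta1, theta2, x, y, lmb,
                            grad1, grad2, alphaFrom, alphaTo):
    coarse = findOptimalAlpha(netConfig, theta1, theta2, x, y, lmb,
                              grad1, grad2, alphaFrom, alphaTo)
    step = (alphaTo - alphaFrom) / 14.0  # spacing of the 15-point coarse grid
    return findOptimalAlpha(netConfig, theta1, theta2, x, y, lmb, grad1, grad2,
                            max(alphaFrom, coarse - step),
                            min(alphaTo, coarse + step))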