def sgd(path, init, lr, lmda):
    #runs stochastic gradient descent on the function defined above
    #starting at the initial guess of the params provided as an argument
    #it also assumes the data you want to use is train_sgd, test_sgd
    #outputs the letter and word wise error after ~1000 updates
    #the below line is for testing if needed
    #print(check_grad(func1, func_prime, guess, random.choice(data), lmda))
    print("Reading Train Data...")
    data = read_data.read_train_sgd()
    print("Reading Test Data...")
    test_data = read_data.read_test_sgd()
    guess = np.copy(init)
    W, T = guess[:26*129].reshape((26, 129)),\
           guess[26*129:].reshape((26, 26))
    #variables for printing to file
    i, f = 0, open(path + f"/sgd-{lr}-{lmda}.txt", "w")
    #momentum variable
    m = np.zeros(129 * 26 + 26 * 26, dtype=np.longdouble)
    #run descent until the objective stops changing
    print(f"Starting SGD with Momentum: lr:{lr} lambda:{lmda}")
    print(f"Starting SGD with Momentum: lr:{lr} lambda:{lmda}", file=f)
    prev = 0.0
    while True:
        #compute the decayed learning rate
        temp_lr = lr / (1 + 0.5 * i)
        #now check if we have converged; print and return if that is the case
        current = func(guess, data, lmda)
        print(f"{i}:{current}:{temp_lr}", file=f)
        print(f"{i}\t{current}\t{temp_lr}")
        if abs(current - prev) < 1e-3:
            print("Convergence")
            return
        else:
            prev = current
        #one pass over the data: func_prime fills the module-level log_grad with
        #this example's gradient, then m <- 0.9*m + temp_lr*grad and guess <- guess - m
        for j in range(len(data)):
            func_prime(guess, data[j], lmda)
            np.multiply(0.9, m, out=m)
            np.multiply(temp_lr, log_grad, out=log_grad)
            np.add(m, log_grad, out=m)
            np.subtract(guess, m, out=guess)
        i += 1
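#usage sketch (hypothetical, not part of the original code): demo_sgd, the "."
#output path and the lr/lambda values below are illustrative assumptions; the
#only grounded detail is that sgd() expects a flat parameter vector of length
#26*129 + 26*26 and the train/test files readable by read_data.
def demo_sgd():
    init = np.zeros(26 * 129 + 26 * 26, dtype=np.longdouble)
    sgd(".", init, 0.01, 1e-2)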
import numpy as np, read_data, sgd

def adam_gd():
    log_grad = np.zeros(26*129 + 26*26)
    m_grad = np.zeros(26*129 + 26*26)
    v_grad = np.zeros(26*129 + 26*26)
    t = 0
    # Adam hyperparameters
    alpha = 0.001
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-10
    max_iter = 3000
    tol = 1e-6
    # initial guess of 0
    guess = np.zeros(26*129 + 26*26)
    data = read_data.read_train_sgd()
    l = 1e-2
    print('Running ADAM')
    while True:
        temp = np.zeros(26*129 + 26*26)
        guess_new = np.zeros(26*129 + 26*26)
        m_grad_new = np.zeros(26*129 + 26*26)
        v_grad_new = np.zeros(26*129 + 26*26)
        # print('Iteration ' + str(t))
        t = t + 1
        if t % 5 == 0:
            print(sgd.func(guess, data, l))
        for example in data:
            # Get gradient w.r.t. the stochastic objective at timestep t
            log_grad = sgd.func_prime(guess, example, l)
            # Update biased first moment estimate
            m_grad = beta1*m_grad + (1 - beta1)*log_grad
            # Update biased second raw moment estimate
            v_grad = beta2*v_grad + (1 - beta2)*np.square(log_grad)
            # Compute bias-corrected first moment estimate
            np.divide(m_grad, 1 - np.power(beta1, t), out=m_grad_new)
            # Compute bias-corrected second raw moment estimate
            np.divide(v_grad, 1 - np.power(beta2, t), out=v_grad_new)
            # Take the Adam step: guess_new = guess - alpha * m_hat / (sqrt(v_hat) + epsilon)
            np.multiply(m_grad_new, -1*alpha, out=temp)
            np.divide(temp, np.sqrt(v_grad_new) + epsilon, out=temp)
            np.add(guess, temp, out=guess_new)
            # print("Mean abs gradient is: " + str(np.mean(np.absolute(temp))))
            if np.mean(np.absolute(temp)) < tol:
                guess = guess_new
                break
            else:
                guess = guess_new
        if t > max_iter:
            break
    return guess_new
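# Reference sketch (not part of the original code): the same Adam update that
# adam_gd applies to the CRF objective, shown on a toy quadratic so the moment
# and bias-correction steps are easy to follow in isolation. The function name,
# the toy objective 0.5*||x||^2 and all constants here are illustrative
# assumptions.
def adam_toy_step_demo(steps=200, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    x = np.ones(4)                      # toy parameters
    m = np.zeros_like(x)                # first moment estimate
    v = np.zeros_like(x)                # second raw moment estimate
    for t in range(1, steps + 1):
        g = x                           # gradient of 0.5*||x||^2 is x itself
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * np.square(g)
        m_hat = m / (1 - beta1 ** t)    # bias-corrected first moment
        v_hat = v / (1 - beta2 ** t)    # bias-corrected second moment
        x = x - alpha * m_hat / (np.sqrt(v_hat) + epsilon)
    return x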
import numpy as np, read_data, prob_grad, random
from scipy.optimize import check_grad

l = 10
data = read_data.read_train_sgd()

def func(params, *args):
    #computes the function value for a single example
    W, T = params[:26*129].reshape((26, 129)),\
           params[26*129:].reshape((26, 26))
    x, y = args[0]
    l = args[1]
    log_p = prob_grad.compute_log_p(x, y, W, T)
    return -1*log_p + 0.5*l*(np.sum(np.square(W)) + np.sum(np.square(T)))

def func_prime(params, *args):
    #computes the derivative of a single example
    W, T = params[:26*129].reshape((26, 129)),\
           params[26*129:].reshape((26, 26))
    x, y = args[0]
    l = args[1]
    log_grad = np.zeros(26 * 129 + 26 * 26)
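#gradient sanity check (sketch, not part of the original code): compares
#func_prime against scipy's numerical gradient on one random training example,
#in the spirit of the commented-out check_grad line in sgd() above. It assumes
#func_prime returns the flat gradient vector (as the call
#log_grad = sgd.func_prime(...) in adam_gd suggests); the lambda value is an
#illustrative placeholder.
def check_example_gradient(lmda=1e-2):
    guess0 = np.zeros(26 * 129 + 26 * 26)
    return check_grad(func, func_prime, guess0, random.choice(data), lmda)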
def adam_mcmc(path, init, lr, lmda, epsilon, s):
    #runs the adam optimizer, inspired by ashwani
    print("Reading Train Data...")
    data = read_data.read_train_sgd()
    print("Reading Test Data...")
    test_data = read_data.read_test_sgd()
    print("Computing the frequencies")
    table = compute_freq(data)
    guess = np.copy(init)
    W, T = guess[:26*129].reshape((26, 129)),\
           guess[26*129:].reshape((26, 26))
    #adam parameters
    t, b1, b2 = 0, 0.9, 0.999
    m, v = np.zeros(26 * 129 + 26 * 26, dtype=np.longdouble),\
           np.zeros(26 * 129 + 26 * 26, dtype=np.longdouble)
    i, f = 0, open(path + f"/adam-{lr}-{lmda}.txt", "w")
    print(f"Running Adam: lr:{lr} lambda:{lmda} epsilon:{epsilon}")
    print(f"Running Adam: lr:{lr} lambda:{lmda} epsilon:{epsilon}", file=f)
    prev = 0.0
    while True:
        #every 30 updates, log the objective and test error and check convergence
        if t % 30 == 0:
            current = func(guess, data, lmda)
            error = compute_test_error(f, test_data, W, T)
            print(f"{i}:{current}:{error}", file=f)
            print(f"{i}:{current}:{error}")
            if abs(current - prev) < 1e-3:
                print("Convergence")
                return
            else:
                prev = current
        i += 1
        t += 1
        temp_lr = lr / (1 + 0.5 * i)
        #draw one example according to the frequency table and compute its gradient
        #(func_prime fills the module-level log_grad)
        example = sample(data, table, s)
        func_prime(guess, example, lmda)
        #update biased first and second moment estimates
        np.multiply(b1, m, out=m)
        np.add(m, np.multiply((1 - b1), log_grad), out=m)
        np.multiply(b2, v, out=v)
        np.square(log_grad, out=log_grad)
        np.multiply((1 - b2), log_grad, out=log_grad)
        np.add(v, log_grad, out=v)
        #bias-corrected moments, kept in separate arrays so the running m and v
        #are not overwritten between updates
        m_hat = np.divide(m, 1 - np.power(b1, t))
        v_hat = np.divide(v, 1 - np.power(b2, t))
        #take the Adam step: guess <- guess - temp_lr * m_hat / (sqrt(v_hat) + epsilon)
        np.multiply(-1 * temp_lr, m_hat, out=m_hat)
        np.sqrt(v_hat, out=v_hat)
        np.add(v_hat, epsilon, out=v_hat)
        np.divide(m_hat, v_hat, out=m_hat)
        np.add(guess, m_hat, out=guess)
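#usage sketch (hypothetical, not part of the original code): demo_adam_mcmc and
#every value below, including the argument passed for s, are illustrative
#placeholders; the call only assumes the signature of adam_mcmc defined above
#and the compute_freq/sample/compute_test_error helpers it relies on.
def demo_adam_mcmc():
    init = np.zeros(26 * 129 + 26 * 26, dtype=np.longdouble)
    adam_mcmc(".", init, 0.001, 1e-2, 1e-8, 1)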