def adam(grad, init_params, subopt=None, callback=None, break_cond=None,
         num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.

    It's basically RMSprop with momentum and some correction terms.

    The flattened gradient is evaluated as ``flattened_grad(x, i)`` with ``i``
    the iteration index.  ``step_size`` is either a scalar or a sequence with
    one entry per iteration.  Optional hooks, all called once per iteration:

    * ``callback(params, i, grad)`` before the update, with unflattened values;
    * ``subopt(x, g, i)`` after the update, whose return value replaces ``x``;
    * ``break_cond(x, i, g)`` last; a truthy result stops the loop.

    Returns the final parameters, unflattened.
    """
    flat_grad, unflatten, x = flatten_func(grad, init_params)

    # A scalar step size is expanded into a constant per-iteration schedule.
    if np.isscalar(step_size):
        step_size = np.ones(num_iters) * step_size
    assert len(step_size) == num_iters, "step schedule needs to match num iter"

    first_moment = np.zeros(len(x))
    second_moment = np.zeros(len(x))
    for i in range(num_iters):
        g = flat_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))

        # Exponential moving averages of the gradient and its square.
        first_moment = (1 - b1) * g + b1 * first_moment
        second_moment = (1 - b2) * (g**2) + b2 * second_moment

        # Bias correction for the zero-initialised averages.
        step = i + 1
        m_hat = first_moment / (1 - b1**step)
        v_hat = second_moment / (1 - b2**step)
        x = x - step_size[i] * m_hat / (np.sqrt(v_hat) + eps)

        if subopt is not None:
            x = subopt(x, g, i)
        if break_cond is not None and break_cond(x, i, g):
            break
    return unflatten(x)
def gradient_descent_beta(g, w, alpha, max_its, beta, version):
    """Gradient descent with a momentum term; returns the weight history.

    g       -- function to minimise
    w       -- initial weights (flattened internally via flatten_func)
    alpha   -- steplength
    max_its -- number of descent steps
    beta    -- momentum coefficient applied to the accumulated direction
    version -- 'normalized' divides each gradient by its norm; any other
               value uses the raw gradient

    The returned list holds the unflattened initial weights followed by the
    weights after every step.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    history = [unflatten(w)]
    momentum = np.zeros((np.shape(w)))
    for _ in range(max_its):
        direction = grad(w)
        direction.shape = np.shape(w)

        if version == 'normalized':
            norm = np.linalg.norm(direction)
            if norm == 0:
                # Replace a zero norm by +/-1e-6 so the division is defined;
                # the (zero) gradient itself is left unchanged by this.
                norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            direction /= norm

        # Accumulate the direction, then step along it.
        momentum = beta * momentum + direction
        w = w - alpha * momentum
        history.append(unflatten(w))
    return history
def sgd(grad, init_params, subopt=None, callback=None, break_cond=None,
        num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.

    grad() must have signature grad(x, i), where i is the iteration number.

    ``step_size`` may be a scalar or a per-iteration sequence of length
    ``num_iters``.  ``callback(params, i, grad)`` runs before each update,
    ``subopt(x, g, i)`` may replace ``x`` after it, and a truthy
    ``break_cond(x, i, g)`` ends the loop early.  Returns the final
    parameters, unflattened.
    """
    flat_grad, unflatten, x = flatten_func(grad, init_params)

    # Expand a scalar step size into a constant schedule.
    if np.isscalar(step_size):
        step_size = np.ones(num_iters) * step_size
    assert len(step_size) == num_iters, "step schedule needs to match num iter"

    vel = np.zeros(len(x))
    for i in range(num_iters):
        g = flat_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))

        # Velocity is a running blend of its past value and the negative gradient.
        vel = mass * vel - (1.0 - mass) * g
        x = x + step_size[i] * vel

        if subopt is not None:
            x = subopt(x, g, i)
        if break_cond is not None and break_cond(x, i, g):
            break
    return unflatten(x)
def gradient_descent(g, w, alpha, max_its, beta):
    """Gradient descent with momentum; returns the list of visited weights.

    The input function is flattened with flatten_func so the loop works on a
    single weight vector.  Each step accumulates ``z = beta * z + grad`` and
    moves ``w`` by ``-alpha * z``.  The history starts with the initial
    weights and gains one (unflattened) entry per step.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    history = [unflatten(w)]
    momentum = np.zeros(np.shape(w))
    for _ in range(max_its):
        step_grad = grad(w)
        step_grad.shape = np.shape(w)

        momentum = beta * momentum + step_grad
        w = w - alpha * momentum
        history.append(unflatten(w))
    return history
def gradient_descent(g, alpha, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch gradient descent; returns the per-epoch weight history.

    ``g`` is evaluated (through its flattened form) as ``g(w, batch_inds)``
    where ``batch_inds`` is a contiguous index range of at most ``batch_size``
    points out of ``num_pts``.  One epoch sweeps all mini-batches in order and
    then records the weights.  Extra keyword arguments are accepted and
    ignored.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)

    history = [unflatten(w)]

    # Number of mini-batches needed to cover the whole dataset once.
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            lo = batch * batch_size
            hi = min((batch + 1) * batch_size, num_pts)
            batch_inds = np.arange(lo, hi)

            cost_eval, grad_eval = value_grad(w, batch_inds)
            grad_eval.shape = np.shape(w)
            w = w - alpha * grad_eval

        # One history entry per epoch.
        history.append(unflatten(w))
    return history
def newtons_method(self, g, w, **kwargs):
    """Run Newton's method on ``g`` and return the weight history.

    Stores ``g``, its gradient, its Hessian and the regularisation constant
    on the instance as ``self.g``, ``self.grad``, ``self.hess`` and
    ``self.epsilon``.

    Keyword arguments:
    max_its -- number of Newton steps (default 20)
    epsilon -- value added to the Hessian diagonal (default 1e-5)
    verbose -- print start/finish messages (default False)

    From the fourth step on (k > 2), a step that increases the function value
    aborts the run: a message is printed and the history collected so far is
    returned without the offending weights.
    """
    self.g = g

    # Work on a flattened weight vector for a simpler descent loop.
    flat_g, unflatten, w = flatten_func(self.g, w)
    self.grad = compute_grad(flat_g)
    self.hess = compute_hess(flat_g)

    max_its = kwargs.get('max_its', 20)
    self.epsilon = kwargs.get('epsilon', 10**(-5))
    verbose = kwargs.get('verbose', False)

    history = [unflatten(w)]

    if verbose == True:
        print('starting optimization...')

    previous_value = flat_g(w)
    for k in range(max_its):
        grad_val = self.grad(w)
        hess_val = self.hess(w)
        dim = np.size(w)
        hess_val.shape = (dim, dim)

        # Newton step using the pseudo-inverse of the regularised Hessian.
        regularised = hess_val + self.epsilon * np.eye(np.size(w))
        w = w - np.dot(np.linalg.pinv(regularised), grad_val)

        current_value = flat_g(w)
        if k > 2 and current_value > previous_value:
            print('singular system reached')
            time.sleep(1.5)
            clear_output()
            return history
        previous_value = current_value

        history.append(unflatten(w))

    if verbose == True:
        print('...optimization complete!')
        time.sleep(1.5)
        clear_output()
    return history
def gradient_descent(self,g,alpha_choice,max_its,w,v):
    """Gradient descent on ``g(w, v)``; returns weight and cost histories.

    ``alpha_choice`` is either the string 'diminishing' (steplength 1/k at
    step k, counting from 1) or a fixed steplength.  ``v`` is passed through
    unchanged as the second argument of every function / gradient evaluation.
    A progress line is printed on each iteration.

    Returns ``(w_hist, train_hist)``: the unflattened weights and the cost at
    the start and after every step.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)

    weights_seen = [unflatten(w)]
    costs_seen = [g_flat(w,v)]

    for k in range(1,max_its+1):
        print('iteration: ', k, end = "\r")

        if alpha_choice == 'diminishing':
            alpha = 1/float(k)
        else:
            alpha = alpha_choice

        cost_eval,grad_eval = value_grad(w,v)
        grad_eval.shape = np.shape(w)
        w = w - alpha*grad_eval

        # Re-evaluate the cost at the updated weights for the history.
        costs_seen.append(g_flat(w,v))
        weights_seen.append(unflatten(w))
    return weights_seen,costs_seen
def adam(grad, init_params, callback=None, num_iters=100, step_size=0.001,
         b1=0.9, b2=0.999, eps=10**-8):
    """Adam optimiser with a fixed step size.

    The flattened gradient is evaluated as ``flattened_grad(x, i)``.  If given,
    ``callback(params, i, grad)`` is called with unflattened values before
    each update.  Returns the final parameters, unflattened.
    """
    flat_grad, unflatten, x = flatten_func(grad, init_params)

    mean_est = np.zeros(len(x))
    sq_est = np.zeros(len(x))
    for i in range(num_iters):
        g = flat_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))

        # First and second moment estimates.
        mean_est = (1 - b1) * g + b1 * mean_est
        sq_est = (1 - b2) * (g**2) + b2 * sq_est

        # Bias-corrected estimates.
        t = i + 1
        mean_hat = mean_est / (1 - b1**t)
        sq_hat = sq_est / (1 - b2**t)

        x = x - step_size * mean_hat / (np.sqrt(sq_hat) + eps)
    return unflatten(x)
def newtons_method(g, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch Newton's method; returns weight and full-data cost histories.

    ``g`` is evaluated (through its flattened form) as ``g(w, inds)``.  Each
    epoch sweeps contiguous mini-batches of at most ``batch_size`` indices and
    applies one regularised Newton step per batch.  After every epoch the
    weights and the cost over all ``num_pts`` indices are recorded.

    Keyword arguments:
    epsilon -- value added to the Hessian diagonal (default 1e-7)

    Returns ``(w_hist, cost_hist)``, each with ``max_its + 1`` entries.
    """
    # Flatten the input function in case it takes matrices of weights.
    g_flat, unflatten, w = flatten_func(g, w)

    gradient = value_and_grad(g_flat)
    hess = hessian(g_flat)

    # Numerical stability / regularisation parameter.
    epsilon = kwargs.get('epsilon', 10**(-7))

    w_hist = [unflatten(w)]
    cost_hist = [g_flat(w, np.arange(num_pts))]

    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_pts))

            cost_eval, grad_eval = gradient(w, batch_inds)
            hess_eval = hess(w, batch_inds)

            # Reshape the Hessian into a square matrix for numpy.linalg.
            side = int((np.size(hess_eval))**(0.5))
            hess_eval.shape = (side, side)

            # Solve A w_new = A w - grad, i.e. the Newton step
            # w_new = w - A^{-1} grad, as a least-squares problem.
            system = hess_eval + (epsilon) * np.eye(np.size(w))
            w = np.linalg.lstsq(system, np.dot(system, w) - grad_eval)[0]

        # Record weights and full-data cost after each epoch.
        w_hist.append(unflatten(w))
        cost_hist.append(g_flat(w, np.arange(num_pts)))
    return w_hist, cost_hist
def gradient_descent(g, w, x_train, x_val, alpha, max_its, batch_size, **kwargs):
    """Mini-batch gradient descent tracking training and validation cost.

    ``g`` is evaluated (through its flattened form) as ``g(w, data, inds)``.
    The number of points is read from ``shape[1]`` of ``x_train`` / ``x_val``.
    After every epoch the weights and the full training and validation costs
    are recorded.

    Keyword arguments:
    verbose -- print a timing / cost line per epoch (default True)

    Returns ``(w_hist, train_hist, val_hist)``, each starting with the value
    at the initial weights.
    """
    verbose = kwargs.get('verbose', True)

    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)

    num_train = x_train.shape[1]
    num_val = x_val.shape[1]
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, x_train, np.arange(num_train))]
    val_hist = [g_flat(w, x_val, np.arange(num_val))]

    # Number of mini-batches needed to cover the training set once.
    num_batches = int(np.ceil(np.divide(num_train, batch_size)))

    for epoch in range(max_its):
        start = timer()
        for batch in range(num_batches):
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_train))
            cost_eval, grad_eval = value_grad(w, x_train, batch_inds)
            grad_eval.shape = np.shape(w)
            w = w - alpha * grad_eval
        end = timer()

        # Full-data costs at the end of the epoch.
        w_hist.append(unflatten(w))
        train_hist.append(g_flat(w, x_train, np.arange(num_train)))
        val_hist.append(g_flat(w, x_val, np.arange(num_val)))

        if verbose == True:
            elapsed = str(np.round(end - start, 1))
            train_txt = str(np.round(train_hist[-1][0], 4))
            val_txt = str(np.round(val_hist[-1][0], 4))
            print('step ' + str(epoch + 1) + ' done in ' + elapsed +
                  ' secs, train cost = ' + train_txt +
                  ', val cost = ' + val_txt)

    if verbose == True:
        print('finished all ' + str(max_its) + ' steps')
    return w_hist, train_hist, val_hist
def newtons_method(g, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch Newton's method; returns the per-epoch weight history.

    ``g`` is evaluated (through its flattened form) as ``g(w, batch_inds)``.
    Each epoch sweeps contiguous mini-batches of at most ``batch_size``
    indices out of ``num_pts`` and applies one regularised Newton step per
    batch.

    Keyword arguments:
    epsilon -- value added to the Hessian diagonal (default 1e-7)

    Returns a list with the initial weights followed by the weights after
    each epoch (``max_its + 1`` entries).  The final weights are recorded
    once; an earlier revision appended them a second time after the loop.
    """
    # Flatten the input function in case it takes matrices of weights.
    flat_g, unflatten, w = flatten_func(g, w)

    # value_and_grad returns both the function value and the gradient.
    gradient = value_and_grad(flat_g)
    hess = hessian(flat_g)

    # Numerical stability / regularisation parameter.
    epsilon = kwargs.get('epsilon', 10**(-7))

    w_hist = [unflatten(w)]

    # Number of mini-batches needed to cover the dataset once.
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_pts))

            cost_eval, grad_eval = gradient(w, batch_inds)
            hess_eval = hess(w, batch_inds)

            # Reshape the Hessian into a square matrix for numpy.linalg.
            side = int((np.size(hess_eval))**(0.5))
            hess_eval.shape = (side, side)

            # Solve A w_new = A w - grad, i.e. the Newton step
            # w_new = w - A^{-1} grad, as a least-squares problem.
            A = hess_eval + epsilon * np.eye(np.size(w))
            w = np.linalg.lstsq(A, np.dot(A, w) - grad_eval)[0]

        # Record weights after each epoch (this already covers the final ones).
        w_hist.append(unflatten(w))
    return w_hist
def gradient_descent(g, alpha_choice, max_its, w, version, beta):
    """Gradient descent with momentum; returns weight and cost histories.

    g            -- function to minimise
    alpha_choice -- fixed steplength, or 'diminishing' for 1/k at step k
    max_its      -- number of descent steps
    w            -- initial weights (flattened internally)
    version      -- 'normalized' divides each gradient by its norm; any other
                    value uses the raw gradient
    beta         -- momentum coefficient (0 gives plain gradient descent)

    Returns ``(weight_history, cost_history)``.  Entry ``i`` of each list is
    the (unflattened) weights before step ``i`` and the cost at those same
    weights; the last entry is the final weights and their cost, so both
    lists have ``max_its + 1`` entries.

    Exactly one step is taken per iteration.  An earlier revision applied a
    momentum step and then a second plain gradient step, and recorded the
    post-step weights next to the pre-step cost.
    """
    # Flatten the input function; note the returned 'w' is flattened too.
    g_flat, unflatten, w = flatten_func(g, w)

    # value_and_grad returns both the function value and the gradient.
    gradient = value_and_grad(g_flat)

    weight_history = []
    cost_history = []
    z = np.zeros(np.shape(w))  # momentum term

    for k in range(1, max_its + 1):
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k)
        else:
            alpha = alpha_choice

        cost_eval, grad_eval = gradient(w)

        # Record weights together with the cost evaluated at those weights.
        weight_history.append(unflatten(w))
        cost_history.append(cost_eval)

        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                # Zero gradient: pick a random sign direction instead.
                grad_eval = 10**-6 * np.sign(2 * np.random.rand(len(w)) - 1)
                grad_norm = np.linalg.norm(grad_eval)
            grad_eval = grad_eval / grad_norm

        # Single descent step along the accumulated momentum direction.
        z = beta * z + grad_eval
        w = w - alpha * z

    # The gradient is not evaluated at the final weights, so compute the
    # final cost from the function itself.
    weight_history.append(unflatten(w))
    cost_history.append(g_flat(w))
    return weight_history, cost_history
def gradient_descent(g, w, a_train, s_train, alpha, max_its, verbose):
    '''
    A basic gradient descent module (full batch) for system identification
    training.

    Inputs to gradient_descent function:

    g - function to minimize
    w - initial weights
    a_train - training action sequence
    s_train - training state sequence
    alpha - steplength / learning rate
    max_its - number of iterations to perform
    verbose - print out update each step if verbose = True

    Returns (w_hist, train_hist): the unflattened weights and the training
    cost at the initial weights and after every step.
    '''
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = value_and_grad(g_flat)

    # record history
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, a_train, s_train)]

    for k in range(1, max_its + 1):
        start = timer()

        # evaluate cost and gradient, then take a single descent step
        cost_eval, grad_eval = grad(w, a_train, s_train)
        grad_eval.shape = np.shape(w)
        w = w - alpha * grad_eval
        end = timer()

        # record weight update and training cost at the new weights
        w_hist.append(unflatten(w))
        train_hist.append(g_flat(w, a_train, s_train))

        if verbose == True:
            # k already counts from 1, so it is the step number as-is.
            # np.ravel lets the cost be either a scalar or a length-1 array.
            latest_cost = np.ravel(train_hist[-1])[0]
            print('step ' + str(k) + ' done in ' +
                  str(np.round(end - start, 1)) +
                  ' secs, train cost = ' + str(np.round(latest_cost, 4)))

    if verbose == True:
        print('finished all ' + str(max_its) + ' steps')
    return w_hist, train_hist
def rmsprop(grad, init_params, callback=None, num_iters=100, step_size=0.1,
            gamma=0.9, eps=10**-8):
    """Root mean squared prop: See Adagrad paper for details.

    The flattened gradient is evaluated as ``flattened_grad(x, i)``.  A running
    average of squared gradients (initialised to ones, decay ``gamma``) scales
    each step.  If given, ``callback(params, i, grad)`` is called with
    unflattened values before each update.  Returns the final parameters,
    unflattened.
    """
    flat_grad, unflatten, x = flatten_func(grad, init_params)

    running_sq = np.ones(len(x))
    for i in range(num_iters):
        g = flat_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))

        running_sq = running_sq * gamma + g**2 * (1 - gamma)
        scale = np.sqrt(running_sq) + eps
        x = x - step_size * g/scale
    return unflatten(x)
def RMSprop(self, g, w, x_train, y_train, lam, alpha, max_its, batch_size, **kwargs):
    """Mini-batch RMSprop; returns weight and training-cost histories.

    ``g`` is evaluated (through its flattened form) as
    ``g(w, x_train, y_train, lam, inds)``.  The number of training points is
    ``y_train.size``.  After each epoch the weights and the cost over all
    training indices are recorded.

    Keyword arguments:
    gamma -- decay of the squared-gradient average (default 0.9)
    eps   -- constant added to the denominator (default 1e-8)

    Returns ``(w_hist, train_hist)``.
    """
    gamma = kwargs.get('gamma', 0.9)
    eps = kwargs.get('eps', 10**-8)

    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)

    # Running average of squared gradients, initialised to ones.
    running_sq = np.ones(np.size(w))

    num_train = y_train.size
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, x_train, y_train, lam, np.arange(num_train))]

    num_batches = int(np.ceil(np.divide(num_train, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_train))

            cost_eval, grad_eval = value_grad(w, x_train, y_train, lam, batch_inds)
            grad_eval.shape = np.shape(w)

            running_sq = gamma * running_sq + (1 - gamma) * grad_eval**2
            w = w - alpha * grad_eval / (running_sq**(0.5) + eps)

        # Full training cost at the end of the epoch.
        w_hist.append(unflatten(w))
        train_hist.append(g_flat(w, x_train, y_train, lam, np.arange(num_train)))
    return w_hist, train_hist
def gradient_descent(g, alpha, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch gradient descent with optional sign-normalised steps.

    ``g`` is evaluated (through its flattened form) as ``g(w, batch_inds)``
    over contiguous mini-batches of at most ``batch_size`` indices out of
    ``num_pts``.  Weights are recorded once per epoch.

    Keyword arguments:
    normalize -- if True, each gradient is replaced by its elementwise sign
                 (default False)
    beta      -- accepted, but currently has no effect: the momentum update
                 is disabled in this implementation

    Returns the weight history (initial weights plus one entry per epoch).
    """
    normalize = kwargs.get('normalize', False)

    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)

    history = [unflatten(w)]

    # Number of mini-batches needed to cover the dataset once.
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_pts))

            cost_eval, grad_eval = value_grad(w, batch_inds)
            grad_eval.shape = np.shape(w)

            if normalize == True:
                grad_eval = np.sign(grad_eval)

            w = w - alpha * grad_eval

        history.append(unflatten(w))
    return history
def newtons_method(g, max_its, w, **kwargs):
    """Full-batch Newton's method; returns weight and cost histories.

    Keyword arguments:
    epsilon -- value added to the Hessian diagonal (default 1e-7)

    Entry ``i`` of each returned list is the (unflattened) weights before step
    ``i`` and the cost at those weights; the last entry is the final weights
    and their cost.
    """
    # Flatten the input function in case it takes matrices of weights.
    flat_g, unflatten, w = flatten_func(g, w)

    # value_and_grad returns both the function value and the gradient.
    gradient = value_and_grad(flat_g)
    hess = hessian(flat_g)

    # Numerical stability / regularisation parameter.
    epsilon = kwargs.get('epsilon', 10**(-7))

    weight_history, cost_history = [], []
    for _ in range(max_its):
        cost_eval, grad_eval = gradient(w)
        weight_history.append(unflatten(w))
        cost_history.append(cost_eval)

        hess_eval = hess(w)

        # Reshape the Hessian into a square matrix for numpy.linalg.
        side = int((np.size(hess_eval))**(0.5))
        hess_eval.shape = (side, side)

        # Solve A w_new = A w - grad, i.e. the Newton step
        # w_new = w - A^{-1} grad, as a least-squares problem.
        system = hess_eval + epsilon * np.eye(np.size(w))
        w = np.linalg.lstsq(system, np.dot(system, w) - grad_eval)[0]

    # The gradient is not evaluated at the final weights, so compute the
    # final cost from the function itself.
    weight_history.append(unflatten(w))
    cost_history.append(flat_g(w))
    return weight_history, cost_history
def minibatch_gradient_descent(g, alpha_choice, max_its, w, batch_size, num_pts):
    """Mini-batch gradient descent; returns weight and full-data cost histories.

    g            -- function to minimise, evaluated as g(w, inds)
    alpha_choice -- fixed steplength, or 'diminishing' for 1/(k + 1) at
                    epoch k (counting from 0)
    max_its      -- number of epochs
    w            -- initial weights (flattened internally)
    batch_size   -- maximum number of indices per mini-batch
    num_pts      -- total number of data points

    Returns ``(weight_history, cost_history)``, each holding the value at the
    initial weights followed by one entry per epoch.

    The diminishing rule uses k + 1 because the epoch counter starts at 0;
    dividing by k itself raised ZeroDivisionError on the first epoch.
    """
    # Flatten the input function, create gradient based on the flat function.
    g_flat, unflatten, w = flatten_func(g, w)

    # value_and_grad returns both the function value and the gradient.
    gradient = value_and_grad(g_flat)

    weight_history = [unflatten(w)]
    cost_history = [g_flat(w, np.arange(num_pts))]

    # Number of mini-batches needed to cover the dataset once.
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for k in range(max_its):
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k + 1)
        else:
            alpha = alpha_choice

        for b in range(num_batches):
            batch_inds = np.arange(b * batch_size,
                                   min((b + 1) * batch_size, num_pts))

            cost_eval, grad_eval = gradient(w, batch_inds)
            grad_eval.shape = np.shape(w)
            w = w - alpha * grad_eval

        # Record weights and full-data cost after each epoch.
        weight_history.append(unflatten(w))
        cost_history.append(g_flat(w, np.arange(num_pts)))
    return weight_history, cost_history
def gradient_descent(g, w_unflat, alpha_choice, max_its, version, **kwargs):
    """Gradient descent on unflattened weights; returns the weight history.

    g            -- function to minimise, taking the unflattened weights
    w_unflat     -- initial (unflattened) weights
    alpha_choice -- fixed steplength, or 'diminishing' for 1/(k + 1) at
                    iteration k (counting from 0)
    max_its      -- number of descent steps
    version      -- 'normalized' divides each gradient by its norm; any other
                    value uses the raw gradient

    Keyword arguments:
    verbose -- print progress every fifth iteration and at the end
               (default False)

    The gradient is taken of ``g`` itself and flattened afterwards; the step
    is applied to the flat weight vector and unflattened again for the next
    evaluation.

    The diminishing rule uses k + 1 because the counter starts at 0; dividing
    by k itself raised ZeroDivisionError on the first iteration.
    """
    verbose = kwargs.get('verbose', False)

    # Flatten the weights; the gradient is computed on the unflattened form.
    g_flat, unflatten, w = flatten_func(g, w_unflat)
    grad = compute_grad(g)

    w_hist = [w_unflat]

    for k in range(max_its):
        if verbose == True:
            if np.mod(k, 5) == 0:
                print('started iteration ' + str(k) + ' of ' + str(max_its))

        if alpha_choice == 'diminishing':
            alpha = 1 / float(k + 1)
        else:
            alpha = alpha_choice

        # Evaluate the gradient at the unflattened weights, then flatten it.
        grad_eval = grad(w_unflat)
        grad_eval, _ = flatten(grad_eval)

        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                # Replace a zero norm by +/-1e-6 so the division is defined.
                grad_norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            grad_eval /= grad_norm

        w = w - alpha * grad_eval

        w_unflat = unflatten(w)
        w_hist.append(w_unflat)

    if verbose == True:
        print('finished all ' + str(max_its) + ' iterations')
    return w_hist
def myadam(grad, init_params, callback=None, num_iters=100, step_sizes=0.001,
           b1=0.9, b2=0.999, eps=10**-8, gnorm_max=np.inf, last_m=None,
           last_v=None, last_i=0, lossfun=[], printstuff=0):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.

    It's basically RMSprop with momentum and some correction terms.  This
    variant adds gradient-norm clipping and can resume from a previous run.

    grad        -- gradient function; its flattened form is called as
                   flattened_grad(x, i)
    init_params -- initial parameters
    callback    -- optional; called before each update as
                   callback(params, i, grad, lossfun=lossfun)
    num_iters   -- number of iterations
    step_sizes  -- scalar (Python or NumPy) or per-iteration sequence of
                   length num_iters
    b1, b2, eps -- Adam decay rates and stability constant
    gnorm_max   -- gradients with a larger norm are rescaled to this norm
    last_m, last_v, last_i -- moment estimates and iteration index returned
                   by a previous call, used to continue that run
    lossfun     -- passed through to callback unchanged; this function never
                   mutates it, so the shared default list is left as-is
    printstuff  -- if truthy, print gradient norm and step size each iteration

    Returns ``(params, (m, v, last_index))`` where ``last_index`` equals
    ``last_i + num_iters - 1``.
    """
    flattened_grad, unflatten, x = flatten_func(grad, init_params)

    # np.isscalar also accepts NumPy scalar types, which an exact
    # type(...) == float / int comparison would reject.
    if np.isscalar(step_sizes):
        step_sizes = step_sizes * np.ones(num_iters)
    else:
        assert len(step_sizes) == num_iters

    m = np.zeros(len(x)) if last_m is None else last_m
    v = np.zeros(len(x)) if last_v is None else last_v

    for i in range(num_iters):
        g = flattened_grad(x, i)

        # Clip the gradient norm to gnorm_max.
        gnorm = np.linalg.norm(g)
        if gnorm > gnorm_max:
            if printstuff:
                print(" Gradient norm was: %0.4f" % gnorm)
            g = g * gnorm_max / gnorm
            gnorm = np.linalg.norm(g)
        if printstuff:
            print(" Gradient norm: %0.4f" % gnorm)
            print(" Step size: %0.4f" % step_sizes[i])

        if callback:
            callback(unflatten(x), i, unflatten(g), lossfun=lossfun)

        m = (1 - b1) * g + b1 * m  # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.

        # Bias correction counts iterations across resumed runs.
        t = i + last_i + 1
        mhat = m / (1 - b1**t)
        vhat = v / (1 - b2**t)
        x = x - step_sizes[i] * mhat / (np.sqrt(vhat) + eps)

    # Same value as the loop's final i + last_i, but also defined when
    # num_iters == 0 (where the loop variable would not exist).
    return unflatten(x), (m, v, last_i + num_iters - 1)
def normalized_gradient_descent(g, alpha, max_its, w):
    """Normalised gradient descent that also tracks the best weights seen.

    g       -- function to minimise
    alpha   -- steplength
    max_its -- number of descent steps
    w       -- initial weights (flattened internally)

    Each step moves by ``alpha`` along the gradient divided by its norm; a
    zero gradient is replaced by a random sign direction.

    Returns ``(weight_history, cost_history)``: one entry per step (weights
    after the step and their cost), followed by a final entry holding the
    lowest-cost weights seen and their cost.

    Compared with an earlier revision this no longer prints the raw weight
    vector / iteration counter, and evaluates the cost once per step with the
    function itself instead of computing extra gradients just to read it.
    """
    # Flatten the input function; note the returned 'w' is flattened too.
    g_flat, unflatten, w = flatten_func(g, w)

    # value_and_grad returns both the function value and the gradient.
    gradient = value_and_grad(g_flat)

    best_w = w                # weights with the lowest evaluation so far
    best_eval = g_flat(w)     # lowest evaluation so far
    weight_history = []
    cost_history = []

    for k in range(max_its):
        cost_eval, grad_eval = gradient(w)
        grad_norm = np.linalg.norm(grad_eval)

        if grad_norm == 0:
            # Zero gradient: pick a random sign direction instead.
            grad_eval = 10**-6 * np.sign(2 * np.random.rand(len(w)) - 1)
            grad_norm = np.linalg.norm(grad_eval)
        grad_eval = grad_eval / grad_norm

        w = w - alpha * grad_eval

        # Evaluate once at the new weights; reuse for best-tracking and history.
        test_eval = g_flat(w)
        if test_eval < best_eval:
            best_eval = test_eval
            best_w = w

        weight_history.append(unflatten(w))
        cost_history.append(test_eval)

    # Close the histories with the best weights found.
    weight_history.append(unflatten(best_w))
    cost_history.append(g_flat(best_w))
    return weight_history, cost_history
def gradient_descent(g, alpha, max_its, w, num_pts, train_portion,**kwargs):
    """Gradient descent where a fresh random train/test split is drawn each step.

    ``g`` is evaluated (through its flattened form) as ``g(w, train_inds)``.
    Before the first step and after every step, ``num_pts`` indices are
    randomly permuted; the first ``round(train_portion * num_pts)`` become
    the training indices and the rest the test indices.

    Returns ``(weight_hist, train_ind_hist, test_ind_hist)``, each with
    ``max_its + 1`` entries.  Extra keyword arguments are accepted and ignored.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)

    def draw_split():
        # Random permutation cut into a training head and a test tail.
        train_count = int(np.round(train_portion*num_pts))
        shuffled = np.random.permutation(num_pts)
        return shuffled[:train_count], shuffled[train_count:]

    weight_hist = [unflatten(w)]
    train_inds, test_inds = draw_split()
    train_ind_hist = [train_inds]
    test_ind_hist = [test_inds]

    for _ in range(max_its):
        cost_eval,grad_eval = value_grad(w,train_inds)
        grad_eval.shape = np.shape(w)
        w = w - alpha*grad_eval
        weight_hist.append(unflatten(w))

        # New split for the next step (and for the record).
        train_inds, test_inds = draw_split()
        train_ind_hist.append(train_inds)
        test_ind_hist.append(test_inds)
    return weight_hist,train_ind_hist,test_ind_hist
def adam(grad, init_params, callback=None, num_iters=100, step_size=0.001,
         b1=0.9, b2=0.999, eps=10**-8, m=None, v=None, offset=None):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.

    It's basically RMSprop with momentum and some correction terms.

    :param grad: The gradient function; its flattened form is called as
        ``flattened_grad(x, cur_iter)``.
    :param init_params: The initial parameters.
    :param callback: A callback function to run each iteration, called as
        ``callback(params, cur_iter, grad)`` with unflattened values.
    :param num_iters: The number of iterations to run for.
    :param step_size: The step size.
    :param b1: Exponential decay rate of first moment.
    :param b2: Exponential decay rate of second moment.
    :param eps: Small term added for stability.
    :param m: The current first moment (zeros if None).
    :param v: The current second moment (zeros if None).
    :param offset: What iteration number to start with (0 if None).
    :return: ``(params, m, v, last_iter)`` where ``last_iter`` equals
        ``offset + num_iters - 1``.
    """
    flattened_grad, unflatten, x = flatten_func(grad, init_params)

    if m is None:
        m = np.zeros(len(x))
    if v is None:
        v = np.zeros(len(x))
    if offset is None:
        offset = 0

    for i in range(num_iters):
        cur_iter = i + offset
        g = flattened_grad(x, cur_iter)
        if callback:
            callback(unflatten(x), cur_iter, unflatten(g))

        m = (1 - b1) * g + b1 * m  # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(cur_iter + 1))  # Bias correction.
        vhat = v / (1 - b2**(cur_iter + 1))

        # Rebind instead of updating in place: the flattened vector may share
        # memory with the caller's init_params, which must not be modified.
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)

    # Same value as the loop's final cur_iter, but also defined when
    # num_iters == 0 (where cur_iter would never be assigned).
    return unflatten(x), m, v, offset + num_iters - 1
def newtons_method(g, w, x, y, max_its, **kwargs):
    """Full-batch Newton's method on ``g(w, x, y, inds)``.

    The number of points is ``y.size`` and every evaluation uses all indices.

    Keyword arguments:
    epsilon -- value added to the Hessian diagonal (default 1e-7)

    Returns ``(w_hist, train_hist)``: the unflattened weights and the cost at
    the initial weights and after every step.
    """
    # Flatten the input function in case it takes matrices of weights.
    g_flat, unflatten, w = flatten_func(g, w)

    value_grad = value_and_grad(g_flat)
    hess = hessian(g_flat)

    # Numerical stability / regularisation parameter.
    epsilon = kwargs.get('epsilon', 10**(-7))

    num_train = y.size
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, x, y, np.arange(num_train))]

    for _ in range(max_its):
        cost_eval, grad_eval = value_grad(w, x, y, np.arange(num_train))
        hess_eval = hess(w, x, y, np.arange(num_train))

        # Reshape the Hessian into a square matrix for numpy.linalg.
        side = int((np.size(hess_eval))**(0.5))
        hess_eval.shape = (side, side)

        # Solve A w_new = A w - grad, i.e. the Newton step
        # w_new = w - A^{-1} grad, as a least-squares problem.
        system = hess_eval + epsilon * np.eye(np.size(w))
        w = np.linalg.lstsq(system, np.dot(system, w) - grad_eval)[0]

        # Cost at the updated weights.
        w_hist.append(unflatten(w))
        train_hist.append(g_flat(w, x, y, np.arange(num_train)))
    return w_hist, train_hist
def newtons_method(g, epsilon, max_its, w, num_pts, batch_size, **kwargs):
    """Mini-batch Newton's method that stops once the weights grow too large.

    ``g`` is evaluated (through its flattened form) as ``g(w, batch_inds)``
    over contiguous mini-batches of at most ``batch_size`` indices out of
    ``num_pts``.  ``epsilon`` is added to the Hessian diagonal.  Weights are
    recorded after each epoch; if their norm exceeds 100 the history is
    returned immediately.  Extra keyword arguments are accepted and ignored.
    """
    g_flat, unflatten, w = flatten_func(g, w)
    value_grad = value_and_grad(g_flat)
    hess = hessian(g_flat)

    history = [unflatten(w)]

    # Number of mini-batches needed to cover the dataset once.
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    for _ in range(max_its):
        for batch in range(num_batches):
            batch_inds = np.arange(batch * batch_size,
                                   min((batch + 1) * batch_size, num_pts))

            cost_eval, grad_eval = value_grad(w, batch_inds)
            grad_eval.shape = np.shape(w)

            hess_eval = hess(w, batch_inds)

            # Reshape the Hessian into a square matrix, then regularise it.
            side = int((np.size(hess_eval))**(0.5))
            hess_eval.shape = (side, side)
            hess_eval += epsilon * np.eye(np.size(w))

            # Solve A w_new = A w - grad, i.e. the Newton step
            # w_new = w - A^{-1} grad, as a least-squares problem.
            w = np.linalg.lstsq(hess_eval, np.dot(hess_eval, w) - grad_eval)[0]

        history.append(unflatten(w))

        # Bail out when the iterates blow up.
        if np.linalg.norm(w) > 100:
            return history
    return history
def gradient_descent(self,g,w,alpha,max_its,beta,version,**kwargs):
    """Gradient descent with momentum; returns the weight history.

    g       -- function to minimise
    w       -- initial weights (flattened internally)
    alpha   -- steplength
    max_its -- number of descent steps
    beta    -- momentum coefficient
    version -- 'normalized' divides each gradient by its norm; any other
               value uses the raw gradient

    Keyword arguments:
    verbose -- print start/finish messages; on finish also pauses 1.5 s and
               clears the notebook output (default False)
    """
    verbose = kwargs.get('verbose', False)

    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    history = [unflatten(w)]
    momentum = np.zeros((np.shape(w)))

    if verbose == True:
        print ('starting optimization...')

    for _ in range(max_its):
        direction = grad(w)
        direction.shape = np.shape(w)

        if version == 'normalized':
            norm = np.linalg.norm(direction)
            if norm == 0:
                # Replace a zero norm by +/-1e-6 so the division is defined.
                norm += 10**-6*np.sign(2*np.random.rand(1) - 1)
            direction /= norm

        # Accumulate the direction, then step along it.
        momentum = beta*momentum + direction
        w = w - alpha*momentum
        history.append(unflatten(w))

    if verbose == True:
        print ('...optimization complete!')
        time.sleep(1.5)
        clear_output()
    return history
def gradient_descent(self, g, w, x_train, y_train, lam, alpha_choice, max_its, batch_size):
    """Mini-batch gradient descent; returns weight and training-cost histories.

    g            -- function to minimise, evaluated as
                    g(w, x_train, y_train, lam, inds)
    w            -- initial weights (flattened internally)
    x_train, y_train, lam -- passed through to g unchanged; the number of
                    training points is read from y_train.shape[1]
    alpha_choice -- fixed steplength, or 'diminishing' for 1/(k + 1) at
                    epoch k (counting from 0)
    max_its      -- number of epochs
    batch_size   -- maximum number of indices per mini-batch

    Returns ``(w_hist, train_hist)``, each holding the value at the initial
    weights followed by one entry per epoch.

    The diminishing rule uses k + 1 because the epoch counter starts at 0;
    dividing by k itself raised ZeroDivisionError on the first epoch.
    """
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = value_and_grad(g_flat)

    # record history
    num_train = y_train.shape[1]
    w_hist = [unflatten(w)]
    train_hist = [g_flat(w, x_train, y_train, lam, np.arange(num_train))]

    # how many mini-batches equal the entire dataset?
    num_batches = int(np.ceil(np.divide(num_train, batch_size)))

    for k in range(max_its):
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k + 1)
        else:
            alpha = alpha_choice

        for b in range(num_batches):
            batch_inds = np.arange(b * batch_size,
                                   min((b + 1) * batch_size, num_train))

            cost_eval, grad_eval = grad(w, x_train, y_train, lam, batch_inds)
            grad_eval.shape = np.shape(w)
            w = w - alpha * grad_eval

        # full training cost at the end of the epoch
        train_cost = g_flat(w, x_train, y_train, lam, np.arange(num_train))
        w_hist.append(unflatten(w))
        train_hist.append(train_cost)
    return w_hist, train_hist
def sgd(grad, init_params, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum and a fixed step size.

    The flattened gradient is evaluated as ``flattened_grad(x, i)`` with ``i``
    the iteration index.  If given, ``callback(params, i, grad)`` is called
    with unflattened values before each update.  Returns the final parameters,
    unflattened.
    """
    flat_grad, unflatten, x = flatten_func(grad, init_params)

    vel = np.zeros(len(x))
    for i in range(num_iters):
        g = flat_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))

        # Velocity is a running blend of its past value and the negative gradient.
        vel = mass * vel - (1.0 - mass) * g
        x = x + step_size * vel
    return unflatten(x)
def newtons_method(g,w,x,y,beta,max_its):
    """Newton's method on ``g(w, x, y, beta)``; returns weight and cost histories.

    g       -- function to minimise
    w       -- initial weights (flattened internally)
    x, y, beta -- passed through to g, its gradient and its Hessian unchanged
    max_its -- maximum number of Newton steps

    Each step uses the pseudo-inverse of the Hessian plus 1e-7 times the
    identity.  From the fourth step on (k > 2), a step that increases the
    function value aborts the run: a message is printed and the histories
    collected so far are returned without the offending step.

    Always returns the pair ``(w_hist, g_hist)``.  An earlier revision
    returned only ``w_hist`` on the early-exit path, which broke callers that
    unpack two values.
    """
    # flatten gradient for simpler-written descent loop
    flat_g, unflatten, w = flatten_func(g, w)
    grad = compute_grad(flat_g)
    hess = compute_hess(flat_g)

    # containers for weight and function-value history
    w_hist = [unflatten(w)]
    geval_old = flat_g(w,x,y,beta)
    g_hist = [geval_old]

    epsilon = 10**(-7)
    for k in range(max_its):
        # compute gradient and hessian
        grad_val = grad(w,x,y,beta)
        hess_val = hess(w,x,y,beta)
        hess_val.shape = (np.size(w),np.size(w))

        # Newton step using the pseudo-inverse of the regularised Hessian
        w = w - np.dot(np.linalg.pinv(hess_val + epsilon*np.eye(np.size(w))),grad_val)

        # eject from process if the function value went up
        geval_new = flat_g(w,x,y,beta)
        if k > 2 and geval_new > geval_old:
            print ('singular system reached')
            time.sleep(1.5)
            clear_output()
            return w_hist,g_hist
        geval_old = geval_new

        # record current weights and function value
        w_hist.append(unflatten(w))
        g_hist.append(geval_new)
    return w_hist,g_hist
def gradient_descent(self, g, w_unflat, alpha, max_its, version, **kwargs):
    """Gradient descent on unflattened weights; returns the weight history.

    g        -- function to minimise, taking the unflattened weights
    w_unflat -- initial (unflattened) weights
    alpha    -- steplength
    max_its  -- number of descent steps
    version  -- 'normalized' divides each gradient by its norm; any other
                value uses the raw gradient

    Keyword arguments:
    verbose -- on finish, print a message, pause 1.5 s and clear the notebook
               output (default False)

    The gradient is taken of ``g`` itself and flattened afterwards; the step
    is applied to the flat weight vector and unflattened again for the next
    evaluation.
    """
    verbose = kwargs.get('verbose', False)

    # Flatten the weights; the gradient is computed on the unflattened form.
    g_flat, unflatten, w = flatten_func(g, w_unflat)
    grad = compute_grad(g)

    history = [w_unflat]
    current = w_unflat
    for _ in range(max_its):
        direction, _unused = flatten(grad(current))

        if version == 'normalized':
            norm = np.linalg.norm(direction)
            if norm == 0:
                # Replace a zero norm by +/-1e-6 so the division is defined.
                norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            direction /= norm

        w = w - alpha * direction

        current = unflatten(w)
        history.append(current)

    if verbose == True:
        print('...optimization complete!')
        time.sleep(1.5)
        clear_output()
    return history