def neural_net_train(features, labels, num_iter=2000, opt_method='forward_backward'):
    layer_sizes = [2, 5, 5, 1]
    l2_reg = 2.0
    param_scale = 0.5
    init_params = neural_net_init(param_scale, layer_sizes)

    plain_objective = gen_objective(features, labels, l2_reg)
    objective_grad = auto_grad(plain_objective)

    print(" Iteration| Train accuracy")
    optimized_params = init_params
    gd_step = 0.2
    for i in range(num_iter):
        if opt_method == 'forward_backward':
            # Forward-backward step-size control: grow the step by the golden
            # ratio on improvement; otherwise shrink it and roll back.
            optimized_params_ori = optimized_params
            value_old = plain_objective(optimized_params)
            flattened_grad, unflatten, x = flatten_func(objective_grad, optimized_params)
            x -= flattened_grad(x) * gd_step
            optimized_params = unflatten(x)
            value_new = plain_objective(optimized_params)
            if value_new < value_old:
                gd_step *= 1.618
            else:
                gd_step *= 0.618
                optimized_params = optimized_params_ori
        elif opt_method == 'steepest':
            # Steepest descent with a crude line search: probe up to 10
            # step sizes along the gradient and keep the best one.
            value_old = plain_objective(optimized_params)
            flattened_grad, unflatten, x = flatten_func(objective_grad, optimized_params)
            local_gd_step = gd_step
            best_gd_step = 0.0
            for j in range(10):
                x_test = x - flattened_grad(x) * local_gd_step
                last_optimized_params = unflatten(x_test)
                value_new = plain_objective(last_optimized_params)
                if value_new < value_old:
                    best_gd_step = local_gd_step
                    local_gd_step *= 1.618
                else:
                    local_gd_step *= 0.618
            if auto_np.abs(best_gd_step) < 1e-11:
                # No probed step improved the objective; shrink the base step.
                gd_step *= 0.618
            x -= flattened_grad(x) * best_gd_step
            optimized_params = unflatten(x)
        print_perf(optimized_params, i, features, labels)
    return optimized_params
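
# --- Usage sketch for neural_net_train() (illustrative, not from the
# original source). It assumes the helpers referenced above
# (neural_net_init, gen_objective, auto_grad, print_perf) live in the same
# module; the toy 2-D data simply matches layer_sizes[0] == 2 and the
# single output unit, and the label format is an assumption.
import autograd.numpy as auto_np

features = auto_np.random.randn(200, 2)
labels = (features[:, 0] * features[:, 1] > 0).astype(float).reshape(-1, 1)
params = neural_net_train(features, labels, num_iter=500,
                          opt_method='forward_backward')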
def adam(grad, init_params, callback=None, num_iters=100,
         step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    It's basically RMSprop with momentum and some correction terms."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            es = callback(unflatten(x), i, unflatten(g))
            if es:
                break  # early stop requested by the callback
        m = (1 - b1) * g + b1 * m       # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))    # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return unflatten(x)
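
# --- Usage sketch for the adam() above (not from the original source) ---
# Assumes autograd and its flatten_func are available at module level; the
# toy quadratic and the early-stop callback are illustrative only.
import autograd.numpy as np
from autograd import grad

def adam_demo():
    init_params = {'w': np.ones(3), 'b': 0.0}

    def loss(params, i):
        # i is the iteration index; a real objective might use it to
        # select a minibatch.
        return np.sum(params['w']**2) + params['b']**2

    def stop_when_small(params, i, g):
        # A truthy return value makes adam() above stop early.
        return np.sum(params['w']**2) < 1e-6

    return adam(grad(loss), init_params, callback=stop_when_small, num_iters=500)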
def sgd(grad, init_params, subopt=None, callback=None, break_cond=None,
        num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.
    grad() must have signature grad(x, i), where i is the iteration number."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    # Dynamic step sizes: broadcast a scalar into a per-iteration schedule.
    if np.isscalar(step_size):
        step_size = np.ones(num_iters) * step_size
    assert len(step_size) == num_iters, "step schedule needs to match num iters"
    velocity = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        velocity = mass * velocity - (1.0 - mass) * g
        x = x + step_size[i] * velocity
        if subopt is not None:
            x = subopt(x, g, i)
        if break_cond is not None:
            if break_cond(x, i, g):
                break
    return unflatten(x)
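
# --- Usage sketch for the schedule-aware sgd() above (illustrative only) ---
# It accepts either a scalar step size or a length-num_iters array; the toy
# quadratic and the gradient-norm break condition are assumptions.
import autograd.numpy as np
from autograd import grad

def sgd_schedule_demo():
    init_params = np.ones(5)
    loss = lambda x, i: np.sum(x**2)          # ignore the batch index i
    schedule = 0.1 * 0.99**np.arange(200)     # geometrically decaying steps
    stop = lambda x, i, g: np.linalg.norm(g) < 1e-8
    return sgd(grad(loss), init_params, break_cond=stop,
               num_iters=200, step_size=schedule)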
def adam(grad, init_params, callback=None, num_iters=100,
         step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        m = (1 - b1) * g + b1 * m       # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))    # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return unflatten(x)
def sgd(grad, init_params, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    """Stochastic gradient descent with momentum.
    grad() must have signature grad(x, i), where i is the iteration number."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    velocity = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        velocity = mass * velocity - (1.0 - mass) * g
        x = x + step_size * velocity
    return unflatten(x)
def rmsprop(grad, init_params, callback=None, num_iters=100,
            step_size=0.1, gamma=0.9, eps=10**-8):
    """Root mean squared prop: See Adagrad paper for details."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    avg_sq_grad = np.ones(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        avg_sq_grad = avg_sq_grad * gamma + g**2 * (1 - gamma)
        x = x - step_size * g / (np.sqrt(avg_sq_grad) + eps)
    return unflatten(x)
def batch_adam(grad, init_params, callback=None, max_iters=1e5,
               step_size=0.001, b1=0.9, b2=0.999, eps=10**-8,
               validation_grad=None, stop_criterion=1e-3,
               patience=50, early_stop_freq=1):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    It's basically RMSprop with momentum and some correction terms."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    # Initial settings for variables.
    m, v = np.zeros(len(x)), np.zeros(len(x))
    cur_iter = 0
    reset_patience = patience
    oldg, g = 0, 1
    # Run until max_iters, until consecutive gradients stop changing,
    # or until patience runs out (early stopping).
    while (cur_iter < max_iters) and (l2(oldg - g) > stop_criterion) and (patience > 0):
        oldg = copy(g)                    # save last iteration's gradient
        g = flattened_grad(x, cur_iter)   # pass the iteration index for batch training
        if callback:
            callback(unflatten(x), cur_iter, unflatten(g))
        m = (1 - b1) * g + b1 * m            # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v       # Second moment estimate.
        mhat = m / (1 - b1**(cur_iter + 1))  # Bias correction.
        vhat = v / (1 - b2**(cur_iter + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
        # Check the validation error; keep the best parameters seen so far
        # (in case the optimizer wanders into a bad region).
        if (validation_grad is not None) and \
           (((cur_iter % early_stop_freq) == 0) or (cur_iter + 1 == max_iters)):
            valoss, _ = validation_grad(x)
            if cur_iter == 0:
                best_loss = valoss
                best_x = x
            else:
                if valoss < best_loss:
                    best_loss = valoss
                    best_x = x
                # Update patience: reset on improvement, decrement otherwise.
                patience = patience - 1 if valoss > best_loss else reset_patience
        else:
            # If there is no validation gradient, always keep the latest x.
            best_x = x
        cur_iter += 1
    return unflatten(best_x)
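
# --- Helpers batch_adam() assumes (an illustrative sketch, not the original
# code). batch_adam() relies on copy() from the standard library, an l2()
# norm, and a validation_grad(x) returning a (loss, gradient) pair on the
# flat parameter vector; make_validation_grad is a hypothetical name.
from copy import copy
import autograd.numpy as np
from autograd import value_and_grad

def l2(x):
    # Euclidean norm; also handles the scalar 0/1 warm-up values of oldg and g.
    return np.sqrt(np.sum(np.asarray(x)**2))

def make_validation_grad(flat_val_loss):
    # flat_val_loss(x) evaluates the validation loss at the flat parameters;
    # value_and_grad gives the (loss, gradient) pair batch_adam expects.
    return value_and_grad(flat_val_loss)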
def prepare_updates(self, cost, params, epsilon, grad=None, diag_hess=None,
                    fd_hess=False, A=1, callbacks=[], callback_every=1000,
                    **kwargs):
    self.theta = params
    if grad is not None:
        if diag_hess is None:
            if self.precondition and not fd_hess:
                raise ValueError(
                    "If precondition=True you must also prepare a function for"
                    " computing the diagonal of the Hessian! Alternatively specify"
                    " fd_hess=True, in which case a noisy finite-difference"
                    " approximation will be used; note that this can bias the"
                    " MCMC sampler!")
            else:
                self.flattened_hess = None
        else:
            self.flattened_hess = diag_hess
        self.flattened_grad = grad
        self.unflatten = lambda x: x
    else:
        gradient = autograd.grad(cost)
        self.flattened_grad, self.unflatten, self.theta = flatten_func(gradient, params)
        self.hess = autograd.grad(self.flattened_grad)
        self.flattened_hess = lambda x, *inputs: np.diag(self.hess(x, *inputs)).reshape((-1,))
    self.epsilon = epsilon
    self.A = A
    self.g = np.ones_like(params)
    self.g2 = np.ones_like(params)
    # Note that xi here is not the same as in the thermostat!
    self.xi = np.ones_like(params) * self.A
    self.xi_acc = np.ones_like(params) * self.A
    self.updates = np.zeros_like(params)
    self.count = 1
    self.callback_every = callback_every
    self.callbacks = callbacks

    def Ggrad(*args, **kwargs):
        # Differentiate G elementwise while capturing its auxiliary output.
        saved = lambda: None

        def return_val_save_aux(*args, **kwargs):
            val, saved.aux = G(*args, **kwargs)
            return val

        gradval = elementwise_grad(return_val_save_aux, 0)(*args, **kwargs)
        return gradval, saved.aux

    self.Ggrad = Ggrad
    return self.updates
def adam(grad, init_params, callback=None, num_iters=100,
         step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    It's basically RMSprop with momentum and some correction terms."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        m = (1 - b1) * g + b1 * m       # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))    # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return unflatten(x)
def __init__(self, grad, init_params, callback=None, step_size=0.01,
             b1=0.9, b2=0.999, eps=10**-8):
    self.grad = grad
    self.init_params = copy.copy(init_params)
    self.callback = callback
    self.step_size = step_size
    self.b1 = b1
    self.b2 = b2
    self.eps = eps
    self.flattened_grad, self.unflatten, self.x = flatten_func(
        self.grad, self.init_params)
    self.reset()
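
# The __init__ above ends with self.reset(), which is not shown in this
# snippet. A plausible body (an assumption, not the original code) would
# re-zero the Adam state so the optimizer can be restarted:
#
#     def reset(self):
#         self.m = np.zeros(len(self.x))   # first moment estimate
#         self.v = np.zeros(len(self.x))   # second moment estimate
#         self.t = 0                       # iteration counter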
def sgd(grad, init_params, callback=None, num_iters=200, step_size=0.1, mass=0.9):
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    velocity = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        velocity = mass * velocity - (1.0 - mass) * g
        x = x + step_size * velocity
    return unflatten(x)
def adam(grad, init_params, subopt=None, callback=None, break_cond=None,
         num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam as described in http://arxiv.org/pdf/1412.6980.pdf.
    It's basically RMSprop with momentum and some correction terms."""
    flattened_grad, unflatten, x = flatten_func(grad, init_params)
    # Dynamic step sizes: broadcast a scalar into a per-iteration schedule.
    if np.isscalar(step_size):
        step_size = np.ones(num_iters) * step_size
    assert len(step_size) == num_iters, "step schedule needs to match num iters"
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in range(num_iters):
        g = flattened_grad(x, i)
        if callback:
            callback(unflatten(x), i, unflatten(g))
        m = (1 - b1) * g + b1 * m       # First moment estimate.
        v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
        mhat = m / (1 - b1**(i + 1))    # Bias correction.
        vhat = v / (1 - b2**(i + 1))
        x = x - step_size[i] * mhat / (np.sqrt(vhat) + eps)
        # Optionally refine the updated point with a sub-optimizer
        # (e.g. a line search).
        if subopt is not None:
            x = subopt(x, g, i)
        if break_cond is not None:
            if break_cond(x, i, g):
                break
    return unflatten(x)
def gradient_descent(g, w, alpha, max_its, beta, version):
    # Flatten the objective and parameters, then build its gradient.
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    w_hist = []
    w_hist.append(unflatten(w))
    z = np.zeros((np.shape(w)))  # momentum term

    for k in range(max_its):
        grad_eval = grad(w)
        grad_eval.shape = np.shape(w)

        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                # Nudge a zero gradient off zero with a random sign.
                grad_norm += 10**-6 * np.sign(2 * np.random.rand(1) - 1)
            grad_eval /= grad_norm

        z = beta * z + grad_eval  # momentum update
        w = w - alpha * z
        w_hist.append(unflatten(w))

    return w_hist
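
# --- Usage sketch for gradient_descent() (illustrative, not original code) ---
# compute_grad is assumed to be autograd's grad (a common alias, e.g.
# "from autograd import grad as compute_grad"); the quadratic is a toy example.
import autograd.numpy as np
from autograd import grad as compute_grad

g = lambda w: np.sum(w**2)
w0 = np.array([3.0, -2.0])
history_plain = gradient_descent(g, w0, alpha=0.1, max_its=50,
                                 beta=0.0, version='unnormalized')
history_norm = gradient_descent(g, w0, alpha=0.05, max_its=50,
                                beta=0.9, version='normalized')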
def prepare_updates(self, cost, params, epsilon, grad=None, A=1,
                    callbacks=[], callback_every=1000, **kwargs):
    self.theta = params
    if grad is not None:
        self.flattened_grad = grad
        self.unflatten = lambda x: x
    else:
        gradient = autograd.grad(cost)
        self.flattened_grad, self.unflatten, self.theta = flatten_func(gradient, params)
        self.hess = autograd.grad(self.flattened_grad)
        self.flattened_hess = lambda x, *inputs: np.diag(self.hess(x, *inputs)).reshape((-1,))
    self.epsilon = epsilon
    self.A = A
    self.p = self._srng.normal(size=params.shape)
    self.xi = np.ones_like(params) * self.A
    self.xi_acc = np.ones_like(params) * self.A
    self.updates = np.zeros_like(params)
    self.count = 1
    self.callback_every = callback_every
    self.callbacks = callbacks
    return self.updates
# Fragment: the tail of training_loss(params), a log-posterior for
# Bayesian logistic regression.
    dim = len(params['weights'])
    cov = alpha * np.eye(dim)
    # Gaussian log-density of the bias under variance alpha.
    log_alpha = (-np.log(np.sqrt(2 * np.pi)) - np.log(np.sqrt(alpha))
                 - 0.5 * (params['bias']**2) / alpha)
    # Multivariate Gaussian log-prior on the weights.
    log_prior = (-np.log(np.sqrt(2 * np.pi)) - 0.5 * np.log(np.linalg.det(cov))
                 - 0.5 * np.dot(np.dot(params['weights'].T, np.linalg.inv(cov)),
                                params['weights']))
    # Bernoulli log-likelihood of the predictions.
    log_likelihood = y_train * np.log(y_pred) + (1 - y_train) * np.log(1 - y_pred)
    return np.sum(log_likelihood) + log_prior + log_alpha

# Build a function that returns gradients of training loss using autograd.
init_params = {'weights': np.array(np.ones(x_train.shape[1])), 'bias': 1}
flattened_obj, unflatten, flattened_init_params = flatten_func(
    training_loss, init_params)

# Check the gradients numerically, just to be safe.
training_gradient_fun = grad(flattened_obj)

n_iter = 10000
warmup = 1000
delta = 0.01
path_length = 1.0
n_steps = int(path_length / delta)

import hamiltonian1 as hmc1
import hamiltonian2 as hmc2

print('Descriptors: ' + str(x_n))
print('Params: n_iter: ' + str(n_iter) + ', warmup: ' + str(warmup))
def harmonic_synthesis(source_features, target_features, basis_size=8,
                       gain_penalty=10.0, rate_penalty=0.0, rms_weight=1.0,
                       dissonance_weight=1e-8,  # dissonance values are large
                       debug=False, max_iters=100, **kwargs):
    """Reconstruct audio from descriptors based on approximate matching."""
    if debug:
        from librosa.display import specshow
        import matplotlib.pyplot as plt

    n_fft = source_features['metadata']['n_fft']
    hop_length = source_features['metadata']['hop_length']
    sr = source_features['metadata']['sr']

    gain = np.ones((1, basis_size)) / basis_size
    rate = np.ones((1, basis_size))
    source_length = source_features['peak_f'].shape[1]
    target_peak_f = target_features['peak_f']
    target_peak_power = target_features['peak_power']

    # Pick random source frames to serve as the synthesis basis.
    start_frame = np.random.randint(source_length, size=basis_size)
    source_peak_power = source_features['peak_power'][:, start_frame]
    source_peak_f = source_features['peak_f'][:, start_frame]
    source_power = source_features['rms'][:, start_frame]
    target_power = target_features['rms']

    def reconstruct_peaks(gain, rate):
        reconstruction_peak_power = np.abs(source_peak_power * gain * rate).ravel()
        reconstruction_peak_f = np.abs(source_peak_f * rate).ravel()
        return reconstruction_peak_power, reconstruction_peak_f

    def reconstruct_power(gain, rate):
        return np.abs(gain * rate * source_power).sum()

    def dissonance_loss(gain, rate):
        # reconstruct_peaks() returns (power, frequency), in that order.
        reconstruct_peak_power, reconstruct_peak_f = reconstruct_peaks(gain, rate)
        return v_x_dissonance_sethares(reconstruct_peak_f, target_peak_f,
                                       reconstruct_peak_power, target_peak_power)

    def power_loss(gain, rate):
        return np.abs(reconstruct_power(gain, rate) - target_power)**2

    def reconstruct_loss(gain, rate, rms_weight, dissonance_weight):
        diss_loss = dissonance_loss(gain, rate)
        pow_loss = power_loss(gain, rate)
        total_loss = dissonance_weight * diss_loss + rms_weight * pow_loss
        if debug:
            print('gain', gain, 'rate', rate)
            print('loss', total_loss, 'diss loss', diss_loss, 'power loss', pow_loss)
        return total_loss

    def reconstruct_penalty(gain, rate, gain_penalty, rate_penalty):
        return (gain_penalty * np.abs(gain).mean()
                + rate_penalty * np.abs(np.log2(rate)).mean())

    def objective(gain, rate, rms_weight, dissonance_weight,
                  gain_penalty, rate_penalty):
        return (reconstruct_loss(gain, rate, rms_weight, dissonance_weight)
                + reconstruct_penalty(gain, rate, gain_penalty, rate_penalty))

    def local_objective(params):
        gain, rate = params
        return objective(gain, rate, rms_weight, dissonance_weight,
                         gain_penalty, rate_penalty)

    result = local_objective([gain, rate])
    fun, unflatten, flat_params = flatten_func(local_objective, [gain, rate])
    jac = grad(fun)
    result = minimize(fun, flat_params, method='L-BFGS-B', jac=jac,
                      bounds=[[0, None]] * (basis_size * 2),
                      # callback=callback_fun,
                      options=dict(maxiter=max_iters, disp=True, gtol=1e-3))
    gain, rate = unflatten(result.x)

    return dict(
        start_frame=start_frame.ravel(),
        start_time=librosa.core.frames_to_time(start_frame, sr, hop_length, n_fft).ravel(),
        start_sample=librosa.core.frames_to_samples(start_frame, hop_length, n_fft).ravel(),
        gain=np.sqrt(gain).ravel(),
        rate=rate.ravel(),
    )
# Script fragment: per-epoch statistics, SGD training, then a batched Hessian.
print('{:15}|{:20}|{:20}|'.format(e, te, ve))
if i % 10 == 0:
    print('[%03d][%03d/%03d]' % (e, i % opt['num_batches'], opt['num_batches']))
gc.collect()
p = sgd(objective_grad, p, step_size=lr, num_iters=opt['num_batches'], callback=stats)
print('[opt] ', time.time() - s)
params = p
print('[flat params] ...')
flat_f, unflatten, flat_params = flatten_func(objective, params)
print('[flat hess] ...')
flat_hess = hessian(flat_f)
h = None
print('[compute hess] ...')
# Accumulate the Hessian over a random subset of batches, then average.
for i in np.random.permutation(np.arange(opt['num_batches']))[:opt['hessian_num_batches']]:
    if h is None:
        h = flat_hess(flat_params, i)
    else:
        np.add(h, flat_hess(flat_params, i), h)  # in-place accumulation
    print('[progress] ', i, ' dt: ', time.time() - s)
    gc.collect()
h = h.squeeze() / float(opt['hessian_num_batches'] * opt['batch_size'])
def plot_gaussian_mixture(params, ax):
    # Fragment: inferred header for the plotting helper used by callback() below.
    for log_proportion, mean, cov_sqrt in zip(*unpack_gmm_params(params)):
        alpha = np.minimum(1.0, np.exp(log_proportion) * 10)
        plot_ellipse(ax, mean, cov_sqrt, alpha)


if __name__ == '__main__':
    init_params = init_gmm_params(num_components=10, D=2, scale=0.1)

    data = make_pinwheel(radial_std=0.3, tangential_std=0.05,
                         num_classes=3, num_per_class=100, rate=0.4)

    def objective(params):
        return -gmm_log_likelihood(params, data)

    flattened_obj, unflatten, flattened_init_params = \
        flatten_func(objective, init_params)

    fig = plt.figure(figsize=(12, 8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(flattened_params):
        params = unflatten(flattened_params)
        print("Log likelihood {}".format(-objective(params)))
        ax.cla()
        ax.plot(data[:, 0], data[:, 1], 'k.')
        ax.set_xticks([])
        ax.set_yticks([])
        plot_gaussian_mixture(params, ax)
        plt.draw()
        plt.pause(1.0/60.0)