def test_isinstance():
    """Check ag_isinstance on a dict, both called directly and under grad."""
    def fun(x):
        # The same value must satisfy both the builtin and the ag_ type.
        for expected_type in (dict, ag_dict):
            assert ag_isinstance(x, expected_type)
        return x['x']

    fun({'x': 1.})
    differentiated = grad(fun)
    differentiated({'x': 1.})
def compare_smoother_grads(lds):
    """Compare values and gradients of the two smoother implementations.

    A random ``dotter`` with the structure of the smoother output is
    contracted against the result of ``natural_smoother_general`` and of
    ``_natural_smoother_general``; both the contracted values and their
    gradients with respect to the messages must agree.

    :param lds: triple ``(init_params, pair_params, node_params)``.
    """
    init_params, pair_params, node_params = lds
    # Built as in the original helper; it is not referenced further down.
    symmetrize = make_unop(
        lambda x: (x + x.T)/2. if np.ndim(x) == 2 else x, tuple)

    messages, _ = natural_filter_forward_general(*lds)
    dotter = randn_like(natural_smoother_general(messages, *lds))

    def py_fun(messages):
        smoothed = natural_smoother_general(messages, *lds)
        assert shape(smoothed) == shape(dotter)
        return contract(dotter, smoothed)

    dense_messages, _ = _natural_filter_forward_general(
        init_params, pair_params, node_params)

    def cy_fun(messages):
        raw = _natural_smoother_general(messages, pair_params)
        # Keep only the first three entries of the leading component so the
        # structure lines up with ``dotter``.
        trimmed = raw[0][:3], raw[1], raw[2]
        assert shape(trimmed) == shape(dotter)
        return contract(dotter, trimmed)

    # Function values must agree ...
    value_py = py_fun(messages)
    value_cy = cy_fun(dense_messages)
    assert np.isclose(value_py, value_cy)

    # ... and so must the gradients once the dense ones are unpacked.
    grad_py = grad(py_fun)(messages)
    grad_cy = unpack_dense_messages(grad(cy_fun)(dense_messages))
    assert allclose(grad_py, grad_cy)
def unwrap(self, output, i, *args, **kwargs):
    """Differentiate component ``i`` of ``self.func`` at ``*args, **kwargs``.

    :param output: value used only to decide how to differentiate: a
        non-iterable triggers a single ``grad`` call, an ``np.ndarray``
        triggers one ``grad`` call per array index.
    :param i: index into the value returned by ``self.func``.
    :returns: the gradient for a non-iterable ``output``; for an array
        ``output`` the list of per-index gradients, transposed into a list
        of lists when the individual gradients are themselves iterable.
        Any other iterable ``output`` falls through and yields ``None``.
    """
    if not hasattr(output, '__iter__'):
        def _wrap(*args, **kwargs):
            return self.func(*args, **kwargs)[i]
        return grad(_wrap)(*args, **kwargs)

    if isinstance(output, np.ndarray):
        # One range per array dimension; their product enumerates every index.
        index_ranges = [range(extent) for extent in output.shape]
        J = []
        for idx in product(*index_ranges):
            def _wrap(*args, **kwargs):
                return self.func(*args, **kwargs)[i][idx]
            # Evaluated immediately, so the closure sees the current idx.
            J.append(grad(_wrap)(*args, **kwargs))
        if hasattr(J[0], "__iter__"):
            return list(map(list, zip(*J)))
        return J
def test_fast_conv_grad():
    """Check the gradient of ``fc.convolve`` against central differences.

    The loss ``sum(sin(out) + out**2)`` is differentiated with ``grad`` for
    both the fast convolution and ``auto_convolve``; the fast gradient is then
    compared entry by entry with a numerical central-difference estimate.
    """
    block_size = (11, 11)
    depth = 1
    img = np.random.randn(51, 51, depth)
    # A single 2-D filter is used; the stacked variant that used to be built
    # first was overwritten before any use and has been dropped.
    filt = cv.gauss_filt_2D(shape=block_size, sigma=2)

    def loss_fun(filt):
        out = fc.convolve(filt, img)
        return np.sum(np.sin(out) + out**2)

    loss_fun(filt)
    loss_grad = grad(loss_fun)

    def loss_fun_slow(filt):
        out = auto_convolve(img.squeeze(), filt, mode='valid')
        return np.sum(np.sin(out) + out**2)

    loss_fun_slow(filt)
    loss_grad_slow = grad(loss_fun_slow)

    # compare gradient timing
    loss_grad_slow(filt)
    loss_grad(filt)

    # check numerical gradients
    num_grad = np.zeros(filt.shape)
    # range() instead of xrange(): behaves the same here and exists on
    # both Python 2 and Python 3.
    for i in range(filt.shape[0]):
        for j in range(filt.shape[1]):
            de = np.zeros(filt.shape)
            de[i, j] = 1e-4
            num_grad[i, j] = (loss_fun(filt + de) - loss_fun(filt - de)) / (2*de[i, j])

    assert np.allclose(loss_grad(filt), num_grad), "convolution gradient failed!"
def test_isinstance():
    """Check ag_isinstance on a tuple, both called directly and under grad."""
    def fun(x):
        # The same value must satisfy both the builtin and the ag_ type.
        for expected_type in (tuple, ag_tuple):
            assert ag_isinstance(x, expected_type)
        return x[0]

    fun((1., 2., 3.))
    differentiated = grad(fun)
    differentiated((1., 2., 3.))
def test_isinstance():
    """Check ag_isinstance on a list, both called directly and under grad."""
    def fun(x):
        # The same value must satisfy both the builtin and the ag_ type.
        for expected_type in (list, ag_list):
            assert ag_isinstance(x, expected_type)
        return x[0]

    fun([1., 2., 3.])
    differentiated = grad(fun)
    differentiated([1., 2., 3.])
def test_array_creation():
    """Differentiate through the creation of a large array from a list.

    Per the original note this always passes, but takes ages (about a
    minute) when array creation has the wrong complexity, so it acts as a
    slowness canary rather than as an assertion.
    """
    N = 100000

    def fun(x):
        # N references to the same scalar, packed into one array.
        repeated = [x] * N
        return np.sum(np.array(repeated))

    grad(fun)(1.0)
def test_sub():
    """Gradient-check ``x - y`` and its derivative in each argument."""
    def fun(x, y):
        return to_scalar(x - y)

    def d_fun_0(x, y):
        return to_scalar(grad(fun, 0)(x, y))

    def d_fun_1(x, y):
        return to_scalar(grad(fun, 1)(x, y))

    for arg1, arg2 in arg_pairs():
        for candidate in (fun, d_fun_0, d_fun_1):
            check_grads(candidate, arg1, arg2)
def peakmem_needless_nodes():
    """Differentiate a chain of M additions applied to an N x N zero array."""
    N, M = 1000, 100

    def fun(x):
        remaining = M
        while remaining > 0:
            x = x + 1
            remaining -= 1
        return np.sum(x)

    start = np.zeros((N, N))
    grad(fun)(start)
def check_fft_n(fft_fun, D, n):
    """Gradient-check ``fft_fun(x, D + n)`` on a random D x D input.

    Also asserts that the gradient has the same shape as the input.
    """
    def fun(x):
        return to_scalar(fft_fun(x, D + n))

    def d_fun(x):
        return to_scalar(grad(fun)(x))

    mat = match_complex(fft_fun, npr.randn(D, D))
    gradient_shape = grad(fun)(mat).shape
    assert_array_equal(gradient_shape, mat.shape)
    for candidate in (fun, d_fun):
        check_grads(candidate, mat)
def test_return_both():
    """``return_function_value=True`` must give the value and the gradient."""
    def fun(x):
        return 3.0 * np.sin(x)

    d_fun = grad(fun)
    f_and_d_fun = grad(fun, return_function_value=True)

    test_x = npr.randn()
    value, derivative = f_and_d_fun(test_x)
    assert value == fun(test_x)
    assert derivative == d_fun(test_x)
def fan_out_fan_in():
    """The 'Pearlmutter test': time grad of 10**4 repeated (x + x)/2 steps."""
    steps = 10**4

    def fun(x):
        for _ in range(steps):
            # x feeds both operands, then the two paths merge again.
            x = (x + x)/2.0
        return np.sum(x)

    with tictoc():
        grad(fun)(1.0)
def test_add():
    """Gradient-check ``x + y`` and its derivative in each argument.

    Mirrors the sibling operator tests (subtraction, power, division): the
    leftover debug ``print`` of the argument types has been removed so the
    test no longer writes to stdout on every argument pair.
    """
    fun = lambda x, y: to_scalar(x + y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    for arg1, arg2 in arg_pairs():
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
def check_binary_func(fun):
    """Compare ``grad`` of a binary function with ``nd`` in each argument.

    The comparison is made at the fixed point (0.7, 1.8).
    """
    x, y = 0.7, 1.8

    # Derivative with respect to the first argument, second held fixed.
    from_grad = grad(fun)(x, y)
    from_nd = nd(lambda first: fun(first, y), x)
    check_close(from_grad, from_nd)

    # Derivative with respect to the second argument, first held fixed.
    from_grad = grad(fun, 1)(x, y)
    from_nd = nd(lambda second: fun(x, second), y)
    check_close(from_grad, from_nd)
def test_nested_higher_order():
    """Gradient-check a function that itself calls grad on a tuple input."""
    def outer_fun(x):
        def inner_fun(y):
            return y[0] * y[1]

        inner_gradient = grad(inner_fun)(ag_tuple((x, x)))
        return np.sum(np.sin(np.array(inner_gradient)))

    first_derivative = grad(outer_fun)
    second_derivative = grad(first_derivative)
    check_grads(outer_fun)(5.)
    check_grads(first_derivative)(10.)
    check_grads(second_derivative)(10.)
def test_nograd():
    """grad of a function built on np.allclose must raise TypeError."""
    # we want this to raise non-differentiability error
    def fun(x):
        return np.allclose(x, (x*3.0)/3.0)

    try:
        grad(fun)(np.array([1., 2., 3.]))
    except TypeError:
        return
    raise Exception('Expected non-differentiability exception')
def test_return_both():
    """``return_function_value=True`` must give the value and the gradient."""
    def fun(x):
        return 3.0 * x**3.2

    d_fun = grad(fun)
    f_and_d_fun = grad(fun, return_function_value=True)

    test_x = 1.7
    value, derivative = f_and_d_fun(test_x)
    assert value == fun(test_x)
    assert derivative == d_fun(test_x)
def test_third_derivative():
    """Gradient-check a scalar function and its first three derivatives.

    Each derivative is built from the previous one. Before, ``ddf`` and
    ``dddf`` were both ``grad(fun)``, so only the first derivative was ever
    exercised despite the test's name.
    """
    fun = lambda x: np.sin(np.sin(x) + np.sin(x))
    df = grad(fun)
    ddf = grad(df)
    dddf = grad(ddf)
    check_grads(fun, npr.randn())
    check_grads(df, npr.rand())
    check_grads(ddf, npr.rand())
    check_grads(dddf, npr.rand())
def test_third_derivative_other_args2():
    """Gradient-check mixed derivatives of a two-argument function.

    The derivatives are chained with the original argnum pattern (1, 0, 1):
    ``df`` differentiates ``fun`` in its second argument, ``ddf``
    differentiates ``df`` in its first, and ``dddf`` differentiates ``ddf``
    in its second. Before, ``ddf`` and ``dddf`` were taken from ``fun``
    directly, so no second or third derivative was ever built.
    """
    fun = lambda x, y: np.sin(np.sin(x) + np.sin(y))
    df = grad(fun, 1)
    ddf = grad(df)
    dddf = grad(ddf, 1)
    check_grads(fun, npr.randn(), npr.randn())
    check_grads(df, npr.randn(), npr.randn())
    check_grads(ddf, npr.randn(), npr.randn())
    check_grads(dddf, npr.randn(), npr.randn())
def check_binary_func(fun, independent=False):
    """Compare ``grad`` of a binary function with ``nd`` in each argument.

    The comparison is made at the fixed point (0.7, 1.8) inside a
    ``warnings.catch_warnings`` context.

    :param fun: function of two scalar arguments.
    :param independent: forwarded as ``record`` to ``catch_warnings``; when
        true, warnings raised during the checks are captured instead of
        being emitted.
    """
    # ``record`` is keyword-only on Python 3; passing it positionally raises
    # TypeError there, so it is named explicitly.
    with warnings.catch_warnings(record=independent):
        x, y = 0.7, 1.8
        a = grad(fun)(x, y)
        b = nd(lambda x: fun(x, y), x)
        check_close(a, b)

        a = grad(fun, 1)(x, y)
        b = nd(lambda y: fun(x, y), y)
        check_close(a, b)
def test_power_arg0():
    """Gradient-check np.power in its first argument, plus the y == 0 case."""
    def make_fun(y):
        return lambda x: np.power(x, y)

    # the +1.'s here are to avoid regimes where numerical diffs fail
    exponent = npr.randn()**2 + 1.
    base = npr.rand()**2 + 1.
    check_grads(make_fun(exponent))(base)

    # test y == 0. as a special case, c.f. #116
    constant_fun = make_fun(0.)
    assert grad(constant_fun)(0.) == 0.
    assert grad(grad(constant_fun))(0.) == 0.
def test_dtypes():
    """The gradient of sum(x**2) must keep the float dtype of its input."""
    def f(x):
        return np.sum(x**2)

    # Same check for each reduced-precision float type, float32 first.
    for float_type in (np.float32, np.float16):
        y = np.random.randn(10, 10).astype(float_type)
        assert grad(f)(y).dtype.type is float_type
def test_pow():
    """Gradient-check ``x ** y`` and its derivative in each argument."""
    def fun(x, y):
        return to_scalar(x ** y)

    def d_fun_0(x, y):
        return to_scalar(grad(fun, 0)(x, y))

    def d_fun_1(x, y):
        return to_scalar(grad(fun, 1)(x, y))

    def make_positive(x):
        # Numeric derivatives fail near zero
        return np.abs(x) + 1.1

    for arg1, arg2 in arg_pairs():
        # Only the base is shifted; the exponent is used as generated.
        arg1 = make_positive(arg1)
        for candidate in (fun, d_fun_0, d_fun_1):
            check_grads(candidate, arg1, arg2)
def test_div():
    """Gradient-check ``x / y`` and its derivative in each argument."""
    def fun(x, y):
        return to_scalar(x / y)

    def d_fun_0(x, y):
        return to_scalar(grad(fun, 0)(x, y))

    def d_fun_1(x, y):
        return to_scalar(grad(fun, 1)(x, y))

    def make_gap_from_zero(x):
        # sqrt(x**2 + 0.5) is bounded away from zero for real x.
        return np.sqrt(x **2 + 0.5)

    for arg1, arg2 in arg_pairs():
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        for candidate in (fun, d_fun_0, d_fun_1):
            check_grads(candidate, arg1, arg2)
def test_compute_stats_grad():
    """Compare grad of compute_stats with _compute_stats_grad.

    Runs the same comparison twice: once with the last argument set to
    True / 1. and once with False / 0.
    """
    F = make_unop(lambda x: np.require(x, np.double, 'F'), tuple)

    # (flag used to shape the dotter, numeric value fed through grad)
    for flag, flag_value in ((True, 1.), (False, 0.)):
        dotter = F(randn_like(compute_stats(Ex, ExxT, ExnxT, flag)))
        g1 = grad(lambda x: contract(dotter, compute_stats(*x)))(
            (Ex, ExxT, ExnxT, flag_value))
        g2 = _compute_stats_grad(dotter)
        # Only the first three gradient entries are compared.
        assert allclose(g1[:3], g2)
def test_mod():
    """Gradient-check ``x % y`` and its derivative in each argument."""
    def fun(x, y):
        return to_scalar(x % y)

    def d_fun_0(x, y):
        return to_scalar(grad(fun, 0)(x, y))

    def d_fun_1(x, y):
        return to_scalar(grad(fun, 1)(x, y))

    def make_gap_from_zero(x):
        return np.sqrt(x **2 + 0.5)

    for arg1, arg2 in arg_pairs():
        if arg1 is arg2:
            # Gradient undefined at x == y
            continue
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        for candidate in (fun, d_fun_0, d_fun_1):
            check_grads(candidate, arg1, arg2)
def test_hess_vector_prod():
    """Gradient-check a function and a product of its gradient with a vector."""
    npr.seed(1)
    randv = npr.randn(10)

    def fun(x):
        return np.sin(np.dot(x, randv))

    df = grad(fun)

    def vector_product(x, v):
        return np.sin(np.dot(v, df(x)))

    # Constructed as in the original; it is not used by the checks below.
    ddf = grad(vector_product)

    A, B = npr.randn(10), npr.randn(10)
    check_grads(fun, A)
    check_grads(vector_product, A, B)
def test_slices():
    """grad must work through slice objects and slice syntax on a list."""
    def via_slice_object(x):
        return x[slice(None, -1, None)][0]

    grad(via_slice_object)([1., 2., 3.])

    def via_slice_syntax(x):
        return x[1:3][0]

    grad(via_slice_syntax)([1., 2., 3.])
def test_checkpoint_correctness():
    """Checkpointed functions must match the plain ones in value and grad."""
    def bar(x, y):
        return 2*x + y + 5

    checkpointed_bar = checkpoint(bar)

    def foo(x):
        return bar(x, x/3.) + bar(x, x**2)

    def foo2(x):
        return checkpointed_bar(x, x/3.) + checkpointed_bar(x, x**2)

    assert np.allclose(foo(3.), foo2(3.))
    assert np.allclose(grad(foo)(3.), grad(foo2)(3.))

    # Same comparison for a function taking *args.
    def baz(*args):
        return sum(args)

    checkpointed_baz = checkpoint(baz)

    def foobaz(x):
        return baz(x, x/3.)

    def foobaz2(x):
        return checkpointed_baz(x, x/3.)

    assert np.allclose(foobaz(3.), foobaz2(3.))
    assert np.allclose(grad(foobaz)(3.), grad(foobaz2)(3.))
def test_jacobian_against_stacked_grads():
    """The jacobian of a stacked function must equal its stacked gradients."""
    def cubic_sum(x):
        return np.sum(x ** 3)

    def sine_product(x):
        return np.prod(np.sin(x) + np.sin(x))

    def nested_grad(x):
        return grad(lambda y: np.exp(y) * np.tanh(x[0]))(x[1])

    scalar_funs = [cubic_sum, sine_product, nested_grad]

    def vector_fun(x):
        return np.array([f(x) for f in scalar_funs])

    x = npr.randn(5)
    jac = jacobian(vector_fun)(x)
    stacked = np.vstack([grad(f)(x) for f in scalar_funs])
    assert np.allclose(jac, stacked)
def burer_monteiro(target_choi, n_decomp, rank, n_qubits, initial_guess=None, cfac_tol=1.):
    """Fit a signed sum of factorised matrices to ``target_choi``.

    The unknowns are ``2 * n_decomp`` complex factors ``Y`` of shape
    ``(rank, choi_dim)`` (a "pos" and a "neg" group) and one scalar
    coefficient per factor. With ``C = conj(Y.T) @ Y`` the routine minimises
    the squared norm of the residuals

    * ``partial_trace(C) - a * identity`` for every factor, and
    * ``sum(C_pos) - sum(C_neg) - target_choi``,

    using ``minimize(..., method='trust-constr')`` under a linear constraint
    that bounds the sum of all coefficients between ``1.`` and ``cfac_tol``.
    The optimisation is skipped when the starting point already satisfies
    the residuals and has a coefficient 1-norm of 1 (both to 1e-8).

    NOTE(review): the names suggest Choi matrices of quantum channels and a
    trace-preservation ("TP") condition -- confirm against the caller.

    :param target_choi: square array; its first dimension is ``choi_dim``.
    :param n_decomp: number of factors in each of the pos and neg groups.
    :param rank: number of rows of every factor ``Y``.
    :param n_qubits: ``2**n_qubits`` is the subsystem dimension handed to the
        partial-trace helpers and the size of the identity in the residual.
    :param initial_guess: optional mapping with keys ``"Y_pos"``, ``"Y_neg"``,
        ``"a_pos"`` and ``"a_neg"``, each indexable by ``range(n_decomp)``.
        When ``None`` a random starting point is drawn.
    :param cfac_tol: upper bound of the linear constraint on the sum of all
        coefficients (the lower bound is ``1.``).
    :returns: ``(coefficients, matrices)`` where ``coefficients`` is the pos
        coefficients followed by the neg ones and ``matrices`` is the pos
        ``C`` matrices followed by the neg ones.
    """
    choi_dim = target_choi.shape[0]

    # expanding and flattening
    # Number of real entries needed for the real (or imaginary) part of one Y.
    entries = choi_dim * rank

    def extract_matrix(x):
        # First `entries` values are the real part, the next `entries` the
        # imaginary part of one (rank, choi_dim) matrix.
        mat_re = x[0:entries].reshape((rank, choi_dim))
        mat_im = x[entries:2 * entries].reshape((rank, choi_dim))
        return mat_re + 1j * mat_im

    # Length of one flattened complex matrix (real + imaginary parts).
    matlen = 2 * entries

    def expand(x):
        # Inverse of flatten(): layout is [Y_pos..., Y_neg..., a_pos..., a_neg...].
        arr_Y_pos = list()
        arr_Y_neg = list()
        arr_a_pos = list()
        arr_a_neg = list()
        for i in range(n_decomp):
            arr_Y_pos.append(extract_matrix(x[matlen * i:matlen * (i + 1)]))
        for i in range(n_decomp):
            arr_Y_neg.append(
                extract_matrix(x[matlen * (n_decomp + i):matlen * (n_decomp + i + 1)]))
        for i in range(n_decomp):
            arr_a_pos.append(x[matlen * 2 * n_decomp + i])
            arr_a_neg.append(x[matlen * 2 * n_decomp + n_decomp + i])
        return arr_Y_pos, arr_Y_neg, arr_a_pos, arr_a_neg

    def flatten_matrix(mat):
        # Real part first, then imaginary part, each flattened.
        mat_re = np.real(mat).flatten()
        mat_im = np.imag(mat).flatten()
        return [mat_re, mat_im]

    def flatten(arr_Y_pos, arr_Y_neg, arr_a_pos, arr_a_neg):
        # Pack all unknowns into one real vector in the layout expand() reads.
        tot_list = list()
        for Y in arr_Y_pos:
            tot_list += flatten_matrix(Y)
        for Y in arr_Y_neg:
            tot_list += flatten_matrix(Y)
        tot_list += arr_a_pos
        tot_list += arr_a_neg
        return np.hstack(tot_list)

    # optimization function
    def loss(x):
        # 1-norm of all coefficients; only used for the feasibility shortcut below.
        arr_Y_pos, arr_Y_neg, arr_a_pos, arr_a_neg = expand(x)
        return np.sum(np.abs(arr_a_pos)) + np.sum(np.abs(arr_a_neg))

    def constraint(x):
        # Residual vector (real parts followed by imaginary parts).
        arr_Y_pos, arr_Y_neg, arr_a_pos, arr_a_neg = expand(x)
        arr_C_pos = list()
        arr_C_neg = list()

        def conj(z):
            # Complex conjugate spelled out via real/imag parts.
            return np.real(z) - 1j * np.imag(z)

        for i in range(n_decomp):
            arr_C_pos.append(conj(arr_Y_pos[i].T) @ arr_Y_pos[i])
            arr_C_neg.append(conj(arr_Y_neg[i].T) @ arr_Y_neg[i])
        retvec = np.array([])
        # TP constraint
        for i in range(n_decomp):
            pt = anp_partial_trace(arr_C_pos[i], [2**n_qubits, 2**n_qubits], 1)
            vec = (pt - arr_a_pos[i] * np.identity(2**n_qubits)).flatten()
            retvec = np.hstack([retvec, vec])
            pt = anp_partial_trace(arr_C_neg[i], [2**n_qubits, 2**n_qubits], 1)
            vec = (pt - arr_a_neg[i] * np.identity(2**n_qubits)).flatten()
            retvec = np.hstack([retvec, vec])
        # equality constraint
        C_sum = np.zeros_like(target_choi)
        for i in range(n_decomp):
            C_sum += arr_C_pos[i] - arr_C_neg[i]
        vec = (C_sum - target_choi).flatten()
        retvec = np.hstack([retvec, vec])
        # separate complex and real part
        retvec = np.hstack([np.real(retvec), np.imag(retvec)])
        return retvec

    # NOTE(review): neither of these two is referenced again in this function.
    constraint_jac = autograd.jacobian(constraint)
    constraint_hess = autograd.hessian(lambda x, v: np.dot(constraint(x), v), argnum=0)

    # initial guess
    # Scalar z for which z * identity is closest to the partial trace of the target.
    res = minimize(
        lambda z: np.linalg.norm(
            np_partial_trace(target_choi, [2**n_qubits, 2**n_qubits], 1).data -
            z * np.eye(2**n_qubits)), [1.])
    scale = res.x
    #assert res.fun < 1e-6
    arr_Y_pos = list()
    arr_Y_neg = list()
    arr_a_pos = list()
    arr_a_neg = list()
    if initial_guess is not None:
        for i in range(n_decomp):
            arr_Y_pos.append(initial_guess["Y_pos"][i])
            arr_Y_neg.append(initial_guess["Y_neg"][i])
            arr_a_pos.append(initial_guess["a_pos"][i])
            arr_a_neg.append(initial_guess["a_neg"][i])
    else:
        # Random complex Gaussian factors and uniform coefficients, all scaled.
        for i in range(n_decomp):
            arr_Y_pos.append(scale * np.random.normal(size=(rank, choi_dim)) +
                             1j * scale * np.random.normal(size=(rank, choi_dim)))
            arr_Y_neg.append(scale * np.random.normal(size=(rank, choi_dim)) +
                             1j * scale * np.random.normal(size=(rank, choi_dim)))
            arr_a_pos.append(scale * np.random.uniform())
            arr_a_neg.append(scale * np.random.uniform())
    x0 = flatten(arr_Y_pos, arr_Y_neg, arr_a_pos, arr_a_neg)
    len_x0 = x0.shape[0]  # NOTE(review): not used below

    # check flatten+expand
    Yp, Yn, ap, an = expand(x0)
    for i in range(n_decomp):
        assert np.linalg.norm(arr_Y_pos[i] - Yp[i]) < 1e-10
        assert np.linalg.norm(arr_Y_neg[i] - Yn[i]) < 1e-10
        assert np.linalg.norm(arr_a_pos[i] - ap[i]) < 1e-10
        assert np.linalg.norm(arr_a_neg[i] - an[i]) < 1e-10

    # solve
    def new_loss(x):
        # Sum of squared residuals: zero exactly when every residual vanishes.
        return np.sum(np.square(constraint(x)))

    new_loss_grad = autograd.grad(new_loss)

    # Row vector selecting (summing) all coefficients, built dense and sparse;
    # the two constructions are cross-checked against each other.
    lc_mat_dense = np.zeros((1, x0.shape[0]))
    lc_mat_dense[0, matlen * 2 * n_decomp:] = np.ones((n_decomp * 2))
    indices_x = np.zeros((n_decomp * 2))
    indices_y = list(range(matlen * 2 * n_decomp, x0.shape[0]))
    vals = np.ones((n_decomp * 2))
    lc_mat = scipy.sparse.csr_matrix((vals, (indices_x, indices_y)), shape=(1, x0.shape[0]))
    assert np.linalg.norm(lc_mat_dense - lc_mat.toarray()) < 1e-10

    if np.max(np.abs(constraint(x0))) < 1e-8 and np.abs(loss(x0) - 1.) < 1e-8:
        # Starting point is already feasible with unit 1-norm: skip the solver.
        res = OptimizeResult()
        res.x = x0
    else:
        con = LinearConstraint(lc_mat, 1., cfac_tol)
        res = minimize(new_loss,
                       x0,
                       jac=new_loss_grad,
                       constraints=con,
                       options={
                           "verbose": 0,
                           "maxiter": 10000000,
                           "gtol": 1e-12,
                           "xtol": 1e-16
                       },
                       method='trust-constr')
    #assert np.max(np.abs(constraint(res.x))) < 1e-6

    # return
    arr_Y_pos, arr_Y_neg, arr_a_pos, arr_a_neg = expand(res.x)
    arr_C_pos = list()
    arr_C_neg = list()
    for i in range(n_decomp):
        arr_C_pos.append(np.conj(arr_Y_pos[i].T) @ arr_Y_pos[i])
        arr_C_neg.append(np.conj(arr_Y_neg[i].T) @ arr_Y_neg[i])
    # Both results are plain list concatenations: pos entries, then neg entries.
    return arr_a_pos + arr_a_neg, arr_C_pos + arr_C_neg
def gradient(objective, argument):
    """Return a function computing the gradient of ``objective``.

    The gradient is taken with respect to the first argument of
    ``objective``. ``argument`` is accepted but not used here.
    """
    gradient_function = ad.grad(objective)
    return gradient_function
#etas = np.arange(-0.8, 1.2, 0.4) #pts = np.array((3.,7.,15.,20.)) etas = np.array((-0.8, 0.8)) pts = np.array((3., 20.)) #phis = np.arange(-np.pi, np.pi+2.*np.pi/6.,2.*np.pi/6.) #etas = np.array((-0.8,-0.4)) phis = np.array((-np.pi, np.pi)) x = defineState(len(etas) - 1, len(phis) - 1, datasetJ) print "minimising" xtol = np.finfo('float64').eps grad = grad(nllJ) hess = hessian(nllJ) btol = 1.e-8 #lb = [0.999,0.999,0.999,0.999,-0.01,-0.01,-0.01,-0.01,-1e-4,-1e-4,-1e-4,-1e-4,0.] lb = [0.999, -0.01, -1e-4, 0., 0.] #ub = [1.001,1.001,1.001,1.001,0.01,0.01,0.01,0.01,1e-4,1e-4,1e-4,1e-4,100] ub = [1.001, 0.01, 1e-4, 100., 1e9] constraints = LinearConstraint(A=np.eye(x.shape[0]), lb=lb, ub=ub, keep_feasible=True)
def fit(self, X, B, T, W=None):
    '''Fits the model.

    :param X: numpy matrix of shape :math:`k \\cdot n`
    :param B: numpy vector of shape :math:`n`
    :param T: numpy vector of shape :math:`n`
    :param W: (optional) numpy vector of shape :math:`n`

    NOTE(review): the code takes ``len(X)`` as the number of rows to
    match against ``B``/``T``/``W`` and ``X.shape[1]`` as the number of
    features, i.e. it treats ``X`` as one row per observation.

    Rows with ``T <= 0``, ``B`` outside ``[0, 1]`` or ``W < 0`` are dropped
    with a warning. The parameters are then found by minimising the negated
    ``generalized_gamma_loss`` with SLSQP, and, when ``self._ci`` is set,
    sampled with ``emcee`` starting near that optimum.

    The result is stored in ``self.params``: a dict with key ``'map'`` (and
    ``'samples'`` when sampling ran), each holding the entries ``'k'``,
    ``'p'``, ``'a'``, ``'b'``, ``'alpha'`` and ``'beta'``.

    :raises Exception: if the optimizer reports failure.
    '''

    if W is None:
        # Default to unit weight for every row.
        W = numpy.ones(len(X))
    X, B, T, W = (Z if type(Z) == numpy.ndarray else numpy.array(Z)
                  for Z in (X, B, T, W))
    keep_indexes = (T > 0) & (B >= 0) & (B <= 1) & (W >= 0)
    if sum(keep_indexes) < X.shape[0]:
        n_removed = X.shape[0] - sum(keep_indexes)
        warnings.warn('Warning! Removed %d/%d entries from inputs where '
                      'T <= 0 or B not 0/1 or W < 0' % (n_removed, len(X)))
        X, B, T, W = (Z[keep_indexes] for Z in (X, B, T, W))
    n_features = X.shape[1]

    # scipy.optimize and emcee force the parameters to be a vector:
    # (log k, log p, log sigma_alpha, log sigma_beta,
    #  a, b, alpha_1...alpha_k, beta_1...beta_k)
    # Generalized Gamma is a bit sensitive to the starting point!
    x0 = numpy.zeros(6 + 2 * n_features)
    x0[0] = +1 if self._fix_k is None else log(self._fix_k)
    x0[1] = -1 if self._fix_p is None else log(self._fix_p)
    args = (X, B, T, W, self._fix_k, self._fix_p,
            self._hierarchical, self._flavor)

    # Set up progressbar and callback
    bar = progressbar.ProgressBar(widgets=[
        progressbar.Variable('loss', width=15, precision=9), ' ',
        progressbar.BouncingBar(), ' ',
        progressbar.Counter(width=6),
        ' [', progressbar.Timer(), ']'])

    # The mutable default is deliberate: value_history persists across calls
    # of this closure and its length drives the progress counter.
    def callback(LL, value_history=[]):
        value_history.append(LL)
        bar.update(len(value_history), loss=LL)

    # Define objective and use automatic differentiation
    # (the gradient is built without the callback, so only objective
    # evaluations update the progress bar)
    f = lambda x: -generalized_gamma_loss(x, *args, callback=callback)
    jac = autograd.grad(lambda x: -generalized_gamma_loss(x, *args))

    # Find the maximum a posteriori of the distribution
    res = scipy.optimize.minimize(f, x0, jac=jac, method='SLSQP',
                                  options={'maxiter': 9999})
    if not res.success:
        raise Exception('Optimization failed with message: %s' %
                        res.message)
    result = {'map': res.x}

    # TODO: should not use fixed k/p as search parameters
    # NOTE(review): these are truthiness tests, whereas x0 above was set with
    # `is None` tests -- the two differ for a fixed value of 0.
    if self._fix_k:
        result['map'][0] = log(self._fix_k)
    if self._fix_p:
        result['map'][1] = log(self._fix_p)

    # Make sure we're in a local minimum
    gradient = jac(result['map'])
    # This is the squared Euclidean norm of the gradient.
    gradient_norm = numpy.dot(gradient, gradient)
    if gradient_norm >= 1e-2 * len(X):
        warnings.warn('Might not have found a local minimum! '
                      'Norm of gradient is %f' % gradient_norm)

    # Let's sample from the posterior to compute uncertainties
    if self._ci:
        dim, = res.x.shape
        n_walkers = 5 * dim
        sampler = emcee.EnsembleSampler(
            nwalkers=n_walkers,
            dim=dim,
            lnpostfn=generalized_gamma_loss,
            args=args,
        )
        # Walkers start in a small Gaussian ball around the optimum.
        mcmc_initial_noise = 1e-3
        p0 = [result['map'] + mcmc_initial_noise * numpy.random.randn(dim)
              for i in range(n_walkers)]
        n_burnin = 100
        # NOTE(review): numpy.ceil returns a float, so n_iterations is a
        # float when handed to the sampler and the progress bar.
        n_steps = numpy.ceil(2000. / n_walkers)
        n_iterations = n_burnin + n_steps

        bar = progressbar.ProgressBar(max_value=n_iterations, widgets=[
            progressbar.Percentage(), ' ', progressbar.Bar(),
            ' %d walkers [' % n_walkers,
            progressbar.AdaptiveETA(), ']'])
        for i, _ in enumerate(sampler.sample(p0, iterations=n_iterations)):
            bar.update(i + 1)
        # Drop the burn-in steps, merge all walkers and put parameters on
        # the first axis.
        result['samples'] = sampler.chain[:, n_burnin:, :] \
            .reshape((-1, dim)).T
        if self._fix_k:
            result['samples'][0, :] = log(self._fix_k)
        if self._fix_p:
            result['samples'][1, :] = log(self._fix_p)

    # Unpack the flat parameter vector (see the layout comment above);
    # entries 2 and 3 (the log sigmas) are not stored.
    self.params = {k: {
        'k': exp(data[0]),
        'p': exp(data[1]),
        'a': data[4],
        'b': data[5],
        'alpha': data[6:6 + n_features].T,
        'beta': data[6 + n_features:6 + 2 * n_features].T,
    } for k, data in result.items()}
def get_batch_lower_bound(cur_params, iter):
    # Objective for one minibatch: mean of (entropy estimate + log-likelihood)
    # over the samples drawn for batch number `iter`.
    # Split the flat parameter vector into its three named groups.
    encoder_weights = combined_parser.get(cur_params, 'encoder weights')
    flow_params = combined_parser.get(cur_params, 'flow params')
    decoder_weights = combined_parser.get(cur_params, 'decoder weights')

    cur_data = train_images[batch_idxs[iter]]
    mus, log_sigs = encoder(encoder_weights, cur_data)
    # The sampler is given exp(log_sigs), i.e. the un-logged values.
    samples, entropy_estimates = flow_sampler(flow_params, mus, np.exp(log_sigs), rs)
    loglikes = decoder_log_like(decoder_weights, samples, cur_data)

    # `.value` is read off each mean before printing
    # NOTE(review): presumably unwraps the traced value -- confirm.
    print "Iter", iter, "loglik:", np.mean(loglikes).value, \
        "entropy:", np.mean(entropy_estimates).value, "marg. like:", np.mean(entropy_estimates + loglikes).value
    return np.mean(entropy_estimates + loglikes)

# Gradient of the batch objective with respect to cur_params.
lb_grad = grad(get_batch_lower_bound)

def callback(weights, iter, grad):
    # Decode 100 random latent vectors and save them as an image grid.
    # `iter` and `grad` are part of the optimizer's callback signature and
    # are not used here.
    #Generate samples
    num_samples = 100
    zs = rs.randn(num_samples, latent_dimension)
    samples = decoder(combined_parser.get(weights, 'decoder weights'), zs)
    fig = plt.figure(1)
    fig.clf()
    ax = fig.add_subplot(111)
    plot_images(samples, ax, ims_per_row=10)
    plt.savefig('samples.png')

# Run the optimizer on the gradient function, starting from combined_params.
final_params = adam(lb_grad, combined_params, num_training_iters, callback=callback)

finish_time = time.time()
def test_angle_real():
    """Gradient-check np.angle and its derivative at random real points."""
    def fun(x):
        return np.angle(x)

    def d_fun(x):
        return grad(fun)(x)

    for candidate in (fun, d_fun):
        check_grads(candidate)(npr.rand())
def build_branin_objective(D=100):
    """Bundle ``branin`` with its gradient and a sliced HVP.

    :param D: passed through unchanged as the first element of the result.
    :returns: ``(D, branin, obj_grad, obj_hvp, {})``.
    """
    obj_grad = grad(branin)
    return D, branin, obj_grad, sliced_hvp(obj_grad), {}
def test_abs_complex():
    """Gradient-check np.abs and its derivative at fixed complex points."""
    def fun(x):
        return np.abs(x)

    def d_fun(x):
        return grad(fun)(x)

    check_grads(fun)(1.1 + 1.2j)
    check_grads(d_fun)(1.1 + 1.3j)
t2 = targets * 2 - 1 t2 = t2[:, np.newaxis, :] # Now t2 is -1 or 1, which makes the following form nice label_probabilities = -np.logaddexp(0, -unnormalized_logprobs * t2) return np.sum(label_probabilities, axis=-1) # Sum across pixels. def batched_loss(params, iter): data_idx = batch_indices(iter) return neglogprob(params, train_images[data_idx, :]) def neglogprob(params, data): return np.log(K) - logsumexp(bernoulli_log_density(data, params), axis=-1).mean() # Get gradient of objective using autograd. objective_grad = grad(batched_loss) def print_perf(params, iter, gradient): if iter % 30 == 0: save_images(sigmoid(params), 'results/4/thetas.png', vmin=0.0, vmax=1.0) print(batched_loss(params, iter)) # The optimizers provided by autograd can optimize lists, tuples, or dicts of parameters. # You may use these optimizers for Q4, but implement your own gradient descent optimizer for Q3! optimized_params = adam(objective_grad, theta, step_size=0.2, num_iters=10000,
def test_polygamma():
    """Gradient-check polygamma(0, x) and its derivative at a random point."""
    x = npr.randn()

    def fun(x):
        return to_scalar(autograd.scipy.special.polygamma(0, x))

    d_fun = grad(fun)
    for candidate in (fun, d_fun):
        check_grads(candidate, x)
def test_yn():
    """Gradient-check yn(2, x) and its derivative at a random point >= 0.2."""
    x = npr.randn()**2 + 0.2

    def fun(x):
        return to_scalar(autograd.scipy.special.yn(2, x))

    d_fun = grad(fun)
    for candidate in (fun, d_fun):
        check_grads(candidate, x)
# Network configuration (presumably layer widths, with one activation per
# transition between consecutive layers).
network_size = [2, 128, 128, 128, 1]
A = [sigmoid, sigmoid, sigmoid, identity]
network = simple_MLP(network_size, A)
layer_data = network.layer_data
L = network.L


# trial solution
def v(x, t, layer_data):
    """Trial solution: sin(pi x) plus x (x - 1) t times the network output."""
    point = np.array([x, t])
    network_output = network.input_to_output(point, layer_data)
    return np.sin(np.pi * x) + x * (x - 1) * t * network_output


# applying the operator D := Dxx - Dt to the trial solution
v_xx = grad(grad(v, 0), 0)
v_t = grad(v, 1)


def Dv(x, t, layer_data):
    """Evaluate v_xx - v_t at a single (x, t) point."""
    second_space_derivative = v_xx(x, t, layer_data)
    time_derivative = v_t(x, t, layer_data)
    return second_space_derivative - time_derivative


# cost function
def cost_function(domain, layer_data):
    """Sum of squared Dv values over the grid of domain[1] x domain[0]."""
    x, t = domain[0], domain[1]
    residuals = np.array([Dv(x_, t_, layer_data) for t_ in t for x_ in x])
    squared_norm = np.dot(residuals, residuals)
    # NOTE(review): the normalisation divides by np.size(x) twice although
    # the grid has np.size(x) * np.size(t) points -- confirm this is intended.
    return squared_norm / np.size(x) / np.size(x)
print ('b1', params[2][1][2]) # print ('b', params[2][2]) plt.cla() target_distribution = lambda x: np.exp(log_density(x)) var_distribution = lambda x: np.exp(variational_log_density(params, x)) plot_isocontours(ax, target_distribution) plot_isocontours(ax, var_distribution, cmap=plt.cm.bone) ax.set_autoscale_on(False) # rs = npr.RandomState(0) # samples = variational_sampler(params, num_plotting_samples, rs) # plt.plot(samples[:, 0], samples[:, 1], 'x') plt.draw() plt.pause(1.0/30.0) print("Optimizing variational parameters...") variational_params = adam(grad(objective), init_var_params(D), step_size=0.1, num_iters=2000, callback=callback)
def distance_from_target_image(smoke): return np.mean((target - smoke)**2) def convert_param_vector_to_matrices(params): vx = np.reshape(params[:(rows * cols)], (rows, cols)) vy = np.reshape(params[(rows * cols):], (rows, cols)) return vx, vy def objective(params): init_vx, init_vy = convert_param_vector_to_matrices(params) final_smoke = simulate(init_vx, init_vy, init_smoke, simulation_timesteps) return distance_from_target_image(final_smoke) # Specify gradient of objective function using autograd. objective_with_grad = grad(objective, return_function_value=True) fig = plt.figure(figsize=(8, 8)) ax = fig.add_subplot(111, frameon=False) def callback(params): init_vx, init_vy = convert_param_vector_to_matrices(params) simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax) print "Optimizing initial conditions..." result = minimize(objective_with_grad, init_dx_and_dy, jac=True, method='CG', options={ 'maxiter': 25,
def d_fun(input_list):
    """Reduce the gradient of ``fun`` at ``input_list`` to a scalar.

    Adds the sum of the first gradient entry, the sum of its sine and the
    sum of the sine of the second gradient entry.
    """
    g = grad(fun)(input_list)
    leading = g[0]
    return np.sum(leading) + np.sum(np.sin(leading)) + np.sum(np.sin(g[1]))
def flow_eq_grad1(self, mf, pu, pd):
    """Return derivative terms and the value of ``self.flow_eq``.

    :returns: a 4-tuple: the autograd derivative of ``flow_eq`` with respect
        to ``mf``, the constants ``-1.`` and ``1.``, and ``flow_eq`` itself
        evaluated at ``(mf, pu, pd)``.
    """
    d_mf = grad(self.flow_eq, 0)(mf, pu, pd)
    value = self.flow_eq(mf, pu, pd)
    return d_mf, -1., 1., value
def test_real_type():
    """The gradient of sum(real(x)) must match the input's real/complex kind."""
    def fun(x):
        return np.sum(np.real(x))

    df = grad(fun)
    real_input_grad = df(2.0)
    assert np.isrealobj(real_input_grad)
    complex_input_grad = df(1.0j)
    assert np.iscomplexobj(complex_input_grad)
def logistic_predictions(weights, inputs):
    """Return the logistic model's probability that each label is true."""
    logits = np.dot(inputs, weights)
    return sigmoid(logits)


def training_loss(weights):
    """Negative log-likelihood of the training labels under ``weights``."""
    preds = logistic_predictions(weights, inputs)
    # Probability the model assigns to the label that was actually observed.
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    log_likelihood = np.sum(np.log(label_probabilities))
    return -log_likelihood


# Toy dataset: four examples with three features each.
inputs = np.array([[0.52, 1.12, 0.77],
                   [0.88, -1.08, 0.15],
                   [0.52, 0.06, -1.30],
                   [0.74, -2.49, 1.39]])
targets = np.array([True, True, False, True])

# autograd builds the function returning gradients of the training loss.
training_gradient_fun = grad(training_loss)

# Numerical sanity check of the gradient at the starting point.
weights = np.array([0.0, 0.0, 0.0])
quick_grad_check(training_loss, weights)

# Plain gradient descent: 100 steps with a fixed step size of 0.01.
print("Initial loss:", training_loss(weights))
for _ in range(100):
    weights -= training_gradient_fun(weights) * 0.01

print("Trained loss:", training_loss(weights))
def test_abs_real():
    """Gradient-check np.abs and its derivative at fixed real points."""
    def fun(x):
        return np.abs(x)

    def d_fun(x):
        return grad(fun)(x)

    check_grads(fun)(1.1)
    check_grads(d_fun)(2.1)
def __init__(self, point_estimate, demo_func, data, mut_rate=None, length=1, regime="long", psd_rtol=1e-8, **kwargs):
    """
    Parameters
    ----------
    point_estimate : array
         a statistically consistent estimate for the true parameters.
         confidence regions and hypothesis tests are computed for a
         (shrinking) neighborhood around this point.
    demo_func : function that returns a Demography from parameters
    data : SegSites (or Sfs, if regime="many")
         if the object has a ``seg_sites`` attribute, that attribute is
         used in its place.
    mut_rate : optional; when given it is multiplied by ``length`` before
         being passed on.
    length : multiplier applied to ``mut_rate`` (default 1).
    regime : the limiting regime for the asymptotic confidence region
         if "long", number of loci is fixed, and the length of the loci
             -> infinity.
             * uses time series information to estimate covariance
               structure
             * requires isinstance(data, SegSites)
             * loci should be independent. they don't have to be
               identically distributed
         if "many", the number of loci -> infinity
             * loci should be independent, and roughly identically
               distributed
    psd_rtol : for checking if certain matrices (e.g. covariance matrices)
         are positive semidefinite. if psd_rtol = epsilon, then a matrix is
         considered positive semidefinite if its most negative eigenvalue
         has magnitude less than epsilon * most positive eigenvalue.
    **kwargs : additional arguments passed into composite_log_likelihood

    Raises
    ------
    ValueError : if ``regime`` is neither "long" nor "many".
    """
    if regime not in ("long", "many"):
        raise ValueError("Unrecognized regime '%s'" % regime)

    # Unwrap objects exposing ``seg_sites``; anything else is used as given.
    data = getattr(data, "seg_sites", data)

    if mut_rate is not None:
        mut_rate = mut_rate * length

    self.point = np.array(point_estimate)
    self.demo_func = demo_func
    self.data = data
    self.regime = regime
    self.kwargs = dict(kwargs)
    self.psd_rtol = psd_rtol

    # Gradient of the likelihood function at the point estimate. Evaluated
    # only after the attributes above are in place.
    self.score = autograd.grad(self.lik_fun)(self.point)

    self.score_cov = _observed_score_covariance(
        self.regime, self.point, self.data, self.demo_func,
        psd_rtol=self.psd_rtol, mut_rate=mut_rate, **self.kwargs)
    self.fisher = _observed_fisher_information(
        self.point, self.data, self.demo_func,
        psd_rtol=self.psd_rtol, assert_psd=False,
        mut_rate=mut_rate, **self.kwargs)
def test_angle_complex():
    """Gradient-check np.angle and its derivative at random complex points."""
    def fun(x):
        return np.angle(x)

    def d_fun(x):
        return grad(fun)(x)

    for candidate in (fun, d_fun):
        check_grads(candidate)(npr.rand() + 1j * npr.rand())
def test_drift_force(X, alpha):
    """Compare SumPooling.drift_force with 2 * grad(f) / f from autograd.

    Here ``f`` is ``sum_pool_np`` and the gradient is taken with respect to
    its first argument.
    """
    pool = SumPooling(SimpleGaussian(alpha))

    gradient = grad(sum_pool_np, 0)(X, alpha)
    value = sum_pool_np(X, alpha)
    expected = 2 * gradient / value

    assert_close(expected.ravel(), pool.drift_force(X))
def main():
    """Fit a small LightGBM model, tune its extracted parameters, score both.

    A classifier is fitted on a synthetic 3-class dataset, its trees are
    converted to a parameterised model via ``gbm_gen``, and those parameters
    are optimised with ``adam`` on a negative log-likelihood.

    :returns: dict with ``roc_auc_score`` results under the keys
        ``"train_base"`` (parameters before optimisation), ``"train_nnet"``
        (after optimisation) and ``"train_lgb"`` (the LightGBM predictions).
    """
    num_iters = 10
    X, y = make_classification(
        100,
        n_classes=3,
        n_informative=3,
        n_redundant=0,
        n_clusters_per_class=2,
        n_features=20,
    )

    model = lgb.LGBMClassifier(
        boosting_type="gbdt", objective="binary", n_estimators=3, random_state=1
    )
    model.fit(X, y)
    dumped = model.booster_.dump_model()
    tree_structures = [entry["tree_structure"] for entry in dumped["tree_info"]]

    # needs to infer from model.predict_proba? or labelbinarizer
    y_ohe = LabelBinarizer().fit_transform(y)
    nclass = y_ohe.shape[1]
    if nclass == 2:
        y_ohe = y

    if nclass > 2:
        trees = split_trees_by_classes(tree_structures, nclass)
        trees_params = multiclass_trees_to_param(X, y, trees)
        model_ = gbm_gen(
            trees_params[0], X, trees_params[2], trees_params[1], True, nclass
        )

        def training_loss(weights, idx=0):
            # Negative log-likelihood of the training labels; the 1e-7
            # keeps the logarithm away from zero.
            preds = model_(weights, X)
            return -np.sum(np.log(preds + 1e-7) * y_ohe)

    else:
        trees_params = multi_tree_to_param(X, y, tree_structures)
        model_ = gbm_gen(trees_params[0], X, trees_params[2], trees_params[1], False, 2)

        def training_loss(weights, idx=0):
            # Negative log-likelihood of the training labels.
            preds = sigmoid(model_(weights, X))
            label_probabilities = preds * y + (1 - preds) * (1 - y)
            return -np.sum(np.log(label_probabilities))

    # training the model and outputting results
    training_gradient_fun = grad(training_loss)
    param_ = adam(
        training_gradient_fun,
        trees_params[0],
        callback=simple_callback,
        step_size=0.05,
        num_iters=num_iters,
    )

    lgb_predict = model.predict_proba(X)
    if lgb_predict.shape[1] == 2:
        # Two probability columns: keep only the second one.
        lgb_predict = lgb_predict[:, 1]

    return {
        "train_base": roc_auc_score(y_ohe, model_(trees_params[0], X)),
        "train_nnet": roc_auc_score(y_ohe, model_(param_, X)),
        "train_lgb": roc_auc_score(y_ohe, lgb_predict),
    }
def optimize_latent_weighting_stochastic(self, exp_buffer, wb, task_steps, state_diffs=False, use_all_exp=False):
    """Learn the latent weights using gradients of the energy function with respect to the latent weights
    and performing minibatch updates via SGD (ADAM).

    NOTE(review): the ADAM update lines are commented out below; the active
    update is a plain gradient step scaled by self.wb_learning_rate, and the
    gradient is taken of self.simple_loss (the self.energy variant is
    commented out as well).

    Arguments:
    exp_buffer -- Either an ExperienceReplay object, or a list of transitions of a single instance; 
        if use_all_exp==False, then an ExperienceReplay object must be supplied; 
        otherwise, a list of transitions must be supplied (where each transition is a numpy array)
    wb -- the latent weights for the specific instance (updated in place by the -= steps below)
    task_steps -- total steps taken in environment

    Keyword Arguments:
    state_diffs -- boolean indicating if the BNN should predict state differences rather than the next state (default: False)
    use_all_exp -- boolean indicating whether updates should be performed using all experiences

    Returns the updated latent weights.
    """
    # Create gradient functional of the energy function wrt wb
    energy_grad = grad(self.simple_loss, argnum=2)
    # energy_grad = grad(self.energy, argnum=2)
    cur_latent_weights = wb
    # ADAM state; only `t` is updated while the ADAM lines are commented out.
    m1 = 0
    m2 = 0
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-8
    t = 0
    # With linear top latent weights, use a single sample of the BNN network weights to compute gradients
    if self.linear_latent_weights:
        # Saved here and restored just before returning.
        tmp_num_weight_samples = self.num_weight_samples
        self.num_weight_samples = 1
    for epoch in range(self.wb_opt_epochs):
        # Gather a sample of data from the experience buffer, convert to input and target arrays
        if use_all_exp:
            batch = exp_buffer
        else:
            batch, __, indices = exp_buffer.sample(task_steps)
        # batch: [state,self.__encode_action(action),reward,next_state]
        # Inputs are columns 0 and 1 joined; targets are column 3.
        X = np.array([np.hstack([batch[tt, 0], batch[tt, 1]]) for tt in range(len(batch))])
        y = np.array([batch[tt, 3] for tt in range(len(batch))])
        if state_diffs:
            # Subtract the leading columns of X (as many as the first
            # transition's column-0 entry has) from the targets.
            y = y - X[:, :batch[0, 0].shape[0]]
        self.N = X.shape[0]
        batch_idxs = self.__make_batches__()
        # Permute the indices of the training inputs for SGD purposes
        #permutation = np.random.permutation(X.shape[0])
        permutation = np.random.choice(range(X.shape[0]), X.shape[0], replace=False)
        for idxs in batch_idxs:
            t += 1
            grad_wb = energy_grad(self.weights, X[permutation[idxs]], cur_latent_weights, y[permutation[idxs]])
            # m1 = beta1*m1 + (1-beta1)*grad_wb
            # m2 = beta2*m2 + (1-beta2)*grad_wb**2
            # m1_hat = m1 / (1-beta1**t)
            # m2_hat = m2 / (1-beta2**t)
            # cur_latent_weights -= self.wb_learning_rate * m1_hat / (np.sqrt(m2_hat)+epsilon)
            cur_latent_weights -= self.wb_learning_rate * grad_wb
        # Re-queue sampled data with updated TD-error calculations
        # NOTE(review): placed inside the epoch loop because it uses this
        # epoch's `indices` and X -- confirm against the upstream file.
        # The first row of the latent weights is repeated once per row of X.
        X_latent_weights = np.vstack([cur_latent_weights[0] for i in range(X.shape[0])])
        if not use_all_exp and exp_buffer.mem_priority:
            td_loss = self.get_td_error(np.hstack([X, X_latent_weights]), y, 0.0, 1.0)
            # One row per sampled transition: TD-loss column(s), then index column(s).
            exp_buffer.update_priorities(np.hstack((np.reshape(td_loss, (len(td_loss), -1)), np.reshape(indices, (len(indices), -1)))))
    if self.linear_latent_weights:
        self.num_weight_samples = tmp_num_weight_samples
    return cur_latent_weights
def gradient_function(point):
    """Differentiate ``cost`` at ``point`` and pass the result through ``projector``.

    ``cost`` and ``projector`` are taken from the enclosing scope; the
    projector is called as ``projector(point, gradient)``.
    """
    raw_gradient = grad(cost)(point)
    return projector(point, raw_gradient)
def fit_network(self, exp_buffer, task_weights, task_steps, state_diffs=False, use_all_exp=False):
    """Learn the BNN network weights with minibatch gradient descent.

    The gradient of ``self.simple_loss`` with respect to its first positional
    argument (the network weights) is evaluated on minibatches and applied as
    a plain SGD step scaled by ``self.learning_rate``. No Adam moment
    estimates are maintained. ``self.weights`` is overwritten with the
    current weights at the end of every epoch.

    Arguments:
    exp_buffer -- Either an ExperienceReplay object, or a list of transitions;
        if use_all_exp==False, then an ExperienceReplay object must be supplied;
        otherwise, the transitions themselves must be supplied (indexed as ``batch[row, col]``)
    task_weights -- the latent weights: a numpy array with dimensions (number of instances x number of latent weights)
    task_steps -- total steps taken in environment (forwarded to ``exp_buffer.sample``)

    Keyword Arguments:
    state_diffs -- boolean indicating if the BNN should predict state differences rather than the next state
        (default: False)
    use_all_exp -- boolean indicating whether updates should be performed using all experiences

    Returns:
    None; the result is stored on ``self.weights``.
    """
    # Local import keeps this block self-contained; gradients are reported at
    # DEBUG level instead of being printed unconditionally for every minibatch.
    import logging
    logger = logging.getLogger(__name__)

    # Gradient functional of the loss with respect to the network weights (argument index 0).
    energy_grad = grad(self.simple_loss, argnum=0)
    weights = np.copy(self.weights)
    for _ in range(self.train_epochs):
        # Gather a sample of data from the experience buffer, convert to input and target arrays
        if use_all_exp:
            batch = exp_buffer
        else:
            batch, _info, indices = exp_buffer.sample(task_steps)
        # batch: [state, self.__encode_action(action), reward, next_state]
        n_rows = len(batch)
        X = np.array([np.hstack([batch[tt, 0], batch[tt, 1]]) for tt in range(n_rows)])
        # Column 4 of each transition selects the row of task_weights to use
        # (presumably the instance index -- confirm against the buffer layout).
        wb = np.array([task_weights[batch[tt, 4], :] for tt in range(n_rows)])
        y = np.array([batch[tt, 3] for tt in range(n_rows)])
        if state_diffs:
            # Targets become (next_state - state); the state occupies the leading columns of X.
            y = y - X[:, :batch[0, 0].shape[0]]
        self.N = X.shape[0]
        batch_idxs = self.__make_batches__()
        # Permute the indices of the training inputs for SGD purposes
        permutation = np.random.permutation(X.shape[0])
        for idxs in batch_idxs:
            rows = permutation[idxs]
            grad_w = energy_grad(weights, X[rows], wb[rows], y[rows])
            logger.debug("GRAD = %s", grad_w)
            # Plain SGD step.
            weights = weights - self.learning_rate * grad_w
        # Re-queue sampled data with updated TD-error calculations
        self.weights = weights
        if (not use_all_exp) and exp_buffer.mem_priority:
            td_loss = self.get_td_error(np.hstack([X, wb]), y, 0.0, 1.0)
            exp_buffer.update_priorities(
                np.hstack((np.reshape(td_loss, (len(td_loss), -1)),
                           np.reshape(indices, (len(indices), -1)))))
def hamiltonian_monte_carlo(
    n_samples,
    negative_log_prob,
    initial_position,
    tune=500,
    path_len=1,
    initial_step_size=0.1,
):
    """Run Hamiltonian Monte Carlo sampling.

    Parameters
    ----------
    n_samples : int
        Number of samples to return
    negative_log_prob : callable
        The negative log probability to sample from
    initial_position : np.array
        A place to start sampling from.
    tune: int
        Number of iterations to run tuning
    path_len : float
        How long each integration path is. Smaller is faster and more correlated.
    initial_step_size : float
        How long each integration step is. This will be tuned automatically.

    Returns
    -------
    np.array
        Array of length `n_samples`.

    Notes
    -----
    A proposal whose log acceptance ratio is NaN (for example because the
    integrator diverged) is rejected and reported to the step-size tuner with
    acceptance probability 0.
    """
    initial_position = np.array(initial_position)
    # autograd magic
    dVdq = grad(negative_log_prob)

    # collect all our samples in a list
    samples = [initial_position]

    # Keep a single object for momentum resampling
    momentum = st.norm(0, 1)

    step_size = initial_step_size
    step_size_tuning = DualAveragingStepSize(step_size)
    # If initial_position is a 10d vector and n_samples is 100, we want 100 x 10 momentum draws;
    # all draws are made in one call and iterated over row by row.
    size = (n_samples + tune,) + initial_position.shape[:1]
    for idx, p0 in tqdm(enumerate(momentum.rvs(size=size)), total=size[0]):
        # Integrate over our path to get a new position and momentum
        q_new, p_new = leapfrog(
            samples[-1],
            p0,
            dVdq,
            path_len=2 * np.random.rand() * path_len,  # We jitter the path length a bit
            step_size=step_size,
        )

        # Check Metropolis acceptance criterion
        start_log_p = np.sum(momentum.logpdf(p0)) - negative_log_prob(samples[-1])
        new_log_p = np.sum(momentum.logpdf(p_new)) - negative_log_prob(q_new)
        log_accept_ratio = new_log_p - start_log_p
        if np.isnan(log_accept_ratio):
            # ``min(1, nan)`` evaluates to 1, which would always accept a
            # NaN proposal; treat it as a certain rejection instead.
            p_accept = 0.0
        else:
            # exp(min(0, r)) == min(1, exp(r)) but cannot overflow.
            p_accept = np.exp(min(0.0, log_accept_ratio))
        if np.random.rand() < p_accept:
            samples.append(q_new)
        else:
            samples.append(np.copy(samples[-1]))

        # During tuning keep the first value returned by the tuner; on the
        # last tuning iteration switch to the second value and freeze it.
        if idx < tune - 1:
            step_size, _ = step_size_tuning.update(p_accept)
        elif idx == tune - 1:
            _, step_size = step_size_tuning.update(p_accept)

    # Drop the initial position and the tuning draws.
    return np.array(samples[1 + tune:])
def map_gpp_bnn(layer_sizes, nonlinearity=np.tanh,
                n_data=200, N_samples=10,
                L2_reg=0.1, noise_var=0.1):
    """Build the functions used to match a BNN to a GP prior via a stochastic KL estimate.

    Relies on the module-level names ``rs`` (random state), ``covariance``,
    ``cholesky``, ``solve`` and ``grad``.

    Arguments:
    layer_sizes -- sequence of layer widths; consecutive pairs define each layer's (inputs, outputs)
    nonlinearity -- applied to every layer's output to form the next layer's input
        (the final layer's pre-activation output is what ``predictions`` returns)
    n_data -- number of input locations drawn per ``kl_objective`` evaluation
    N_samples -- number of weight samples drawn per Monte Carlo estimate
    L2_reg -- coefficient of the squared-weight term in the log prior
    noise_var -- added to the GP kernel diagonal, divides the squared error in the
        likelihood, and scales the noise added to the BNN function samples

    Returns:
    (N_weights, predictions, sample_gpp, unpack_params, kl_objective, grad_kl), where
    grad_kl is the gradient of kl_objective with respect to its first two arguments.
    """
    shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    # Each layer contributes an (m x n) weight matrix plus n biases.
    N_weights = sum((m+1)*n for m, n in shapes)

    def unpack_params(params):
        """Split a flat [2*N_weights] vector into (mean, log_std)."""
        mean, log_std = params[:N_weights], params[N_weights:]
        return mean, log_std

    def unpack_layers(weights):
        """Iterable that unpacks the weights into relevant tensor shapes for each layer.

        Yields (W, b) with W | dim = [N_weight_samples, m, n] and
        b | dim = [N_weight_samples, 1, n].
        """
        num_weight_sets = len(weights)
        for m, n in shapes:
            yield weights[:, :m*n].reshape((num_weight_sets, m, n)), \
                  weights[:, m*n:m*n+n].reshape((num_weight_sets, 1, n))
            # Drop the columns consumed by this layer.
            weights = weights[:, (m+1)*n:]

    def predictions(weights, inputs):
        """Implements the forward pass of the bnn.

        weights | dim = [N_weight_samples, N_weights]
        inputs  | dim = [N_data, layer_sizes[0]] (a leading axis is added here)
        outputs | dim = [N_weight_samples, N_data, layer_sizes[-1]]
        """
        inputs = np.expand_dims(inputs, 0)
        for W, b in unpack_layers(weights):
            outputs = np.einsum('mnd,mdo->mno', inputs, W) + b
            inputs = nonlinearity(outputs)
        return outputs

    def sample_gpp(x, n_samples):
        """Samples from the gp prior.

        x = inputs (flattened to shape [N_data])
        returns : samples from the gp prior [N_data, N_samples]
        """
        x = np.ravel(x)
        n_data = len(x)
        K = covariance(x[:, None], x[:, None])
        # Small jitter on the diagonal before the Cholesky factorisation.
        L = cholesky(K + 1e-7 * np.eye(n_data))
        e = rs.randn(n_data, n_samples)
        return np.dot(L, e)

    def log_gp_prior(y_bnn, x):
        """Computes the GP log prior of BNN outputs, up to constants.

        p_gp(f) = N(f|0,K) with K = covariance(x, x) + noise_var*I = LL^T, so
        log p_gp(f) = -0.5 * (L^-1 f)^T (L^-1 f) + const
        (constants are ignored as the covariance hyper-params are not optimized).
        Note the code takes the mean, not the sum, of the squared entries over N_data.

        y_bnn output of a bnn | dim = [N_data, N_weights_samples]
        returns               | dim = [N_weights_samples]
        """
        K = covariance(x, x)+noise_var*np.eye(len(x))   # shape [N_data, N_data]
        L = cholesky(K)                                 # K = LL^T ; shape L = shape K
        a = solve(L, y_bnn)                             # a = L^-1 y_bnn
        log_gp = -0.5*np.mean(a**2, axis=0)             # average of a^2 over N_data
        return log_gp

    def log_prob(weights, inputs, targets):
        """Computes log p(y,w) = log p(y|w) + log p(w), up to constants.

        weights   | dim = [N_weight_samples, N_weights]
        preds = f | dim = [N_weight_samples, N_data, 1]
        targets=y | dim = [N_data] or [N_data, 1]
        log_prior | dim = [N_weights_samples]
        log_lik   | dim = [N_weights_samples]
        """
        log_prior = -L2_reg * np.sum(weights ** 2, axis=1)
        preds = predictions(weights, inputs)
        if np.ndim(targets) == 1:
            # A 1-D target vector would broadcast against preds' trailing axis
            # ([S, N, 1] - [N] -> [S, N, N]) and every prediction would end up
            # compared with targets[0]; lift it to [N_data, 1] instead.
            targets = targets[:, None]
        log_lik = -np.sum((preds - targets)**2, axis=1)[:, 0] / noise_var
        return log_prior + log_lik

    def gaussian_entropy(log_std):
        """Entropy of a diagonal Gaussian with the given per-weight log std."""
        return 0.5 * N_weights * (1.0 + np.log(2*np.pi)) + np.sum(log_std)

    def elbo(var_param, x, y):
        """Provides a stochastic estimate of the evidence lower bound.

        ELBO = E_r(w) [log p(y,w) - log r(w)]
        var_param     | dim = [2*N_weights]
        mean, log_std | dim = [N_weights]
        ws            | dim = [N_samples, N_weights]
        returns : ELBO (scalar)
        """
        mean, log_std = unpack_params(var_param)
        ws = rs.randn(N_samples, N_weights) * np.exp(log_std) + mean  # sample weights from r(w)
        return gaussian_entropy(log_std) + np.mean(log_prob(ws, x, y))

    def log_pys(thetas, ys, x):
        """Creates an array of ELBO estimates of log p(y), one per (theta, y) pair.

        ys has shape [y_samples, N_data]; thetas and ys are paired with zip,
        so the shorter of the two determines the output length.
        """
        elbos = np.array([elbo(theta, x, y) for theta, y in zip(thetas, ys)])
        return elbos

    def kl_objective(params_phi, params_theta, t):
        """Provides a stochastic estimate of the kl divergence.

        kl[p(y)|p_GP(y)] = E_p(y) [log p(y) - log p_gp(y)]
                         = -H[p(y)] - E_p(y) [log p_gp(y)]

        params_phi            | dim = [2*N_weights]
        params_theta          | list of [2*N_weights] : the var params of each r(w|theta)
        t                     | unused here (presumably the optimizer's iteration index)
        phi_mean, phi_log_std | dim = [N_weights]
        w_phi                 | dim = [N_samples, N_weights]
        y_bnn                 | dim = [N_data, N_weights_samples]
        returns : kl (scalar)
        """
        phi_mean, phi_log_std = unpack_params(params_phi)
        w_phi = rs.randn(N_samples, N_weights) * np.exp(phi_log_std) + phi_mean

        # NOTE(review): the inputs are drawn from the global np.random state,
        # while every other draw in this function uses ``rs``.
        x = np.random.uniform(low=-10, high=10, size=(n_data, 1))    # X ~ p(X)
        f_bnn = predictions(w_phi, x)[:, :, 0].T    # shape [N_data, N_weights_samples]  f ~ p(f)
        y_bnn = f_bnn + 3*noise_var*rs.randn(n_data, N_samples)      # y = f + e ; y ~ p(y)

        # Monte Carlo estimate of E_p(y)[log p(y)] (i.e. -H[p(y)]) via the ELBO
        expected_log_py = np.mean(log_pys(params_theta, y_bnn.T, x))

        # Monte Carlo estimate of E_p(y)[log p_gp(y)]
        expected_log_gpp = np.mean(log_gp_prior(y_bnn, x))

        kl_div = expected_log_py - expected_log_gpp
        return kl_div

    grad_kl = grad(kl_objective, argnum=(0, 1))

    return N_weights, predictions, sample_gpp, unpack_params, kl_objective, grad_kl
def prepare_loss_node(loss, opt_args_ls=None):
    """Wrap ``loss`` as required by the backend selected in ``global_settings``.

    Arguments:
    loss -- the loss callable
    opt_args_ls -- for the 'autograd' backend, forwarded as the second positional
        argument of ``ag.grad`` (presumably the argument index/indices to
        differentiate with respect to -- confirm against ``ag``). When left as
        None it is omitted, so ``ag.grad`` falls back to its own default rather
        than receiving None.

    Returns:
    ``ag.grad(loss, ...)`` for the 'autograd' backend, ``loss`` unchanged for
    the 'pytorch' backend, and None for any other backend value.
    """
    backend = global_settings.backend
    if backend == 'autograd':
        if opt_args_ls is None:
            return ag.grad(loss)
        return ag.grad(loss, opt_args_ls)
    if backend == 'pytorch':
        return loss
    # Other backends are not handled here; callers receive None as before.
    return None
# Upper bounds for the sigma and nsig parameters, one entry per cell of the
# (nEtaBins, nEtaBins, nPtBins, nPtBins) grid, flattened to 1-D.
# The fill values are floats on purpose: an integer fill value would make the
# array integer-typed and truncate the -3.5 assigned below.
ub_sigma = np.full((nEtaBins, nEtaBins, nPtBins, nPtBins), 10.).flatten()
ub_nsig = np.full((nEtaBins, nEtaBins, nPtBins, nPtBins), 20.).flatten()

# Override the upper bounds of the entries selected by idx.
ub_scale[idx] = np.full(len(idx), 1.)
ub_sigma[idx] = np.full(len(idx), -3.5)
ub_nsig[idx] = np.full(len(idx), 6.9)

# Bounds are laid out as [scale | sigma | nsig]
# (presumably matching the layout of x -- confirm).
lb = np.concatenate((lb_scale, lb_sigma, lb_nsig), axis=None)
ub = np.concatenate((ub_scale, ub_sigma, ub_nsig), axis=None)

# An identity constraint matrix turns the linear constraint into per-parameter box bounds.
constraints = LinearConstraint(A=np.eye(x.shape[0]), lb=lb, ub=ub, keep_feasible=True)

# NOTE(review): this rebinds the name ``grad`` from the differentiation
# function to the gradient of nll; the binding is kept in case later code
# relies on it. ``hess`` is not passed to minimize below (SR1() is used).
grad = grad(nll)
hess = hessian(nll)

res = minimize(nll, x, args=(nEtaBins, nPtBins, datasetJ, datasetJgen),
               method='trust-constr', jac=grad, hess=SR1(), constraints=constraints,
               options={'verbose': 3, 'disp': True, 'maxiter': 100000, 'gtol': 0.,
                        'xtol': xtol, 'barrier_tol': btol})

print(res)

# Keep only the cells whose datasetJgen sum along axis 2 exceeds 1000.
good_idx = np.where((np.sum(datasetJgen, axis=2) > 1000.).flatten())[0]

# sep is the length of one flattened grid; the offsets select the same cells
# in each of the three concatenated parameter groups.
sep = nEtaBins * nEtaBins * nPtBins * nPtBins
good_idx = np.concatenate((good_idx, good_idx + sep, good_idx + 2 * sep), axis=None)

fitres = res.x[good_idx]