def extract_parameters(theta, summaries, n_inducing, n_latent, same_z=False):
    """Unpack the flat vector ``theta`` into the named model quantities.

    The flat vector is rebuilt into a dictionary with ``reconstruct_tf``.
    The entries ``W_sds``, ``W_prior_sd`` and ``kernel_params`` are squared,
    which makes the returned values non-negative.

    Args:
        theta: Flat parameter vector.
        summaries: Summary object used by ``reconstruct_tf`` to rebuild the
            dictionary.
        n_inducing: Passed through to ``create_ls``.
        n_latent: Passed through to ``create_ls`` (and to ``rep_matrix`` when
            ``same_z`` is set).
        same_z: If true, the stored ``Z`` is expanded with
            ``rep_matrix(Z, n_latent)``; otherwise it is used as stored.

    Returns:
        Tuple ``(ms, Ls, w_means, w_vars, Z, kern_params, w_prior_means,
        w_prior_vars, intercept)``.
    """
    params = reconstruct_tf(theta, summaries)

    inducing_means = params['m']
    inducing_ls = create_ls(params['L_elts'], n_inducing, n_latent)

    weight_means = params['W_means']
    weight_vars = params['W_sds']**2

    inducing_points = params['Z']
    if same_z:
        inducing_points = rep_matrix(inducing_points, n_latent)

    prior_means = params['W_prior_mean']
    prior_vars = params['W_prior_sd']**2

    # Show the flattened prior means and variances on every call.
    for quantity in (prior_means, prior_vars):
        print(tf.reshape(quantity, (-1, )))

    kernel_parameters = params['kernel_params']**2
    intercept_term = tf.squeeze(params['intercept'])

    return (inducing_means, inducing_ls, weight_means, weight_vars,
            inducing_points, kernel_parameters, prior_means, prior_vars,
            intercept_term)
def to_minimize(x):
    """Return the negated objective for the flat parameter vector ``x``.

    ``x`` is rebuilt into a parameter dictionary with ``reconstruct_tf``; the
    data, priors and sizes (``X``, ``y``, ``Z``, ``alphas``, ``summary`` ...)
    are read from the enclosing scope. As a side effect the correlation
    matrix derived from ``b_mat^T b_mat`` is printed on every call, which
    requires eager execution because of the ``.numpy()`` call.

    Args:
        x: Flat parameter vector.

    Returns:
        The negative of the value returned by ``compute_objective``.
    """
    theta = reconstruct_tf(x, summary)

    # TODO: Check initial values are still consistent here
    # Stored length scales are squared, so the kernels see non-negative values.
    kerns = [
        partial(matern_kernel_32, alpha=alpha, lengthscales=lscale,
                jitter=JITTER)
        for alpha, lscale in zip(alphas, theta['lscales']**2)
    ]

    site_ls = create_ls(theta['site_l_elts'], n_latent_site, n_sites)
    env_ls = create_ls(theta['env_l_elts'], n_inducing, n_latent)

    objective = compute_objective(
        X, y, Z, theta['env_ms'], env_ls, kerns, theta['w_means'],
        theta['w_vars']**2, w_prior_mean, w_prior_var, site_prior_mean,
        site_prior_cov, theta['site_means'], site_ls, theta['b_mat'])

    # TODO: Add a prior term. The statement that used to be here was an
    # unfinished `objective = objective +` which made the function a
    # SyntaxError; no prior is added until the intended term is known.

    cur_corr_mat = tf.transpose(theta['b_mat']) @ theta['b_mat']
    print(np.round(covar_to_corr(cur_corr_mat.numpy()), 2))

    return -objective
def to_optimise(
    flat_theta,
    X,
    z,
    weights,
    use_berman_turner,
    summary,
    init_kernel_spec,
    log_theta_dir=None,
    verbose=True,
    likelihood_scale_factor=1.0,
):
    """Evaluate the objective to minimise and its gradient at ``flat_theta``.

    The objective is the negated ``calculate_objective`` value minus the
    value of ``calculate_prior_prob`` for the current kernel spec.

    Args:
        flat_theta: Flat parameter vector; cast to float32 internally.
        X, z, weights: Data passed straight to ``calculate_objective``.
        use_berman_turner: Forwarded to ``calculate_objective``.
        summary: Summary used to rebuild the parameter dictionary.
        init_kernel_spec: Forwarded to ``update_specs``.
        log_theta_dir: If not None, the directory is created if needed and
            the current gradients and parameters are written there as
            ``grads_<STEP>`` / ``theta_<STEP>`` ``.npz`` archives.
        verbose: If true, print the objective and the gradient norm.
        likelihood_scale_factor: NOTE(review): accepted but never read in
            this function; confirm whether it should be applied.

    Returns:
        ``(objective, gradient)`` as float64 NumPy values.
    """
    global STEP

    flat_theta = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(flat_theta)
        theta = reconstruct_tf(flat_theta, summary)
        kernel_spec, gp_spec = update_specs(theta, init_kernel_spec)
        negated_objective = -calculate_objective(
            X, z, weights, gp_spec, use_berman_turner=use_berman_turner)
        cur_objective = negated_objective - calculate_prior_prob(kernel_spec)

    cur_grad = tape.gradient(cur_objective, flat_theta)

    if log_theta_dir is not None:
        makedirs(log_theta_dir, exist_ok=True)

        def _dump(file_stem, arrays):
            # Each archive also records the objective value and step index.
            np.savez(
                join(log_theta_dir, file_stem),
                **arrays,
                objective=cur_objective.numpy(),
                step=STEP,
            )

        _dump(f"grads_{STEP}", reconstruct_np(cur_grad.numpy(), summary))
        _dump(f"theta_{STEP}", reconstruct_np(flat_theta.numpy(), summary))

    STEP += 1

    if verbose:
        print(cur_objective, np.linalg.norm(cur_grad.numpy()))

    objective_out = cur_objective.numpy().astype(np.float64)
    gradient_out = cur_grad.numpy().astype(np.float64)
    return (objective_out, gradient_out)
def to_minimize(flat_theta):
    """Return the negated objective and its gradient at ``flat_theta``.

    The input is cast to float32 for TensorFlow; both outputs are returned
    as float64 NumPy values. The objective and gradient norm are printed on
    every call. ``summary`` and ``calculate_objective`` come from the
    enclosing scope.
    """
    theta_tensor = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(theta_tensor)
        objective = -calculate_objective(reconstruct_tf(theta_tensor, summary))

    grad = tape.gradient(objective, theta_tensor)
    print(objective, np.linalg.norm(grad.numpy()))

    objective_out = objective.numpy().astype(np.float64)
    gradient_out = grad.numpy().astype(np.float64)
    return (objective_out, gradient_out)
def test_with_tf():
    """Check that flattening then ``reconstruct_tf`` recovers every array."""
    input_arrays = generate_arrays()

    # Reference tensors built directly from the generated arrays.
    expected = {}
    for name, array in input_arrays.items():
        expected[name] = tf.constant(array)

    flat_array, summaries = flatten_and_summarise(**input_arrays)
    reconstructed = reconstruct_tf(flat_array, summaries,
                                   reshape_fun=tf.reshape)

    # One boolean per array: True when all entries match exactly.
    checks = []
    for name in input_arrays:
        matches = tf.equal(reconstructed[name], expected[name])
        checks.append(tf.reduce_all(matches))

    assert tf.reduce_all(checks).numpy()
def to_optimise(flat_theta):
    """Return the negated objective and its gradient at ``flat_theta``.

    The rebuilt parameter dictionary is forwarded to ``compute_objective``
    as keyword arguments, together with sizes and id arrays read from the
    enclosing scope. On every call the objective, the gradient norm and the
    rounded correlation matrix derived from ``elts_prior_serve`` are printed.

    Returns:
        ``(objective, gradient)`` as float64 NumPy values.
    """
    theta_tensor = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(theta_tensor)
        theta = reconstruct_tf(theta_tensor, summary)
        obj = -compute_objective(
            n=n,
            p=p,
            n_surfaces=n_surfaces,
            server_ids=server_ids,
            returner_ids=returner_ids,
            surf_ids=surf_ids,
            **theta,
        )

    grad = tape.gradient(obj, theta_tensor)
    print(obj, np.linalg.norm(grad.numpy()))

    serve_prior_cov = pos_def_mat_from_vector(theta['elts_prior_serve'],
                                              n_surfaces)
    print(np.round(covar_to_corr(serve_prior_cov), 2))

    return obj.numpy().astype(np.float64), grad.numpy().astype(np.float64)
def to_minimize_with_grad(x):
    """Return the objective to minimise and its gradient at ``x``.

    The objective is the negated ``compute_objective`` value minus the
    log-prior terms. ``alpha``, ``lscales`` and ``bias_sd`` are the squares
    of the stored entries; the priors on ``alpha`` and ``bias_sd`` are
    evaluated at those values squared once more. Data, priors and
    ``verbose`` are read from the enclosing scope.

    Returns:
        ``(objective, gradient)`` as float64 NumPy values.
    """
    x_tf = tf.cast(tf.constant(x), tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(x_tf)
        theta = reconstruct_tf(x_tf, summary)

        alpha = theta["alpha"]**2
        lscales = theta["lscales"]**2
        bias_sd = theta["bias_sd"]**2

        L_cov = lo_tri_from_elements(theta["L_elts"], n_inducing)
        kern_fun = get_kernel_fun(kernel_fun, alpha, lscales, bias_sd)

        objective = -compute_objective(X, y, theta["mu"], L_cov, theta["Z"],
                                       bernoulli_probit_lik, kern_fun)

        log_prior = (tf.reduce_sum(lscale_prior.log_prob(lscales))
                     + kernel_var_prior.log_prob(alpha**2)
                     + bias_var_prior.log_prob(bias_sd**2))
        objective = objective - log_prior

    grad = tape.gradient(objective, x_tf)

    if verbose:
        print(objective, np.linalg.norm(grad.numpy()))

    objective_out = objective.numpy().astype(np.float64)
    gradient_out = grad.numpy().astype(np.float64)
    return (objective_out, gradient_out)
def objective_and_grad(flat_theta,
                       X,
                       X_thin,
                       sp_num,
                       z,
                       weights,
                       summary,
                       n_latent,
                       n_data,
                       use_berman_turner,
                       log_cov_alpha,
                       log_thin_alpha=0.,
                       thin_Zs=None,
                       w_prior_mean=None):
    """Return the objective to minimise and its gradient at ``flat_theta``.

    The objective is the negated ``calculate_objective`` value minus a set
    of log-prior terms on the length scales, the prior weight means and
    variances, and the intercept prior mean and variance. When ``X_thin`` is
    given, matching terms for the thinning parameters are subtracted too.

    If the gradient contains NaNs, the current parameters and data are
    written to ``theta_bug.npz`` / ``data_bug.npz`` in the working directory
    and the process exits.

    Args:
        flat_theta: Flat NumPy parameter vector (cast to float32).
        X: Covariates, converted to a float32 tensor.
        X_thin: Thinning covariates, or None to disable the thinning terms.
        sp_num: Forwarded unchanged to ``calculate_objective``.
        z, weights: Converted to float32 tensors and forwarded.
        summary: Summary used to rebuild the parameter dictionary.
        n_latent: Only recorded in the NaN debug dump.
        n_data: Used for the likelihood scale ``n_data / X.shape[0]``.
        use_berman_turner: Forwarded to ``calculate_objective``.
        log_cov_alpha: Fixed value stored as ``theta['log_cov_alpha']``.
        log_thin_alpha: Fixed value stored as ``theta['log_thin_alpha']``.
        thin_Zs: If given, overrides ``theta['thin_Zs']`` (they are then not
            optimised).
        w_prior_mean: If given, overrides ``theta['w_prior_mean']``.

    Returns:
        ``(objective, gradient)`` as float64 NumPy values.
    """
    # TODO: Make priors configurable.
    # Note: if thin_Zs is passed, we are not optimising their locations.
    # This is not the cleanest way of doing it, but it's the best I can think
    # of for now.
    flat_theta = tf.constant(flat_theta.astype(np.float32))

    def _as_float32(array):
        return tf.cast(tf.constant(array), tf.float32)

    X, z, weights = map(_as_float32, [X, z, weights])
    if X_thin is not None:
        X_thin = _as_float32(X_thin)

    gamma_lscale = tfp.distributions.Gamma(3, 1 / 3)
    gamma_var = tfp.distributions.Gamma(0.5, 0.5)
    std_normal = tfp.distributions.Normal(0., 1.)

    with tf.GradientTape() as tape:
        tape.watch(flat_theta)

        theta = reconstruct_tf(flat_theta, summary)

        if thin_Zs is not None:
            theta['thin_Zs'] = thin_Zs

        # These are fixed during optimisation, so we're setting them here.
        theta['log_cov_alpha'] = log_cov_alpha
        theta['log_thin_alpha'] = log_thin_alpha

        # Also fix the intercept; give it only a very weak prior.
        # TODO: Maybe make this configurable
        theta['intercept_prior_var'] = tf.math.log(tf.constant(5.)**2)
        theta['intercept_prior_mean'] = tf.constant(0.)

        if w_prior_mean is not None:
            theta['w_prior_mean'] = w_prior_mean

        spec = build_spec(theta)

        obj = -calculate_objective(spec.cov_mogp_spec,
                                   X,
                                   sp_num,
                                   z,
                                   weights,
                                   lik_scale_factor=n_data / X.shape[0],
                                   thinning_mogp_spec=spec.thin_mogp_spec,
                                   X_thin=X_thin,
                                   use_berman_turner=use_berman_turner)

        # Prior on length scales (stored on the log scale, hence the exp).
        # TODO: Make these configurable
        obj = obj - tf.reduce_sum(
            gamma_lscale.log_prob(tf.exp(theta['lscales'])))

        # Prior on prior w means and variances
        obj = obj - tf.reduce_sum(std_normal.log_prob(theta['w_prior_mean']))
        obj = obj - tf.reduce_sum(
            gamma_var.log_prob(tf.exp(theta['w_prior_var'])))

        # Prior on intercept mean and variance
        obj = obj - tf.reduce_sum(
            std_normal.log_prob(theta['intercept_prior_mean']))
        obj = obj - tf.reduce_sum(
            gamma_var.log_prob(tf.exp(theta['intercept_prior_var'])))

        if X_thin is not None:
            obj = obj - tf.reduce_sum(
                gamma_lscale.log_prob(tf.exp(theta['thin_lscales'])))
            obj = obj - tf.reduce_sum(
                std_normal.log_prob(theta['thin_w_prior_mean']))
            obj = obj - tf.reduce_sum(
                gamma_var.log_prob(tf.exp(theta['thin_w_prior_var'])))

    grad = tape.gradient(obj, flat_theta)
    grad_np = grad.numpy()

    if np.any(np.isnan(grad_np)):
        # Save the current state for investigation. np.asarray is used
        # instead of .numpy() because some entries are plain Python/NumPy
        # values (e.g. the default log_thin_alpha), and optional inputs that
        # were not supplied are skipped rather than dereferenced.
        np.savez('theta_bug',
                 **{name: np.asarray(value)
                    for name, value in theta.items()})

        data_state = {
            'X': X,
            'X_thin': X_thin,
            'sp_num': sp_num,
            'z': z,
            'weights': weights,
            'n_latent': n_latent,
            'n_data': n_data,
            'use_berman_turner': use_berman_turner,
            'thin_Zs': thin_Zs,
        }
        np.savez('data_bug',
                 **{name: np.asarray(value)
                    for name, value in data_state.items()
                    if value is not None})
        # NOTE(review): exits with the default status; consider raising an
        # exception so callers can distinguish this from a clean finish.
        exit()

    return obj.numpy().astype(np.float64), grad_np.astype(np.float64)
def fit(X: np.ndarray,
        y: np.ndarray,
        n_inducing: int = 100,
        n_latent: int = 10,
        kernel: str = 'matern_3/2',
        random_seed: int = 2):
    """Fit the correlated multi-output model with L-BFGS-B.

    The parameters are flattened into one vector, optimised with
    ``scipy.optimize.minimize`` using TensorFlow gradients, and unpacked
    into a ``CorrelatedMOGPResult``.

    Length scales are stored as their square roots in the parameter vector
    and squared wherever they are used, so the initial kernels and the
    first objective evaluation use the same length scales.

    Args:
        X: Covariates; ``X.shape[1]`` is the number of covariates. The input
            must already be scaled.
        y: Outcomes; ``y.shape[1]`` is the number of outputs.
        n_inducing: Number of inducing points.
        n_latent: Number of latent processes.
        kernel: Key into ``kern_lookup``.
        random_seed: Seed passed to ``np.random.seed``.

    Returns:
        A ``CorrelatedMOGPResult`` holding the fitted quantities.
    """
    # TODO: This is copied from the mogp_classifier.
    # Maybe instead make it a function of some sort?
    np.random.seed(random_seed)

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]
    n_out = y.shape[1]

    # Set initial values
    start_lengthscales = np.random.uniform(2., 4., size=(n_latent, n_cov))

    Z = find_starting_z(X, n_inducing)
    Z = np.tile(Z, (n_latent, 1, 1))

    # The kernels used to initialise L must use the same length scales the
    # optimiser starts from; theta stores their square roots (see below).
    start_kernel_funs = get_kernel_funs(kernel_fun, start_lengthscales)

    init_Ls = np.stack([
        get_initial_values_from_kernel(cur_z, cur_kernel_fun)
        for cur_z, cur_kernel_fun in zip(Z, start_kernel_funs)
    ])

    init_ms = np.zeros((n_latent, n_inducing))

    start_prior_cov = np.eye(n_latent)
    start_prior_mean = np.zeros(n_latent)

    start_prior_cov_elts = corr_mogp.get_initial_w_elements(
        start_prior_mean, start_prior_cov, n_out)

    start_w_cov_elts = rep_vector(start_prior_cov_elts, n_out)

    init_w_means = np.random.randn(n_out, n_latent)

    start_theta = {
        'mu': init_ms,
        'L_elts': init_Ls,
        'w_means': init_w_means,
        'w_cov_elts': start_w_cov_elts,
        # Stored as square roots; squared again wherever they are used.
        'lengthscales': np.sqrt(start_lengthscales),
        'w_prior_cov_elts': start_prior_cov_elts,
        'w_prior_mean': start_prior_mean,
        'Z': Z
    }

    flat_start_theta, summary = flatten_and_summarise_tf(**start_theta)

    X_tf = tf.constant(X.astype(np.float32))
    y_tf = tf.constant(y.astype(np.float32))

    def extract_cov_matrices(theta):
        """Build the weight covariances, L factors and prior covariance."""
        w_covs = create_pos_def_mat_from_elts_batch(theta['w_cov_elts'],
                                                    n_latent,
                                                    n_out,
                                                    jitter=JITTER)
        Ls = mogp.create_ls(theta['L_elts'], n_inducing, n_latent)
        w_prior_cov = create_pos_def_mat_from_elts(theta['w_prior_cov_elts'],
                                                   n_latent,
                                                   jitter=JITTER)
        return w_covs, Ls, w_prior_cov

    def calculate_objective(theta):
        """Return the default objective plus the length-scale log prior."""
        w_covs, Ls, w_prior_cov = extract_cov_matrices(theta)

        # Progress output: prior correlation matrix and current length scales.
        print(np.round(covar_to_corr(w_prior_cov.numpy()), 2))
        print(np.round(theta['lengthscales'].numpy()**2, 2))

        kernel_funs = get_kernel_funs(kernel_fun, theta['lengthscales']**2)

        cur_objective = corr_mogp.compute_default_objective(
            X_tf, y_tf, theta['Z'], theta['mu'], Ls, theta['w_means'],
            w_covs, kernel_funs, bernoulli_probit_lik, theta['w_prior_mean'],
            w_prior_cov)

        # Add prior
        lscale_prior = tfp.distributions.Gamma(3, 1 / 3).log_prob(
            theta['lengthscales']**2)

        return cur_objective + tf.reduce_sum(lscale_prior)

    def to_minimize(flat_theta):
        """Return the negated objective and gradient as float64 arrays."""
        flat_theta = tf.constant(flat_theta)
        flat_theta = tf.cast(flat_theta, tf.float32)

        with tf.GradientTape() as tape:
            tape.watch(flat_theta)
            theta = reconstruct_tf(flat_theta, summary)
            objective = -calculate_objective(theta)

        grad = tape.gradient(objective, flat_theta)

        print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    result = minimize(to_minimize,
                      flat_start_theta,
                      jac=True,
                      method='L-BFGS-B')

    final_theta = reconstruct_tf(result.x.astype(np.float32), summary)

    w_covs, Ls, w_prior_cov = extract_cov_matrices(final_theta)

    return CorrelatedMOGPResult(
        Ls=Ls,
        mu=final_theta['mu'].numpy(),
        kernel=kernel,
        lengthscales=final_theta['lengthscales'].numpy()**2,
        w_means=final_theta['w_means'].numpy(),
        w_cov=w_covs.numpy(),
        Z=final_theta['Z'].numpy(),
        w_prior_means=final_theta['w_prior_mean'].numpy(),
        w_prior_cov=w_prior_cov.numpy())
def to_minimize_with_grad(x):
    """Return the objective to minimise and its gradient at ``x``.

    The objective is ``-(likelihood term - KL term)`` minus the log-prior
    terms. Length scales and all variance entries are obtained by squaring
    the stored values. Data, priors, sizes and ``verbose`` are read from the
    enclosing scope.

    Returns:
        ``(objective, gradient)`` as float64 NumPy values.
    """
    x_tf = tf.cast(tf.constant(x), tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(x_tf)
        theta = reconstruct_tf(x_tf, summary)

        # Squared entries are non-negative by construction.
        lscales = theta["lscales"] ** 2
        w_prior_var = theta["w_prior_var"] ** 2
        intercept_vars = theta["intercept_vars"] ** 2
        intercept_prior_var = theta["intercept_prior_var"] ** 2
        w_vars = theta["w_vars"] ** 2

        if verbose:
            for quantity in (
                lscales,
                intercept_prior_var,
                w_prior_var,
                theta["w_prior_mean"],
                theta["intercept_prior_mean"],
            ):
                print(quantity)

        Ls = create_ls(theta["L_elts"], n_inducing, n_latent)

        kern_funs = get_kernel_funs(
            kernel_fun,
            lscales,
            total_variance=tf.constant(total_kernel_variance, dtype=tf.float32),
        )

        kl = compute_kl_term(
            theta["mu"],
            Ls,
            kern_funs,
            theta["Z"],
            theta["w_means"],
            w_vars,
            theta["w_prior_mean"],
            w_prior_var,
            theta["intercept_means"],
            intercept_vars,
            theta["intercept_prior_mean"],
            intercept_prior_var,
        )

        lik = compute_likelihood_term(
            X,
            y,
            theta["Z"],
            theta["mu"],
            Ls,
            kern_funs,
            theta["w_means"],
            w_vars,
            theta["intercept_means"],
            intercept_vars,
        )

        log_prior = (
            tf.reduce_sum(lscale_prior.log_prob(lscales))
            + bias_var_prior.log_prob(intercept_prior_var)
            + tf.reduce_sum(w_var_prior.log_prob(w_prior_var))
            + bias_m_prior.log_prob(theta["intercept_prior_mean"])
            + tf.reduce_sum(w_m_prior.log_prob(theta["w_prior_mean"]))
        )

        objective = -(lik - kl)
        objective = objective - log_prior

    grad = tape.gradient(objective, x_tf)

    if verbose:
        print(objective, np.linalg.norm(grad.numpy()))

    objective_out = objective.numpy().astype(np.float64)
    gradient_out = grad.numpy().astype(np.float64)
    return (objective_out, gradient_out)
def fit(
    X: np.ndarray,
    y: np.ndarray,
    n_inducing: int = 100,
    n_latent: int = 10,
    kernel: str = "matern_3/2",
    # Gamma priors (note tfp uses "concentration rate" parameterisation):
    kernel_lengthscale_prior: Tuple[float, float] = (3, 1 / 3),
    bias_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    w_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    # Normal priors
    w_mean_prior: Tuple[float, float] = (0, 1),
    bias_mean_prior: Tuple[float, float] = (0, 1),
    random_seed: int = 2,
    test_run: bool = False,
    total_kernel_variance=6.0,
    verbose=False,
) -> MOGPResult:
    """Fit the multi-output model with L-BFGS-B and return a ``MOGPResult``.

    All parameters are flattened into one float32 vector and optimised with
    ``scipy.optimize.minimize`` using TensorFlow gradients. Length scales
    and variance-type entries are stored unsquared and squared on use.

    Args:
        X: Covariates; ``X.shape[1]`` is the number of covariates. The input
            must already be scaled.
        y: Outcomes; ``y.shape[1]`` is the number of outputs.
        n_inducing: Number of inducing points.
        n_latent: Number of latent processes.
        kernel: Key into ``kern_lookup``.
        kernel_lengthscale_prior: Arguments for the Gamma prior on the
            length scales.
        bias_variance_prior: Arguments for the Gamma prior on the intercept
            prior variance.
        w_variance_prior: Arguments for the Gamma prior on the weight prior
            variances.
        w_mean_prior: Arguments for the Normal prior on the weight prior
            means.
        bias_mean_prior: Arguments for the Normal prior on the intercept
            prior mean.
        random_seed: Seed passed to ``np.random.seed``.
        test_run: If true, ``tol=1`` is passed to the optimiser.
        total_kernel_variance: Forwarded to ``get_kernel_funs`` as
            ``total_variance`` and stored on the result.
        verbose: If true, print intermediate quantities on each evaluation.

    Returns:
        The fitted ``MOGPResult``.
    """
    np.random.seed(random_seed)

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov, n_out = X.shape[1], y.shape[1]

    # ---- Initial values -------------------------------------------------
    start_lengthscales = np.random.uniform(2.0, 4.0, size=(n_latent, n_cov))
    start_lengthscales = start_lengthscales.astype(np.float32)

    # One copy of the starting inducing points per latent process.
    Z = np.tile(find_starting_z(X, n_inducing), (n_latent, 1, 1))
    Z = Z.astype(np.float32)

    start_kernel_funs = get_kernel_funs(
        kernel_fun,
        tf.constant(start_lengthscales),
        total_variance=tf.constant(total_kernel_variance),
    )

    per_latent_ls = []
    for cur_z, cur_kernel_fun in zip(Z, start_kernel_funs):
        per_latent_ls.append(
            get_initial_values_from_kernel(tf.constant(cur_z), cur_kernel_fun)
        )
    init_Ls = np.stack(per_latent_ls)

    # NOTE: the key order fixes the layout of the flat parameter vector.
    init_theta = {
        "L_elts": init_Ls,
        "mu": np.zeros((n_latent, n_inducing)),
        "w_prior_var": np.ones((n_latent, 1)) * 1.0,
        "w_prior_mean": np.zeros((n_latent, 1)),
        "intercept_means": np.zeros(n_out),
        "intercept_vars": np.ones(n_out),
        "intercept_prior_var": np.array(0.4),
        "intercept_prior_mean": np.array(0.0),
        "w_means": np.random.randn(n_latent, n_out) * 0.01,
        "w_vars": np.ones((n_latent, n_out)),
        "lscales": np.sqrt(start_lengthscales),
        "Z": Z,
    }

    # Make same type
    init_theta = {
        name: tf.constant(value.astype(np.float32))
        for name, value in init_theta.items()
    }

    flat_theta, summary = flatten_and_summarise_tf(**init_theta)

    X = tf.constant(X.astype(np.float32))
    y = tf.constant(y.astype(np.float32))

    # ---- Priors ---------------------------------------------------------
    gamma, normal = tfp.distributions.Gamma, tfp.distributions.Normal
    lscale_prior = gamma(*kernel_lengthscale_prior)
    bias_var_prior = gamma(*bias_variance_prior)
    w_var_prior = gamma(*w_variance_prior)
    w_m_prior = normal(*w_mean_prior)
    bias_m_prior = normal(*bias_mean_prior)

    # TODO: Think about priors for W?

    def to_minimize_with_grad(x):
        """Return the objective and gradient as float64 NumPy values."""
        x_tf = tf.cast(tf.constant(x), tf.float32)

        with tf.GradientTape() as tape:
            tape.watch(x_tf)
            theta = reconstruct_tf(x_tf, summary)

            # Squared entries are non-negative by construction.
            lscales = theta["lscales"] ** 2
            w_prior_var = theta["w_prior_var"] ** 2
            intercept_vars = theta["intercept_vars"] ** 2
            intercept_prior_var = theta["intercept_prior_var"] ** 2
            w_vars = theta["w_vars"] ** 2

            if verbose:
                for quantity in (
                    lscales,
                    intercept_prior_var,
                    w_prior_var,
                    theta["w_prior_mean"],
                    theta["intercept_prior_mean"],
                ):
                    print(quantity)

            Ls = create_ls(theta["L_elts"], n_inducing, n_latent)

            kern_funs = get_kernel_funs(
                kernel_fun,
                lscales,
                total_variance=tf.constant(
                    total_kernel_variance, dtype=tf.float32
                ),
            )

            kl = compute_kl_term(
                theta["mu"],
                Ls,
                kern_funs,
                theta["Z"],
                theta["w_means"],
                w_vars,
                theta["w_prior_mean"],
                w_prior_var,
                theta["intercept_means"],
                intercept_vars,
                theta["intercept_prior_mean"],
                intercept_prior_var,
            )

            lik = compute_likelihood_term(
                X,
                y,
                theta["Z"],
                theta["mu"],
                Ls,
                kern_funs,
                theta["w_means"],
                w_vars,
                theta["intercept_means"],
                intercept_vars,
            )

            log_prior = (
                tf.reduce_sum(lscale_prior.log_prob(lscales))
                + bias_var_prior.log_prob(intercept_prior_var)
                + tf.reduce_sum(w_var_prior.log_prob(w_prior_var))
                + bias_m_prior.log_prob(theta["intercept_prior_mean"])
                + tf.reduce_sum(w_m_prior.log_prob(theta["w_prior_mean"]))
            )

            objective = -(lik - kl)
            objective = objective - log_prior

        grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (
            objective.numpy().astype(np.float64),
            grad.numpy().astype(np.float64),
        )

    # A loose tolerance makes test runs stop early.
    additional_args = {"tol": 1} if test_run else {}

    result = minimize(
        to_minimize_with_grad,
        flat_theta,
        jac=True,
        method="L-BFGS-B",
        **additional_args
    )

    final_theta = {
        name: tf.cast(value, tf.float32)
        for name, value in reconstruct_tf(result.x, summary).items()
    }

    # Build the results
    return MOGPResult(
        L_elts=final_theta["L_elts"],
        mu=final_theta["mu"],
        kernel=kernel,
        lengthscales=final_theta["lscales"] ** 2,
        intercept_means=final_theta["intercept_means"],
        intercept_vars=final_theta["intercept_vars"] ** 2,
        w_means=final_theta["w_means"],
        w_vars=final_theta["w_vars"] ** 2,
        Z=final_theta["Z"],
        w_prior_means=final_theta["w_prior_mean"],
        w_prior_vars=final_theta["w_prior_var"] ** 2,
        intercept_prior_mean=final_theta["intercept_prior_mean"],
        intercept_prior_var=final_theta["intercept_prior_var"] ** 2,
        total_kernel_variance=tf.constant(total_kernel_variance, tf.float32),
    )
def to_minimize_with_grad(x):
    """Return ``to_minimize(x)`` and its gradient as float64 NumPy values.

    The objective is printed on every call.
    """
    theta_var = tf.Variable(x, dtype=tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(theta_var)
        cur_objective = to_minimize(theta_var)

    grad = tape.gradient(cur_objective, theta_var)
    print(cur_objective)

    objective_out = cur_objective.numpy().astype(np.float64)
    gradient_out = grad.numpy().astype(np.float64)
    return (objective_out, gradient_out)


# Run the optimiser from the starting vector, then save the fitted
# parameters together with the settings needed to reuse them.
result = minimize(
    to_minimize_with_grad,
    start_theta,
    jac=True,
    method='L-BFGS-B',
    **extra_args,
)

final_theta = reconstruct_tf(result.x, summary)
fitted_arrays = {name: tensor.numpy() for name, tensor in final_theta.items()}

np.savez(
    'final_theta_full_fix_alpha_test',
    Z=Z,
    alphas=alphas.numpy(),
    species_subset=species,
    scaler_mean=scaler.mean_,
    scaler_scale=scaler.scale_,
    n_inducing=n_inducing,
    n_latent=n_latent,
    n_latent_site=n_latent_site,
    **fitted_arrays,
)
def fit(
    X: np.ndarray,
    y: np.ndarray,
    n_inducing: int = 100,
    kernel: str = "matern_3/2",
    # Gamma priors (note tfp uses "concentration rate" parameterisation):
    kernel_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    kernel_lengthscale_prior: Tuple[float, float] = (3, 1 / 3),
    bias_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    random_seed: int = 2,
    verbose: bool = False,
) -> SOGPResult:
    """Fit the single-output model with L-BFGS-B and return a ``SOGPResult``.

    Parameters are flattened into one vector and optimised with
    ``scipy.optimize.minimize`` using TensorFlow gradients. ``alpha``,
    ``lscales`` and ``bias_sd`` are stored unsquared and squared on use.

    Args:
        X: Covariates; ``X.shape[1]`` is the number of covariates. The input
            must already be scaled.
        y: Outcomes.
        n_inducing: Number of inducing points.
        kernel: One of ``"matern_3/2"``, ``"matern_1/2"`` or ``"rbf"``.
        kernel_variance_prior: Arguments for the Gamma prior evaluated at
            ``alpha ** 2``.
        kernel_lengthscale_prior: Arguments for the Gamma prior on the
            length scales.
        bias_variance_prior: Arguments for the Gamma prior evaluated at
            ``bias_sd ** 2``.
        random_seed: Seed passed to ``np.random.seed``.
        verbose: If true, print the objective and gradient norm on each
            evaluation.

    Returns:
        The fitted ``SOGPResult``.

    Raises:
        AssertionError: If ``kernel`` is not one of the supported names.
    """
    np.random.seed(random_seed)

    supported_kernels = [
        "matern_3/2",
        "matern_1/2",
        "rbf",
    ]
    assert kernel in supported_kernels, (
        "Only these three kernels are currently supported!"
    )

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]
    n_cov = X.shape[1]

    # ---- Initial values -------------------------------------------------
    start_alpha = np.array(1.0, dtype=np.float32)
    start_lengthscales = np.random.uniform(2.0, 4.0, size=n_cov)
    start_lengthscales = start_lengthscales.astype(np.float32)
    start_bias_sd = np.array(1.0, dtype=np.float32)

    Z = find_starting_z(X, n_inducing).astype(np.float32)

    start_kernel_fun = get_kernel_fun(
        kernel_fun, start_alpha, start_lengthscales, start_bias_sd
    )

    # NOTE: the key order fixes the layout of the flat parameter vector.
    init_theta = {
        "L_elts": get_initial_values_from_kernel(Z, start_kernel_fun),
        "mu": np.zeros(n_inducing, dtype=np.float32),
        "alpha": start_alpha,
        "lscales": np.sqrt(start_lengthscales),
        "Z": Z,
        "bias_sd": start_bias_sd,
    }

    flat_theta, summary = flatten_and_summarise_tf(**init_theta)

    X = tf.constant(X.astype(np.float32))
    y = tf.constant(y.astype(np.float32))

    gamma = tfp.distributions.Gamma
    lscale_prior = gamma(*kernel_lengthscale_prior)
    kernel_var_prior = gamma(*kernel_variance_prior)
    bias_var_prior = gamma(*bias_variance_prior)

    def to_minimize_with_grad(x):
        """Return the objective and gradient as float64 NumPy values."""
        x_tf = tf.cast(tf.constant(x), tf.float32)

        with tf.GradientTape() as tape:
            tape.watch(x_tf)
            theta = reconstruct_tf(x_tf, summary)

            alpha = theta["alpha"]**2
            lscales = theta["lscales"]**2
            bias_sd = theta["bias_sd"]**2

            L_cov = lo_tri_from_elements(theta["L_elts"], n_inducing)
            kern_fun = get_kernel_fun(kernel_fun, alpha, lscales, bias_sd)

            objective = -compute_objective(
                X, y, theta["mu"], L_cov, theta["Z"], bernoulli_probit_lik,
                kern_fun)

            log_prior = (tf.reduce_sum(lscale_prior.log_prob(lscales))
                         + kernel_var_prior.log_prob(alpha**2)
                         + bias_var_prior.log_prob(bias_sd**2))
            objective = objective - log_prior

        grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    result = minimize(
        to_minimize_with_grad, flat_theta, jac=True, method="L-BFGS-B"
    )

    final_theta = {
        name: tensor.numpy().astype(np.float32)
        for name, tensor in reconstruct_tf(result.x, summary).items()
    }

    # Build the results
    return SOGPResult(
        L_elts=final_theta["L_elts"],
        mu=final_theta["mu"],
        kernel=kernel,
        lengthscales=final_theta["lscales"]**2,
        alpha=final_theta["alpha"]**2,
        bias_sd=final_theta["bias_sd"]**2,
        Z=final_theta["Z"],
    )
def fit_minibatching(
    X: np.ndarray,
    z: np.ndarray,
    weights: np.ndarray,
    n_inducing: int,
    thinning_indices: Optional[np.ndarray] = np.array([]),
    fit_inducing_using_presences_only: bool = False,
    verbose: bool = True,
    log_theta_dir: Optional[str] = None,
    use_berman_turner: bool = False,
    batch_size: int = 1000,
    learning_rate: float = 0.01,
    n_steps: int = 1000,
    sqrt_decay_learning_rate: bool = True,
):
    """Fit the model with minibatched Adam steps and return the final spec.

    Args:
        X: Covariates; ``X.shape[1]`` is the number of covariates.
        z: Response values; rows with ``z > 0`` are the ones used to place
            inducing points when ``fit_inducing_using_presences_only`` is set.
        weights: Per-row weights.
        n_inducing: Number of inducing points.
        thinning_indices: Forwarded to ``initialise_theta``.
        fit_inducing_using_presences_only: Restrict the rows used by
            ``find_starting_z`` to those with ``z > 0``.
        verbose: Forwarded to ``to_optimise``.
        log_theta_dir: Directory for per-step logs and ``loss.txt``. When
            None, per-step logging is disabled and ``loss.txt`` is written
            to a fresh temporary directory instead.
        use_berman_turner: Forwarded to ``to_optimise``.
        batch_size: Minibatch size.
        learning_rate: Base step size.
        n_steps: Number of optimisation steps.
        sqrt_decay_learning_rate: If true, the step size is
            ``learning_rate / sqrt(t)``; otherwise it is constant.

    Returns:
        The second element of ``update_specs`` evaluated at the final
        parameters.
    """
    global STEP
    STEP = 0

    if log_theta_dir is None:
        # Previously makedirs(None) / join(None, ...) raised a TypeError for
        # the declared default. The loss log still needs a real path, so it
        # goes to a temporary directory.
        import tempfile

        loss_log_dir = tempfile.mkdtemp(prefix="fit_minibatching_")
    else:
        makedirs(log_theta_dir, exist_ok=True)
        loss_log_dir = log_theta_dir

    n_cov = X.shape[1]

    if fit_inducing_using_presences_only:
        X_to_cluster = X[z > 0, :]
    else:
        X_to_cluster = X

    init_Z = find_starting_z(X_to_cluster, n_inducing).astype(np.float32)

    start_theta, init_kernel_spec = initialise_theta(n_cov, thinning_indices,
                                                     init_Z)

    flat_theta, summary = flatten_and_summarise_tf(**start_theta)

    data_dict = {"X": X, "z": z, "weights": weights}
    data_dict = {x: y.astype(np.float32) for x, y in data_dict.items()}

    if sqrt_decay_learning_rate:
        # Decay with sqrt of time.
        # assumes the step counter t starts at 1 — TODO confirm
        step_size_fun = lambda t: learning_rate * (1 / np.sqrt(t))  # NOQA
    else:
        # Constant learning rate
        step_size_fun = lambda t: learning_rate  # NOQA

    # X, z and weights are not bound here; presumably optimise_minibatching
    # supplies them per batch from data_dict — verify against that helper.
    opt_fun = partial(
        to_optimise,
        use_berman_turner=use_berman_turner,
        summary=summary,
        init_kernel_spec=init_kernel_spec,
        log_theta_dir=log_theta_dir,
        verbose=verbose,
        likelihood_scale_factor=X.shape[0] / batch_size,
    )

    adam_state = initialise_state(flat_theta.shape[0])
    adam_fun = partial(adam_step, step_size_fun=step_size_fun)

    result, loss_log = optimise_minibatching(
        data_dict,
        opt_fun,
        adam_fun,
        flat_theta,
        batch_size,
        n_steps,
        X.shape[0],
        join(loss_log_dir, "loss.txt"),
        False,
        adam_state,
    )

    final_flat_theta = result.numpy().astype(np.float32)
    final_theta = reconstruct_tf(final_flat_theta, summary)

    _, final_spec = update_specs(final_theta, init_kernel_spec)

    return final_spec
def fit(
    X: np.ndarray,
    z: np.ndarray,
    weights: np.ndarray,
    n_inducing: int,
    thinning_indices: Optional[np.ndarray] = np.array([]),
    fit_inducing_using_presences_only: bool = False,
    verbose: bool = True,
    log_theta_dir: Optional[str] = None,
    use_berman_turner: bool = True,
    test_run: bool = False,
):
    """Fit the model on the full data with L-BFGS-B and return the final spec.

    Args:
        X: Covariates; ``X.shape[1]`` is the number of covariates.
        z: Response values; rows with ``z > 0`` are the ones used to place
            inducing points when ``fit_inducing_using_presences_only`` is set.
        weights: Per-row weights.
        n_inducing: Number of inducing points.
        thinning_indices: Forwarded to ``initialise_theta``.
        fit_inducing_using_presences_only: Restrict the rows used by
            ``find_starting_z`` to those with ``z > 0``.
        verbose: Forwarded to ``to_optimise``.
        log_theta_dir: Forwarded to ``to_optimise``.
        use_berman_turner: Forwarded to ``to_optimise``.
        test_run: If true, ``tol=1`` is passed to the optimiser.

    Returns:
        The second element of ``update_specs`` evaluated at the final
        parameters.
    """
    global STEP
    STEP = 0

    n_cov = X.shape[1]

    X_to_cluster = X[z > 0, :] if fit_inducing_using_presences_only else X
    init_Z = find_starting_z(X_to_cluster, n_inducing).astype(np.float32)

    start_theta, init_kernel_spec = initialise_theta(
        n_cov, thinning_indices, init_Z
    )

    # Prepare the tensors
    X, z, weights = (
        tf.cast(tf.constant(array), tf.float32) for array in (X, z, weights)
    )

    flat_theta, summary = flatten_and_summarise_tf(**start_theta)

    opt_fun = partial(
        to_optimise,
        X=X,
        z=z,
        weights=weights,
        use_berman_turner=use_berman_turner,
        summary=summary,
        init_kernel_spec=init_kernel_spec,
        log_theta_dir=log_theta_dir,
        verbose=verbose,
        likelihood_scale_factor=1.0,
    )

    # A loose tolerance makes test runs stop early.
    optimiser_options = {"tol": 1} if test_run else {}

    result = minimize(
        opt_fun,
        flat_theta.numpy().astype(np.float64),
        method="L-BFGS-B",
        jac=True,
        **optimiser_options,
    )

    final_theta = reconstruct_tf(result.x.astype(np.float32), summary)
    _, final_spec = update_specs(final_theta, init_kernel_spec)

    return final_spec