def save_theta_and_grad_callback(step: int, loss: float, theta: np.ndarray,
                                 grad: np.ndarray, opt_state: Any,
                                 target_dir: str, summary: Any,
                                 save_every: int,
                                 additional_vars: Dict[str, np.ndarray] = {}):

    os.makedirs(target_dir, exist_ok=True)

    if step % save_every == 0:

        # Reconstruct
        theta_dict = reconstruct_np(theta, summary)
        grad_dict = reconstruct_np(grad, summary)

        theta_dict['loss'] = loss
        theta_dict['step'] = step
        grad_dict['step'] = step

        # Add any additional variables passed in
        theta_dict.update(additional_vars)

        # Save
        theta_target = os.path.join(target_dir, f'theta_{step}.npz')
        grad_target = os.path.join(target_dir, f'grad_{step}.npz')

        np.savez(theta_target, **theta_dict)
        np.savez(grad_target, **grad_dict)
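
# Illustrative helper (not part of the original source): checkpoints written by
# save_theta_and_grad_callback are plain .npz archives, so they can be loaded back
# into a dict for inspection. The function name and arguments are assumptions made
# for this sketch.
def _inspect_checkpoint_example(target_dir: str, step: int):
    ckpt = dict(np.load(os.path.join(target_dir, f'theta_{step}.npz')))
    print(ckpt['step'], ckpt['loss'])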
def to_optimise(
    flat_theta,
    X,
    z,
    weights,
    use_berman_turner,
    summary,
    init_kernel_spec,
    log_theta_dir=None,
    verbose=True,
    likelihood_scale_factor=1.0,
):

    global STEP

    flat_theta = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:

        tape.watch(flat_theta)

        theta = reconstruct_tf(flat_theta, summary)
        kernel_spec, gp_spec = update_specs(theta, init_kernel_spec)

        cur_objective = -calculate_objective(
            X, z, weights, gp_spec, use_berman_turner=use_berman_turner)

        kernel_prior_prob = calculate_prior_prob(kernel_spec)
        cur_objective = cur_objective - kernel_prior_prob

    cur_grad = tape.gradient(cur_objective, flat_theta)

    if log_theta_dir is not None:
        makedirs(log_theta_dir, exist_ok=True)

        grads = reconstruct_np(cur_grad.numpy(), summary)
        theta = reconstruct_np(flat_theta.numpy(), summary)

        np.savez(
            join(log_theta_dir, f"grads_{STEP}"),
            **grads,
            objective=cur_objective.numpy(),
            step=STEP,
        )
        np.savez(
            join(log_theta_dir, f"theta_{STEP}"),
            **theta,
            objective=cur_objective.numpy(),
            step=STEP,
        )

        STEP += 1

    if verbose:
        print(cur_objective, np.linalg.norm(cur_grad.numpy()))

    return (
        cur_objective.numpy().astype(np.float64),
        cur_grad.numpy().astype(np.float64),
    )
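
# Illustrative sketch (not part of the original source): this function returns the
# (objective, gradient) pair as float64, which is the format scipy.optimize.minimize
# expects when jac=True. The data arguments would typically be bound with
# functools.partial first, as in the L-BFGS-B call further below. The wrapper name
# and its arguments are assumptions made for this sketch.
def _fit_with_lbfgs_example(flat_theta, X, z, weights, summary, init_kernel_spec):
    from functools import partial
    from scipy.optimize import minimize

    fun = partial(to_optimise,
                  X=X,
                  z=z,
                  weights=weights,
                  use_berman_turner=True,
                  summary=summary,
                  init_kernel_spec=init_kernel_spec)

    return minimize(fun,
                    flat_theta.numpy().astype(np.float64),
                    method='L-BFGS-B',
                    jac=True)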
def test_end_to_end():

    input_arrays = generate_arrays()

    flat_array, summaries = flatten_and_summarise(**input_arrays)
    reconstructed = reconstruct_np(flat_array, summaries)

    assert all([
        np.array_equal(reconstructed[x], input_arrays[x]) for x in input_arrays
    ])
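
# Minimal sketch (assumption, not the repository's actual implementation) of the
# flatten/reconstruct pattern the test above exercises: concatenate a dict of arrays
# into one flat vector plus a summary recording each array's shape, then invert it.
def flatten_and_summarise_sketch(**arrays):
    from collections import OrderedDict
    summaries = OrderedDict((name, arr.shape) for name, arr in arrays.items())
    flat = np.concatenate([arr.ravel() for arr in arrays.values()])
    return flat, summaries


def reconstruct_np_sketch(flat, summaries):
    result, offset = {}, 0
    for name, shape in summaries.items():
        size = int(np.prod(shape))
        result[name] = flat[offset:offset + size].reshape(shape)
        offset += size
    return result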
def fit(X: np.ndarray,
        z: np.ndarray,
        weights: np.ndarray,
        sp_num: np.ndarray,
        n_inducing: int,
        n_latent: int,
        log_folder: str,
        use_berman_turner: bool = True,
        X_thin: Optional[np.ndarray] = None,
        n_thin_inducing: Optional[int] = None,
        learning_rate: float = 0.01,
        steps: int = 100000,
        batch_size: int = 50000,
        save_opt_state: bool = False,
        save_every: Optional[int] = 1000,
        fix_thin_inducing: bool = False,
        cov_alpha: Optional[float] = None,
        thin_alpha: Optional[float] = 1.,
        fix_zero_w_prior_mean: bool = True,
        separate_w_prior_vars: bool = True):

    n_cov = X.shape[1]
    n_data = X.shape[0]
    n_out = len(np.unique(sp_num))

    Z = find_starting_z(X[(z == 0) & (sp_num == np.unique(sp_num)[0])],
                        n_inducing)

    if X_thin is not None:
        # Make sure we were told how many thinning inducing points to use
        assert n_thin_inducing is not None
        Z_thin = find_starting_z(
            X_thin[(z == 0) & (sp_num == np.unique(sp_num)[0])],
            n_thin_inducing)
    else:
        Z_thin = None

    log_cov_alpha = np.log(cov_alpha) if cov_alpha is not None else tf.cast(
        tf.constant(np.log(np.sqrt(2. / n_latent))), tf.float32)
    log_thin_alpha = np.log(thin_alpha)

    start_theta = initialise_theta(Z,
                                   n_latent,
                                   n_cov,
                                   n_out,
                                   Z_thin=Z_thin,
                                   log_cov_alpha=log_cov_alpha,
                                   log_thin_alpha=log_thin_alpha,
                                   separate_w_prior_vars=separate_w_prior_vars)

    if fix_thin_inducing:
        # Remove the thinning inducing points from the theta dict of parameters
        # to optimise
        start_theta = {x: y for x, y in start_theta.items() if x != 'thin_Zs'}

    if fix_zero_w_prior_mean:
        # Remove the w prior mean from the theta dict of parameters to optimise
        start_theta = {
            x: y
            for x, y in start_theta.items() if x != 'w_prior_mean'
        }

    flat_theta, summary = flatten_and_summarise_tf(**start_theta)

    log_folder = os.path.join(
        log_folder,
        create_path_with_variables(lr=learning_rate,
                                   batch_size=batch_size,
                                   steps=steps))

    os.makedirs(log_folder, exist_ok=True)

    opt_step_fun = partial(adam_step, step_size_fun=lambda t: learning_rate)
    opt_state = initialise_state(flat_theta.shape[0])

    flat_theta = flat_theta.numpy()

    to_optimise = partial(objective_and_grad,
                          n_data=n_data,
                          n_latent=n_latent,
                          summary=summary,
                          use_berman_turner=use_berman_turner,
                          log_cov_alpha=log_cov_alpha)

    if fix_thin_inducing:
        to_optimise = partial(to_optimise,
                              thin_Zs=tf.constant(
                                  np.expand_dims(Z_thin.astype(np.float32),
                                                 axis=0)))

    n_w_means = n_out if separate_w_prior_vars else 1

    if fix_zero_w_prior_mean:
        to_optimise = partial(to_optimise,
                              w_prior_mean=tf.zeros((n_w_means, n_latent)))

    full_data = {'X': X, 'sp_num': sp_num, 'z': z, 'weights': weights}

    log_file = os.path.join(log_folder, 'losses.txt')

    if X_thin is not None:
        full_data['X_thin'] = X_thin
    else:
        to_optimise = partial(to_optimise, X_thin=None)

    loss_log_file = open(log_file, 'w')

    additional_vars = {}

    if fix_thin_inducing:
        # Store the fixed thinning inducing points so the callback saves them too
        additional_vars['thin_Zs'] = np.expand_dims(Z_thin, axis=0)

    if fix_zero_w_prior_mean:
        additional_vars['w_prior_mean'] = np.zeros((n_w_means, n_latent))

    additional_vars['log_cov_alpha'] = log_cov_alpha
    additional_vars['log_thin_alpha'] = log_thin_alpha

    def opt_callback(step: int, loss: float, theta: np.ndarray,
                     grad: np.ndarray, opt_state: Any):

        # Save theta and the gradients
        save_theta_and_grad_callback(step,
                                     loss,
                                     theta,
                                     grad,
                                     opt_state,
                                     log_folder,
                                     summary,
                                     save_every,
                                     additional_vars=additional_vars)

        # Log the loss
        loss_log_callback(step, loss, theta, grad, opt_state, loss_log_file)

    flat_theta, loss_log, _ = optimise_minibatching(full_data,
                                                    to_optimise,
                                                    opt_step_fun,
                                                    opt_state,
                                                    flat_theta,
                                                    batch_size,
                                                    steps,
                                                    X.shape[0],
                                                    callback=opt_callback)

    # Cast to float32
    flat_theta = flat_theta.astype(np.float32)

    final_theta = reconstruct_np(flat_theta, summary)

    if fix_thin_inducing:
        final_theta['thin_Zs'] = np.expand_dims(Z_thin, axis=0)

    if fix_zero_w_prior_mean:
        # Use n_w_means so the returned value matches the fixed value bound into
        # the objective and saved in the checkpoints
        final_theta['w_prior_mean'] = np.zeros((n_w_means, n_latent))

    final_theta['log_cov_alpha'] = log_cov_alpha
    final_theta['log_thin_alpha'] = log_thin_alpha

    return final_theta
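
# Illustrative call (assumption, not from the original source): synthetic inputs with
# the shapes fit() appears to expect -- X is (n_points, n_cov), z and weights are
# per-point vectors, and sp_num gives the integer species index per row. All values
# and the function name below are made up for this sketch.
def _fit_example():
    rng = np.random.default_rng(0)
    n_points, n_cov = 2000, 4

    X = rng.normal(size=(n_points, n_cov)).astype(np.float32)
    z = (rng.uniform(size=n_points) < 0.1).astype(np.float32)
    weights = rng.uniform(0.1, 1.0, size=n_points).astype(np.float32)
    sp_num = rng.integers(0, 3, size=n_points)

    return fit(X, z, weights, sp_num,
               n_inducing=100,
               n_latent=8,
               log_folder='./fit_logs',
               steps=1000,
               batch_size=500)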
def to_optimise(flat_theta):

    flat_theta = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:

        tape.watch(flat_theta)

        theta = reconstruct_tf(flat_theta, summary)

        obj = -compute_objective(n=n,
                                 p=p,
                                 n_surfaces=n_surfaces,
                                 server_ids=server_ids,
                                 returner_ids=returner_ids,
                                 surf_ids=surf_ids,
                                 **theta)

    grad = tape.gradient(obj, flat_theta)

    print(obj, np.linalg.norm(grad.numpy()))
    print(np.round(covar_to_corr(pos_def_mat_from_vector(
        theta['elts_prior_serve'], n_surfaces)), 2))

    return obj.numpy().astype(np.float64), grad.numpy().astype(np.float64)


result = minimize(to_optimise,
                  flat_theta.numpy().astype(np.float64),
                  method='L-BFGS-B',
                  jac=True)

np.savez('surface_model_1990',
         players=encoder.classes_,
         surfaces=surf_enc.classes_,
         **reconstruct_np(result.x, summary))