Example 1
def extract_parameters(theta, summaries, n_inducing, n_latent, same_z=False):
    # Picks out the parameters and constrains some to be positive

    theta_dict = reconstruct_tf(theta, summaries)

    ms = theta_dict['m']
    Ls = create_ls(theta_dict['L_elts'], n_inducing, n_latent)

    w_means = theta_dict['W_means']
    w_vars = theta_dict['W_sds']**2

    if same_z:
        Z = rep_matrix(theta_dict['Z'], n_latent)
    else:
        Z = theta_dict['Z']

    w_prior_means = theta_dict['W_prior_mean']
    w_prior_vars = theta_dict['W_prior_sd']**2

    print(tf.reshape(w_prior_means, (-1, )))
    print(tf.reshape(w_prior_vars, (-1, )))

    kern_params = theta_dict['kernel_params']**2

    intercept = tf.squeeze(theta_dict['intercept'])

    return (ms, Ls, w_means, w_vars, Z, kern_params, w_prior_means,
            w_prior_vars, intercept)
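A note on the repeated squaring above (W_sds**2, kernel_params**2, and similar throughout these examples): it is the trick used to keep variances and kernel parameters positive while the optimiser works on unconstrained values. A standalone sketch of that reparameterisation, with illustrative names (not part of the code above):

import tensorflow as tf

# Unconstrained parameter: free to take any real value during optimisation.
raw_sd = tf.Variable(-0.7)

with tf.GradientTape() as tape:
    variance = raw_sd ** 2           # always >= 0, so safe to use as a variance
    loss = (variance - 2.0) ** 2     # toy objective targeting a variance of 2

grad = tape.gradient(loss, raw_sd)   # gradients flow through the square
print(variance.numpy(), grad.numpy())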
Example 2
def to_minimize(x):

    theta = reconstruct_tf(x, summary)

    # TODO: Check initial values are still consistent here
    kerns = [partial(matern_kernel_32, alpha=alpha, lengthscales=lscale,
                     jitter=JITTER) for
             alpha, lscale in zip(alphas, theta['lscales']**2)]

    site_ls = create_ls(theta['site_l_elts'], n_latent_site, n_sites)
    env_ls = create_ls(theta['env_l_elts'], n_inducing, n_latent)

    objective = compute_objective(
        X, y, Z, theta['env_ms'], env_ls, kerns, theta['w_means'],
        theta['w_vars']**2, w_prior_mean, w_prior_var, site_prior_mean,
        site_prior_cov, theta['site_means'], site_ls, theta['b_mat'])

    # TODO: add a prior term to the objective here, e.g.:
    # objective = objective + <log prior on the parameters>

    cur_corr_mat = tf.transpose(theta['b_mat']) @ theta['b_mat']
    print(np.round(covar_to_corr(cur_corr_mat.numpy()), 2))

    return -objective
Example 3
def to_optimise(
    flat_theta,
    X,
    z,
    weights,
    use_berman_turner,
    summary,
    init_kernel_spec,
    log_theta_dir=None,
    verbose=True,
    likelihood_scale_factor=1.0,
):

    global STEP

    flat_theta = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:

        tape.watch(flat_theta)

        theta = reconstruct_tf(flat_theta, summary)

        kernel_spec, gp_spec = update_specs(theta, init_kernel_spec)

        cur_objective = -calculate_objective(
            X, z, weights, gp_spec, use_berman_turner=use_berman_turner)

        kernel_prior_prob = calculate_prior_prob(kernel_spec)
        cur_objective = cur_objective - kernel_prior_prob

        cur_grad = tape.gradient(cur_objective, flat_theta)

        if log_theta_dir is not None:
            makedirs(log_theta_dir, exist_ok=True)
            grads = reconstruct_np(cur_grad.numpy(), summary)
            theta = reconstruct_np(flat_theta.numpy(), summary)
            np.savez(
                join(log_theta_dir, f"grads_{STEP}"),
                **grads,
                objective=cur_objective.numpy(),
                step=STEP,
            )
            np.savez(
                join(log_theta_dir, f"theta_{STEP}"),
                **theta,
                objective=cur_objective.numpy(),
                step=STEP,
            )

        STEP += 1

        if verbose:
            print(cur_objective, np.linalg.norm(cur_grad.numpy()))

    return (
        cur_objective.numpy().astype(np.float64),
        cur_grad.numpy().astype(np.float64),
    )
Example 4
    def to_minimize(flat_theta):

        flat_theta = tf.constant(flat_theta)
        flat_theta = tf.cast(flat_theta, tf.float32)

        with tf.GradientTape() as tape:

            tape.watch(flat_theta)

            theta = reconstruct_tf(flat_theta, summary)

            objective = -calculate_objective(theta)

            grad = tape.gradient(objective, flat_theta)

        print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))
Example 5
def test_with_tf():

    input_arrays = generate_arrays()

    # Turn into TF version
    tf_arrays = {x: tf.constant(y) for x, y in input_arrays.items()}

    flat_array, summaries = flatten_and_summarise(**input_arrays)

    reconstructed = reconstruct_tf(flat_array,
                                   summaries,
                                   reshape_fun=tf.reshape)

    checks = [
        tf.reduce_all(tf.equal(reconstructed[x], tf_arrays[x]))
        for x in input_arrays
    ]

    assert tf.reduce_all(checks).numpy()
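flatten_and_summarise and reconstruct_tf are the glue in all of these examples: they pack a dictionary of named arrays into one flat vector for the optimiser and unpack it again inside the objective. Their implementations are not shown in this collection; the following is only a minimal sketch of the idea, with illustrative names, assuming all arrays share one dtype:

import numpy as np
import tensorflow as tf

def flatten_and_summarise_sketch(**arrays):
    # Record each array's shape, then concatenate everything into one flat vector.
    summaries = {name: np.asarray(arr).shape for name, arr in arrays.items()}
    flat = tf.concat(
        [tf.reshape(tf.constant(arr), (-1,)) for arr in arrays.values()], axis=0)
    return flat, summaries

def reconstruct_sketch(flat, summaries, reshape_fun=tf.reshape):
    # Slice the flat vector back into named arrays using the stored shapes.
    out, start = {}, 0
    for name, shape in summaries.items():
        size = int(np.prod(shape))
        out[name] = reshape_fun(flat[start:start + size], shape)
        start += size
    return out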
Example 6
def to_optimise(flat_theta):

    flat_theta = tf.cast(tf.constant(flat_theta), tf.float32)

    with tf.GradientTape() as tape:

        tape.watch(flat_theta)

        theta = reconstruct_tf(flat_theta, summary)

        obj = -compute_objective(n=n, p=p, n_surfaces=n_surfaces,
                                 server_ids=server_ids,
                                 returner_ids=returner_ids, surf_ids=surf_ids,
                                 **theta)

        grad = tape.gradient(obj, flat_theta)

        print(obj, np.linalg.norm(grad.numpy()))

    print(np.round(covar_to_corr(pos_def_mat_from_vector(
        theta['elts_prior_serve'], n_surfaces)), 2))

    return obj.numpy().astype(np.float64), grad.numpy().astype(np.float64)
Example 7
    def to_minimize_with_grad(x):

        with tf.GradientTape() as tape:

            x_tf = tf.constant(x)
            x_tf = tf.cast(x_tf, tf.float32)

            tape.watch(x_tf)

            theta = reconstruct_tf(x_tf, summary)

            alpha, lscales, bias_sd = (
                theta["alpha"]**2,
                theta["lscales"]**2,
                theta["bias_sd"]**2,
            )

            L_cov = lo_tri_from_elements(theta["L_elts"], n_inducing)

            kern_fun = get_kernel_fun(kernel_fun, alpha, lscales, bias_sd)

            objective = -compute_objective(X, y, theta["mu"], L_cov,
                                           theta["Z"], bernoulli_probit_lik,
                                           kern_fun)

            objective = objective - (tf.reduce_sum(
                lscale_prior.log_prob(lscales)) + kernel_var_prior.log_prob(
                    alpha**2) + bias_var_prior.log_prob(bias_sd**2))

            grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))
Example 8
def objective_and_grad(flat_theta,
                       X,
                       X_thin,
                       sp_num,
                       z,
                       weights,
                       summary,
                       n_latent,
                       n_data,
                       use_berman_turner,
                       log_cov_alpha,
                       log_thin_alpha=0.,
                       thin_Zs=None,
                       w_prior_mean=None):

    # TODO: Make priors configurable; add docstrings.
    # Note: if thin_Zs is passed, we are not optimising their locations.
    # This is not the cleanest way of doing it, but it's the best I can think
    # of for now.

    flat_theta = tf.constant(flat_theta.astype(np.float32))

    if X_thin is not None:

        X, X_thin, z, weights = map(
            lambda x: tf.cast(tf.constant(x), tf.float32),
            [X, X_thin, z, weights])

    else:

        X, z, weights = map(lambda x: tf.cast(tf.constant(x), tf.float32),
                            [X, z, weights])

    with tf.GradientTape() as tape:

        tape.watch(flat_theta)

        theta = reconstruct_tf(flat_theta, summary)

        if thin_Zs is not None:
            theta['thin_Zs'] = thin_Zs

        # This is fixed during optimisation, so we're setting it here
        theta['log_cov_alpha'] = log_cov_alpha
        theta['log_thin_alpha'] = log_thin_alpha

        # Also fix the intercept; give it only a very weak prior.
        # TODO: Maybe make this configurable
        theta['intercept_prior_var'] = tf.math.log(tf.constant(5.)**2)
        theta['intercept_prior_mean'] = tf.constant(0.)

        if w_prior_mean is not None:
            theta['w_prior_mean'] = w_prior_mean

        spec = build_spec(theta)

        # Fix prior mean and var to start with
        obj = -calculate_objective(spec.cov_mogp_spec,
                                   X,
                                   sp_num,
                                   z,
                                   weights,
                                   lik_scale_factor=n_data / X.shape[0],
                                   thinning_mogp_spec=spec.thin_mogp_spec,
                                   X_thin=X_thin,
                                   use_berman_turner=use_berman_turner)

        # Add prior on lengthscales
        obj = obj - tf.reduce_sum(
            tfp.distributions.Gamma(3, 1 / 3).log_prob(tf.exp(
                theta['lscales'])))

        # TODO: Make these configurable
        # Add prior on prior w means and variances
        obj = obj - tf.reduce_sum(
            tfp.distributions.Normal(0., 1.).log_prob(theta['w_prior_mean']))
        obj = obj - tf.reduce_sum(
            tfp.distributions.Gamma(0.5, 0.5).log_prob(
                tf.exp(theta['w_prior_var'])))

        # Add prior on intercept mean and variance
        obj = obj - tf.reduce_sum(
            tfp.distributions.Normal(0., 1.).log_prob(
                theta['intercept_prior_mean']))
        obj = obj - tf.reduce_sum(
            tfp.distributions.Gamma(0.5, 0.5).log_prob(
                tf.exp(theta['intercept_prior_var'])))

        if X_thin is not None:
            obj = obj - tf.reduce_sum(
                tfp.distributions.Gamma(3, 1 / 3).log_prob(
                    tf.exp(theta['thin_lscales'])))
            obj = obj - tf.reduce_sum(
                tfp.distributions.Normal(0., 1.).log_prob(
                    theta['thin_w_prior_mean']))
            obj = obj - tf.reduce_sum(
                tfp.distributions.Gamma(0.5, 0.5).log_prob(
                    tf.exp(theta['thin_w_prior_var'])))

        grad = tape.gradient(obj, flat_theta)

    if np.any(np.isnan(grad.numpy())):
        # Save the current state for investigation
        np.savez('theta_bug', **{x: y.numpy() for x, y in theta.items()})
        np.savez(
            'data_bug', **{
                'X': X.numpy(),
                'X_thin': X_thin.numpy(),
                'sp_num': sp_num,
                'z': z.numpy(),
                'weights': weights.numpy(),
                'n_latent': n_latent,
                'n_data': n_data,
                'use_berman_turner': use_berman_turner,
                'thin_Zs': thin_Zs.numpy()
            })
        exit()

    return obj.numpy().astype(np.float64), grad.numpy().astype(np.float64)
Example 9
def fit(X: np.ndarray,
        y: np.ndarray,
        n_inducing: int = 100,
        n_latent: int = 10,
        kernel: str = 'matern_3/2',
        random_seed: int = 2):

    # TODO: This is copied from the mogp_classifier.
    # Maybe instead make it a function of some sort?
    np.random.seed(random_seed)

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]
    n_out = y.shape[1]

    # Set initial values
    start_lengthscales = np.random.uniform(2., 4., size=(n_latent, n_cov))

    Z = find_starting_z(X, n_inducing)
    Z = np.tile(Z, (n_latent, 1, 1))

    start_kernel_funs = get_kernel_funs(kernel_fun,
                                        np.sqrt(start_lengthscales))

    init_Ls = np.stack([
        get_initial_values_from_kernel(cur_z, cur_kernel_fun)
        for cur_z, cur_kernel_fun in zip(Z, start_kernel_funs)
    ])

    init_ms = np.zeros((n_latent, n_inducing))

    start_prior_cov = np.eye(n_latent)
    start_prior_mean = np.zeros(n_latent)
    start_prior_cov_elts = corr_mogp.get_initial_w_elements(
        start_prior_mean, start_prior_cov, n_out)

    start_w_cov_elts = rep_vector(start_prior_cov_elts, n_out)

    init_w_means = np.random.randn(n_out, n_latent)

    start_theta = {
        'mu': init_ms,
        'L_elts': init_Ls,
        'w_means': init_w_means,
        'w_cov_elts': start_w_cov_elts,
        'lengthscales': start_lengthscales,
        'w_prior_cov_elts': start_prior_cov_elts,
        'w_prior_mean': start_prior_mean,
        'Z': Z
    }

    flat_start_theta, summary = flatten_and_summarise_tf(**start_theta)

    X_tf = tf.constant(X.astype(np.float32))
    y_tf = tf.constant(y.astype(np.float32))

    def extract_cov_matrices(theta):

        w_covs = create_pos_def_mat_from_elts_batch(theta['w_cov_elts'],
                                                    n_latent,
                                                    n_out,
                                                    jitter=JITTER)

        Ls = mogp.create_ls(theta['L_elts'], n_inducing, n_latent)

        w_prior_cov = create_pos_def_mat_from_elts(theta['w_prior_cov_elts'],
                                                   n_latent,
                                                   jitter=JITTER)

        return w_covs, Ls, w_prior_cov

    def calculate_objective(theta):

        w_covs, Ls, w_prior_cov = extract_cov_matrices(theta)

        print(np.round(covar_to_corr(w_prior_cov.numpy()), 2))
        print(np.round(theta['lengthscales'].numpy()**2, 2))

        kernel_funs = get_kernel_funs(kernel_fun, theta['lengthscales']**2)

        cur_objective = corr_mogp.compute_default_objective(
            X_tf, y_tf, theta['Z'], theta['mu'], Ls, theta['w_means'], w_covs,
            kernel_funs, bernoulli_probit_lik, theta['w_prior_mean'],
            w_prior_cov)

        # Add prior
        lscale_prior = tfp.distributions.Gamma(3, 1 / 3).log_prob(
            theta['lengthscales']**2)

        return cur_objective + tf.reduce_sum(lscale_prior)

    def to_minimize(flat_theta):

        flat_theta = tf.constant(flat_theta)
        flat_theta = tf.cast(flat_theta, tf.float32)

        with tf.GradientTape() as tape:

            tape.watch(flat_theta)

            theta = reconstruct_tf(flat_theta, summary)

            objective = -calculate_objective(theta)

            grad = tape.gradient(objective, flat_theta)

        print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    result = minimize(to_minimize,
                      flat_start_theta,
                      jac=True,
                      method='L-BFGS-B')

    final_theta = reconstruct_tf(result.x.astype(np.float32), summary)

    w_covs, Ls, w_prior_cov = extract_cov_matrices(final_theta)

    return CorrelatedMOGPResult(
        Ls=Ls,
        mu=final_theta['mu'].numpy(),
        kernel=kernel,
        lengthscales=final_theta['lengthscales'].numpy()**2,
        w_means=final_theta['w_means'].numpy(),
        w_cov=w_covs.numpy(),
        Z=final_theta['Z'].numpy(),
        w_prior_means=final_theta['w_prior_mean'].numpy(),
        w_prior_cov=w_prior_cov.numpy())
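The fit functions in this collection note that the input X must already be scaled. A standalone sketch of doing that before the call, keeping the scaler so the same transform can be applied at prediction time (illustrative data and names; later snippets store scaler.mean_ and scaler.scale_ for exactly this reason):

import numpy as np
from sklearn.preprocessing import StandardScaler

X_raw = np.random.randn(200, 5) * 10.0 + 3.0            # illustrative unscaled covariates
y = (np.random.rand(200, 4) > 0.5).astype(np.float64)   # illustrative binary outputs

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)   # zero mean, unit variance per column

# result = fit(X_scaled, y, n_inducing=50, n_latent=4)
# Keep scaler.mean_ / scaler.scale_ around to scale new data the same way.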
Example 10
    def to_minimize_with_grad(x):

        with tf.GradientTape() as tape:

            x_tf = tf.constant(x)
            x_tf = tf.cast(x_tf, tf.float32)

            tape.watch(x_tf)

            theta = reconstruct_tf(x_tf, summary)

            # Square the important parameters
            (lscales, w_prior_var, intercept_vars, intercept_prior_var, w_vars) = (
                theta["lscales"] ** 2,
                theta["w_prior_var"] ** 2,
                theta["intercept_vars"] ** 2,
                theta["intercept_prior_var"] ** 2,
                theta["w_vars"] ** 2,
            )

            if verbose:
                print(lscales)
                print(intercept_prior_var)
                print(w_prior_var)
                print(theta["w_prior_mean"])
                print(theta["intercept_prior_mean"])

            Ls = create_ls(theta["L_elts"], n_inducing, n_latent)

            kern_funs = get_kernel_funs(
                kernel_fun,
                lscales,
                total_variance=tf.constant(total_kernel_variance, dtype=tf.float32),
            )

            kl = compute_kl_term(
                theta["mu"],
                Ls,
                kern_funs,
                theta["Z"],
                theta["w_means"],
                w_vars,
                theta["w_prior_mean"],
                w_prior_var,
                theta["intercept_means"],
                intercept_vars,
                theta["intercept_prior_mean"],
                intercept_prior_var,
            )

            lik = compute_likelihood_term(
                X,
                y,
                theta["Z"],
                theta["mu"],
                Ls,
                kern_funs,
                theta["w_means"],
                w_vars,
                theta["intercept_means"],
                intercept_vars,
            )

            objective = -(lik - kl)

            objective = objective - (
                tf.reduce_sum(lscale_prior.log_prob(lscales))
                + bias_var_prior.log_prob(intercept_prior_var)
                + tf.reduce_sum(w_var_prior.log_prob(w_prior_var))
                + bias_m_prior.log_prob(theta["intercept_prior_mean"])
                + tf.reduce_sum(w_m_prior.log_prob(theta["w_prior_mean"]))
            )

            grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64), grad.numpy().astype(np.float64))
Example 11
def fit(
    X: np.ndarray,
    y: np.ndarray,
    n_inducing: int = 100,
    n_latent: int = 10,
    kernel: str = "matern_3/2",
    # Gamma priors (note tfp uses the concentration/rate parameterisation):
    kernel_lengthscale_prior: Tuple[float, float] = (3, 1 / 3),
    bias_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    w_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    # Normal priors
    w_mean_prior: Tuple[float, float] = (0, 1),
    bias_mean_prior: Tuple[float, float] = (0, 1),
    random_seed: int = 2,
    test_run: bool = False,
    total_kernel_variance=6.0,
    verbose=False,
) -> MOGPResult:

    np.random.seed(random_seed)

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]
    n_out = y.shape[1]

    # Set initial values
    start_lengthscales = np.random.uniform(2.0, 4.0, size=(n_latent, n_cov)).astype(
        np.float32
    )

    Z = find_starting_z(X, n_inducing)
    Z = np.tile(Z, (n_latent, 1, 1))
    Z = Z.astype(np.float32)

    start_kernel_funs = get_kernel_funs(
        kernel_fun,
        tf.constant(start_lengthscales),
        total_variance=tf.constant(total_kernel_variance),
    )

    init_Ls = np.stack(
        [
            get_initial_values_from_kernel(tf.constant(cur_z), cur_kernel_fun)
            for cur_z, cur_kernel_fun in zip(Z, start_kernel_funs)
        ]
    )

    init_ms = np.zeros((n_latent, n_inducing))
    w_prior_var_init = np.ones((n_latent, 1)) * 1.0
    w_prior_mean_init = np.zeros((n_latent, 1))

    start_intercept_means = np.zeros(n_out)
    start_intercept_var = np.ones(n_out)
    intercept_prior_var_init = np.array(0.4)

    init_theta = {
        "L_elts": init_Ls,
        "mu": init_ms,
        "w_prior_var": w_prior_var_init,
        "w_prior_mean": w_prior_mean_init,
        "intercept_means": start_intercept_means,
        "intercept_vars": start_intercept_var,
        "intercept_prior_var": intercept_prior_var_init,
        "intercept_prior_mean": np.array(0.0),
        "w_means": np.random.randn(n_latent, n_out) * 0.01,
        "w_vars": np.ones((n_latent, n_out)),
        "lscales": np.sqrt(start_lengthscales),
        "Z": Z,
    }

    # Make same type
    init_theta = {x: tf.constant(y.astype(np.float32)) for x, y in init_theta.items()}

    flat_theta, summary = flatten_and_summarise_tf(**init_theta)

    X = tf.constant(X.astype(np.float32))
    y = tf.constant(y.astype(np.float32))

    lscale_prior = tfp.distributions.Gamma(*kernel_lengthscale_prior)
    bias_var_prior = tfp.distributions.Gamma(*bias_variance_prior)
    w_var_prior = tfp.distributions.Gamma(*w_variance_prior)

    w_m_prior = tfp.distributions.Normal(*w_mean_prior)
    bias_m_prior = tfp.distributions.Normal(*bias_mean_prior)

    # TODO: Think about priors for W?

    def to_minimize_with_grad(x):

        with tf.GradientTape() as tape:

            x_tf = tf.constant(x)
            x_tf = tf.cast(x_tf, tf.float32)

            tape.watch(x_tf)

            theta = reconstruct_tf(x_tf, summary)

            # Square the important parameters
            (lscales, w_prior_var, intercept_vars, intercept_prior_var, w_vars) = (
                theta["lscales"] ** 2,
                theta["w_prior_var"] ** 2,
                theta["intercept_vars"] ** 2,
                theta["intercept_prior_var"] ** 2,
                theta["w_vars"] ** 2,
            )

            if verbose:
                print(lscales)
                print(intercept_prior_var)
                print(w_prior_var)
                print(theta["w_prior_mean"])
                print(theta["intercept_prior_mean"])

            Ls = create_ls(theta["L_elts"], n_inducing, n_latent)

            kern_funs = get_kernel_funs(
                kernel_fun,
                lscales,
                total_variance=tf.constant(total_kernel_variance, dtype=tf.float32),
            )

            kl = compute_kl_term(
                theta["mu"],
                Ls,
                kern_funs,
                theta["Z"],
                theta["w_means"],
                w_vars,
                theta["w_prior_mean"],
                w_prior_var,
                theta["intercept_means"],
                intercept_vars,
                theta["intercept_prior_mean"],
                intercept_prior_var,
            )

            lik = compute_likelihood_term(
                X,
                y,
                theta["Z"],
                theta["mu"],
                Ls,
                kern_funs,
                theta["w_means"],
                w_vars,
                theta["intercept_means"],
                intercept_vars,
            )

            objective = -(lik - kl)

            objective = objective - (
                tf.reduce_sum(lscale_prior.log_prob(lscales))
                + bias_var_prior.log_prob(intercept_prior_var)
                + tf.reduce_sum(w_var_prior.log_prob(w_prior_var))
                + bias_m_prior.log_prob(theta["intercept_prior_mean"])
                + tf.reduce_sum(w_m_prior.log_prob(theta["w_prior_mean"]))
            )

            grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64), grad.numpy().astype(np.float64))

    if test_run:
        additional_args = {"tol": 1}
    else:
        additional_args = {}

    result = minimize(
        to_minimize_with_grad,
        flat_theta,
        jac=True,
        method="L-BFGS-B",
        **additional_args
    )

    final_theta = reconstruct_tf(result.x, summary)
    final_theta = {x: tf.cast(y, tf.float32) for x, y in final_theta.items()}

    # Build the results
    fit_result = MOGPResult(
        L_elts=final_theta["L_elts"],
        mu=final_theta["mu"],
        kernel=kernel,
        lengthscales=final_theta["lscales"] ** 2,
        intercept_means=final_theta["intercept_means"],
        intercept_vars=final_theta["intercept_vars"] ** 2,
        w_means=final_theta["w_means"],
        w_vars=final_theta["w_vars"] ** 2,
        Z=final_theta["Z"],
        w_prior_means=final_theta["w_prior_mean"],
        w_prior_vars=final_theta["w_prior_var"] ** 2,
        intercept_prior_mean=final_theta["intercept_prior_mean"],
        intercept_prior_var=final_theta["intercept_prior_var"] ** 2,
        total_kernel_variance=tf.constant(total_kernel_variance, tf.float32),
    )

    return fit_result
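The signature comment about the Gamma parameterisation is worth spelling out: tfp.distributions.Gamma takes a concentration and a rate, so the default lengthscale prior of (3, 1/3) has mean concentration / rate = 9 rather than 1. A quick standalone check:

import tensorflow_probability as tfp

lscale_prior = tfp.distributions.Gamma(concentration=3.0, rate=1.0 / 3.0)

print(lscale_prior.mean())         # concentration / rate = 9.0
print(lscale_prior.log_prob(4.0))  # log-density at a candidate (squared) lengthscale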
Example 12

def to_minimize_with_grad(x):

    x = tf.Variable(x, dtype=tf.float32)

    with tf.GradientTape() as tape:

        tape.watch(x)

        cur_objective = to_minimize(x)

        grad = tape.gradient(cur_objective, x)

    print(cur_objective)

    return (cur_objective.numpy().astype(np.float64),
            grad.numpy().astype(np.float64))


result = minimize(to_minimize_with_grad, start_theta, jac=True,
                  method='L-BFGS-B', **extra_args)

final_theta = reconstruct_tf(result.x, summary)

np.savez('final_theta_full_fix_alpha_test', Z=Z, alphas=alphas.numpy(),
         species_subset=species, scaler_mean=scaler.mean_,
         scaler_scale=scaler.scale_, n_inducing=n_inducing,
         n_latent=n_latent, n_latent_site=n_latent_site,
         **{x: y.numpy() for x, y in final_theta.items()})
Example 13
def fit(
    X: np.ndarray,
    y: np.ndarray,
    n_inducing: int = 100,
    kernel: str = "matern_3/2",
    # Gamma priors (note tfp uses the concentration/rate parameterisation):
    kernel_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    kernel_lengthscale_prior: Tuple[float, float] = (3, 1 / 3),
    bias_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    random_seed: int = 2,
    verbose: bool = False,
) -> SOGPResult:

    np.random.seed(random_seed)

    assert kernel in [
        "matern_3/2",
        "matern_1/2",
        "rbf",
    ], "Only these three kernels are currently supported!"

    # Note that input _must_ be scaled. Some way to enforce that?

    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]

    # Set initial values
    start_alpha = np.array(1.0, dtype=np.float32)
    start_lengthscales = np.random.uniform(2.0, 4.0,
                                           size=n_cov).astype(np.float32)
    start_bias_sd = np.array(1.0, dtype=np.float32)

    Z = find_starting_z(X, n_inducing).astype(np.float32)

    start_kernel_fun = get_kernel_fun(kernel_fun, start_alpha,
                                      start_lengthscales, start_bias_sd)

    init_L = get_initial_values_from_kernel(Z, start_kernel_fun)
    init_mu = np.zeros(n_inducing, dtype=np.float32)

    init_theta = {
        "L_elts": init_L,
        "mu": init_mu,
        "alpha": start_alpha,
        "lscales": np.sqrt(start_lengthscales),
        "Z": Z,
        "bias_sd": start_bias_sd,
    }

    flat_theta, summary = flatten_and_summarise_tf(**init_theta)

    X = tf.constant(X.astype(np.float32))
    y = tf.constant(y.astype(np.float32))

    lscale_prior = tfp.distributions.Gamma(*kernel_lengthscale_prior)
    kernel_var_prior = tfp.distributions.Gamma(*kernel_variance_prior)
    bias_var_prior = tfp.distributions.Gamma(*bias_variance_prior)

    def to_minimize_with_grad(x):

        with tf.GradientTape() as tape:

            x_tf = tf.constant(x)
            x_tf = tf.cast(x_tf, tf.float32)

            tape.watch(x_tf)

            theta = reconstruct_tf(x_tf, summary)

            alpha, lscales, bias_sd = (
                theta["alpha"]**2,
                theta["lscales"]**2,
                theta["bias_sd"]**2,
            )

            L_cov = lo_tri_from_elements(theta["L_elts"], n_inducing)

            kern_fun = get_kernel_fun(kernel_fun, alpha, lscales, bias_sd)

            objective = -compute_objective(X, y, theta["mu"], L_cov,
                                           theta["Z"], bernoulli_probit_lik,
                                           kern_fun)

            objective = objective - (tf.reduce_sum(
                lscale_prior.log_prob(lscales)) + kernel_var_prior.log_prob(
                    alpha**2) + bias_var_prior.log_prob(bias_sd**2))

            grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    result = minimize(to_minimize_with_grad,
                      flat_theta,
                      jac=True,
                      method="L-BFGS-B")

    final_theta = reconstruct_tf(result.x, summary)
    final_theta = {
        x: y.numpy().astype(np.float32)
        for x, y in final_theta.items()
    }

    # Build the results
    fit_result = SOGPResult(
        L_elts=final_theta["L_elts"],
        mu=final_theta["mu"],
        kernel=kernel,
        lengthscales=final_theta["lscales"]**2,
        alpha=final_theta["alpha"]**2,
        bias_sd=final_theta["bias_sd"]**2,
        Z=final_theta["Z"],
    )

    return fit_result
Example 14
def fit_minibatching(
    X: np.ndarray,
    z: np.ndarray,
    weights: np.ndarray,
    n_inducing: int,
    thinning_indices: Optional[np.ndarray] = np.array([]),
    fit_inducing_using_presences_only: bool = False,
    verbose: bool = True,
    log_theta_dir: Optional[str] = None,
    use_berman_turner: bool = False,
    batch_size: int = 1000,
    learning_rate: float = 0.01,
    n_steps: int = 1000,
    sqrt_decay_learning_rate: bool = True,
):

    global STEP
    STEP = 0

    makedirs(log_theta_dir, exist_ok=True)

    n_cov = X.shape[1]

    if fit_inducing_using_presences_only:
        X_to_cluster = X[z > 0, :]
    else:
        X_to_cluster = X

    init_Z = find_starting_z(X_to_cluster, n_inducing).astype(np.float32)

    start_theta, init_kernel_spec = initialise_theta(n_cov, thinning_indices,
                                                     init_Z)

    flat_theta, summary = flatten_and_summarise_tf(**start_theta)

    data_dict = {"X": X, "z": z, "weights": weights}

    data_dict = {x: y.astype(np.float32) for x, y in data_dict.items()}

    if sqrt_decay_learning_rate:
        # Decay with sqrt of time
        step_size_fun = lambda t: learning_rate * (1 / np.sqrt(t))  # NOQA
    else:
        # Constant learning rate
        step_size_fun = lambda t: learning_rate  # NOQA

    opt_fun = partial(
        to_optimise,
        use_berman_turner=use_berman_turner,
        summary=summary,
        init_kernel_spec=init_kernel_spec,
        log_theta_dir=log_theta_dir,
        verbose=verbose,
        likelihood_scale_factor=X.shape[0] / batch_size,
    )

    adam_state = initialise_state(flat_theta.shape[0])

    adam_fun = partial(adam_step, step_size_fun=step_size_fun)

    result, loss_log = optimise_minibatching(
        data_dict,
        opt_fun,
        adam_fun,
        flat_theta,
        batch_size,
        n_steps,
        X.shape[0],
        join(log_theta_dir, "loss.txt"),
        False,
        adam_state,
    )

    final_flat_theta = result.numpy().astype(np.float32)
    final_theta = reconstruct_tf(final_flat_theta, summary)
    _, final_spec = update_specs(final_theta, init_kernel_spec)

    return final_spec
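The two step-size schedules in fit_minibatching differ only in whether the base learning rate is damped by 1 / sqrt(t). A standalone look at the first few step sizes for the default learning_rate of 0.01:

import numpy as np

learning_rate = 0.01

sqrt_decay = lambda t: learning_rate * (1 / np.sqrt(t))
constant = lambda t: learning_rate

for t in (1, 10, 100, 1000):
    print(t, sqrt_decay(t), constant(t))
# Under the sqrt decay the step size shrinks by roughly 3x for every 10x more steps.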
Example 15
def fit(
    X: np.ndarray,
    z: np.ndarray,
    weights: np.ndarray,
    n_inducing: int,
    thinning_indices: Optional[np.ndarray] = np.array([]),
    fit_inducing_using_presences_only: bool = False,
    verbose: bool = True,
    log_theta_dir: Optional[str] = None,
    use_berman_turner: bool = True,
    test_run: bool = False,
):

    global STEP
    STEP = 0

    n_cov = X.shape[1]

    if fit_inducing_using_presences_only:
        X_to_cluster = X[z > 0, :]
    else:
        X_to_cluster = X

    init_Z = find_starting_z(X_to_cluster, n_inducing).astype(np.float32)

    start_theta, init_kernel_spec = initialise_theta(n_cov, thinning_indices,
                                                     init_Z)

    # Prepare the tensors
    X = tf.cast(tf.constant(X), tf.float32)
    z = tf.cast(tf.constant(z), tf.float32)
    weights = tf.cast(tf.constant(weights), tf.float32)

    flat_theta, summary = flatten_and_summarise_tf(**start_theta)

    opt_fun = partial(
        to_optimise,
        X=X,
        z=z,
        weights=weights,
        use_berman_turner=use_berman_turner,
        summary=summary,
        init_kernel_spec=init_kernel_spec,
        log_theta_dir=log_theta_dir,
        verbose=verbose,
        likelihood_scale_factor=1.0,
    )

    if test_run:
        result = minimize(
            opt_fun,
            flat_theta.numpy().astype(np.float64),
            method="L-BFGS-B",
            jac=True,
            tol=1,
        )
    else:
        result = minimize(opt_fun,
                          flat_theta.numpy().astype(np.float64),
                          method="L-BFGS-B",
                          jac=True)

    final_flat_theta = result.x.astype(np.float32)
    final_theta = reconstruct_tf(final_flat_theta, summary)
    _, final_spec = update_specs(final_theta, init_kernel_spec)

    return final_spec