Exemple #1
0
    def fit(self, step_size=1e-2, max_iteration=5000, check_point=None, params_init=None, call_back=None, verbose=True, optimizer='adam', mass=None, reset=True):
        """Optimize the variational objective.

        Parameters
        ----------
        step_size : float
            Step size handed to the selected optimizer.
        max_iteration : int
            Number of optimizer iterations to run.
        check_point : optional
            When given, stored on ``self.check_point``.
        params_init : array-like of length ``2 * self.D``, optional
            Starting point. When omitted, a mean part and a parametrized
            variance part (``self.D`` entries each) are drawn from
            ``self.random.normal(0, 0.1)`` and concatenated. The caller's
            array is never modified.
        call_back : callable, optional
            Per-iteration callback passed to the ``'adam'`` and ``'sgd'``
            optimizers; defaults to ``self.call_back``. The ``'debug'``
            optimizer always reports through ``self.debug_call_back``.
        verbose : bool
            Stored on ``self.verbose``.
        optimizer : {'adam', 'sgd', 'debug'}
            ``'debug'`` is a plain hand-rolled gradient-descent loop.
        mass : float, optional
            Momentum passed to ``sgd``; defaults to ``1e-16``.
        reset : bool
            When true, ``self.ELBO`` and ``self.variational_params`` are
            replaced by one-row placeholder arrays before optimizing.

        Raises
        ------
        ValueError
            If ``optimizer`` is not one of the supported names.
        """
        # Reject unknown optimizers before any state is touched; previously
        # they were a silent no-op that still reset and trimmed the history.
        if optimizer not in ('adam', 'sgd', 'debug'):
            raise ValueError(
                "optimizer must be 'adam', 'sgd' or 'debug', got {!r}".format(optimizer))

        if check_point is not None:
            self.check_point = check_point

        if params_init is None:
            mean_init = self.random.normal(0, 0.1, size=self.D)
            parametrized_var_init = self.random.normal(0, 0.1, size=self.D)
            params_init = np.concatenate([mean_init, parametrized_var_init])

        assert len(params_init) == 2 * self.D

        self.verbose = verbose

        if call_back is None:
            call_back = self.call_back

        if reset:
            # One-row placeholders; the first row is dropped again below.
            self.ELBO = np.empty((1, 1))
            self.variational_params = np.empty((1, 2 * self.D))

        if optimizer == 'adam':
            adam(self.gradient, params_init, step_size=step_size, num_iters=max_iteration, callback=call_back)
        elif optimizer == 'sgd':
            if mass is None:
                mass = 1e-16
            sgd(self.gradient, params_init, step_size=step_size, num_iters=max_iteration, callback=call_back, mass=mass)
        else:  # 'debug'
            # Work on a private float copy so the in-place update below cannot
            # clobber the caller's array (and also works for int/list input).
            params = np.array(params_init, dtype=float)
            for i in range(max_iteration):
                params -= step_size * self.gradient(params, i)
                self.debug_call_back(params, i)

        # Drop the leading row of both histories.
        # NOTE(review): with reset=False this also drops the first row of the
        # existing history -- confirm that is intended.
        self.variational_params = self.variational_params[1:]
        self.ELBO = self.ELBO[1:]
Exemple #2
0
def learn_maxpl(imgs, num_iters=300, step_size=0.001):
    """Learn Hopfield weights and bias by maximizing the pseudo log-likelihood.

    The raw weight matrix is symmetrized and its diagonal is zeroed before
    use, so the returned ``weights`` are symmetric with no self-connections.

    :param imgs: Sequence of equally shaped image arrays. Each image is
        flattened to drive the unit activations; for the likelihood targets,
        entries below zero are treated as 0. ``imgs`` is not modified.
    :param num_iters: Number of SGD iterations (default 300).
    :param step_size: SGD step size (default 0.001).
    :return: ``(weights, bias)`` after optimization.

    Side effects: prints the negative pseudo log-likelihood every 100
    iterations and saves the learned weights to ``weights_mpl.jpg``.
    """
    img_size = np.prod(imgs[0].shape)

    fake_weights = np.random.normal(0, 0.1, (img_size, img_size))
    bias = np.random.normal(0, 0.1, (img_size))
    diag_mask = np.ones((img_size, img_size)) - np.identity(img_size)

    # Flatten the images and build the likelihood targets once, outside the
    # objective. The previous code clamped negatives in place on a reshaped
    # view, which rewrote the caller's images and changed the inputs seen by
    # every evaluation after the first.
    patterns = [np.reshape(img, -1) for img in imgs]
    targets = [np.where(pattern < 0, 0, pattern) for pattern in patterns]
    eps = 1e-10  # keeps log() away from zero

    def symmetrize(raw_weights):
        """Return the symmetric, zero-diagonal version of ``raw_weights``."""
        return np.multiply((raw_weights + raw_weights.T) / 2, diag_mask)

    def objective(params, step):
        raw_weights, unit_bias = params
        weights = symmetrize(raw_weights)
        pll = 0
        for pattern, target in zip(patterns, targets):
            output = sigmoid(np.matmul(weights, pattern) + unit_bias)
            pll += np.sum(np.multiply(target, np.log(output + eps))
                          + np.multiply(1 - target, np.log(1 - output + eps)))
        if step % 100 == 0:
            print(-pll)
        return -pll

    g = grad(objective)

    fake_weights, bias = sgd(g, (fake_weights, bias), num_iters=num_iters, step_size=step_size)
    weights = symmetrize(fake_weights)

    plt.imsave('weights_mpl.jpg', weights)
    return weights, bias
Exemple #3
0
 def optnat(n, init, cb):
     """Run ``sgd`` on ``natural_gradient`` for ``n`` iterations from ``init``.

     ``cb`` is forwarded as the optimizer callback; ``step_size`` is read from
     the enclosing scope and the momentum is fixed at 0.001.
     """
     sgd_options = dict(step_size=step_size,
                        num_iters=n,
                        callback=cb,
                        mass=.001)
     return sgd(natural_gradient, init, **sgd_options)
def scale_temperature(est, xval, yval):
    """Fit a scalar temperature on held-out data with SGD.

    The estimator's ``predict_proba`` output on ``xval`` is divided by the
    temperature, passed through ``softmax`` and scored with ``nll_loss``
    against ``yval``. The temperature starts at 1.0 and is optimized for 201
    iterations with step size 0.05; progress is printed every 100 iterations.

    NOTE(review): temperature scaling is usually applied to logits, whereas
    this divides ``predict_proba`` output -- confirm that is intended.

    :param est: Fitted estimator exposing ``predict_proba``.
    :param xval: Validation inputs passed to ``est.predict_proba``.
    :param yval: Validation targets passed to ``nll_loss``.
    :return: The temperature returned by ``sgd``.
    """
    probs = est.predict_proba(xval)

    def temp_loss(t, step):
        # Clamp so the division below never sees a zero or negative temperature.
        t = np.maximum(t, 1e-8)
        return nll_loss(softmax(probs / t), yval)

    print_freq = 100

    def print_perf(temp, step, gradient):
        if step % print_freq == 0:
            print(str(step) + ': ' + str(temp) + ' ' + str(temp_loss(temp, step)))

    grad_temp = grad(temp_loss)
    max_iter = 201
    step_size = .05
    return sgd(grad_temp, 1.0, step_size=step_size, num_iters=max_iter, callback=print_perf)
Exemple #5
0
        init_gresnet(P, [(40, np.tanh), (40, np.tanh), (2*N, gaussian_info)])
    decode,   loglike_params = \
        init_gresnet(N, [(40, np.tanh), (40, np.tanh), (2*P, gaussian_mean)])
    loglike = make_loglike(decode)

    # initialize gmm parameters
    pgm_params = init_pgm_param(K, N, alpha=1., niw_conc=1., random_scale=3.)
    params = pgm_params, loglike_params, recogn_params

    # set up encoder/decoder and plotting
    encode_mean, decode_mean = make_encoder_decoder(recognize, decode)
    plot = make_plotter_2d(recognize,
                           decode,
                           data,
                           num_clusters,
                           params,
                           plot_every=100)

    # instantiate svae gradient function
    gradfun = make_gradfun(run_inference, recognize, loglike, pgm_prior_params,
                           data)

    # optimize
    params = sgd(gradfun(batch_size=50,
                         num_samples=1,
                         natgrad_scale=1e4,
                         callback=plot),
                 params,
                 num_iters=1000,
                 step_size=1e-3)
Exemple #6
0
    sigma2_a = np.sum(np.linalg.solve(sigmainv, x.T).T * x, axis=1)

    kappa = np.sqrt(1 + sigma2_a * np.pi * .125)
    return sigmoid(mu_a / kappa)


# Toy data set: five samples with three features each.
x = np.array([[0.52, 1.12, 0.77], [0.88, -1.08, 0.15], [0.52, 0.06, -1.30],
              [0.74, -2.49, 1.39], [0.52, 1.12, 0.77]])

y = np.array([True, True, False, True, False])

# Prepend a column of ones so the first weight acts as the intercept.
x = np.hstack([np.ones((len(x), 1)), x])
# The optimizer calls the objective as f(w, iteration); the index is unused.
training_loss = lambda w, i: nll_loss(w, x, y, alpha=0.1)
g = grad(training_loss)
# `np.float` was a deprecated alias of the builtin and was removed in
# NumPy 1.24; the builtin `float` yields the same float64 dtype.
w = np.array([1, 1, 1, 1], dtype=float)
print("Initial loss:", training_loss(w, 0))

# Optimize with sgd's default hyperparameters.
w = sgd(g, w)
print("Trained loss:", training_loss(w, 0))

pred = predict(w, x) > 0.5

print(y.astype(int))
print('ml', predict(w, x))

sigmainv = compute_precision(x, y, w, alpha=0.1)

print('var', predict_var(w, sigmainv, x))
print('mc', predict_mc(w, np.linalg.inv(sigmainv), x))
        init_log_std = -5 * np.ones(D)
        init_var_params = np.concatenate([init_mean, init_log_std])
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [.1, .25, .5]
    for step_size in step_sizes:
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size,
                                                    num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size,
                                         num_iters=n, callback=cb, mass=.001)
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12,8))
    colors = ['b', 'k', 'g']
    for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
        plt.plot(np.arange(len(stand_lls)), stand_lls,
                 '--', label="standard (adam, step-size = %2.2f)"%ss, alpha=.5, c=col)
        plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
                 label="natural (sgd, step-size = %2.2f)"%ss, c=col)

    llrange = natural_lls.max() - natural_lls.min()
    plt.ylim((natural_lls.max() - llrange*.1, natural_lls.max() + 10))
    plt.xlabel("optimization iteration")
Exemple #8
0
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [.1, .25, .5]
    for step_size in step_sizes:
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(
            gradient, init, step_size=step_size, num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(natural_gradient,
                                         init,
                                         step_size=step_size,
                                         num_iters=n,
                                         callback=cb,
                                         mass=.001)
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12, 8))
    colors = ['b', 'k', 'g']
    for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
        plt.plot(np.arange(len(stand_lls)),
                 stand_lls,
                 '--',
                 label="standard (adam, step-size = %2.2f)" % ss,
                 alpha=.5,
                 c=col)
            u_hat = (m(np.dot(w, u)) - np.dot(w, u)) * (w /
                                                        np.linalg.norm(w)) + u
            z_prev = z_prev + np.outer(h(np.matmul(z_prev, w) + b), u_hat)
        z_K = z_prev
        plt.figure(figsize=(5, 4))
        plt.hist(z_K, 100, density=True)
        plt.show()


# Start every flow parameter (W, U and b for each of the K layers) at 1 and
# pack them into a single flat vector for the optimizer.
init_W = np.ones((K, dim_z))
init_U = np.ones((K, dim_z))
init_b = np.ones(K)
init_params = np.concatenate(
    [init_W.ravel(), init_U.ravel(), init_b.ravel()])

# Positional arguments after the start point: callback, iteration count,
# step size.
variational_params = sgd(gradient, init_params, callback, num_iter, 5e-4)

# Push fresh standard-normal samples through the K learned layers.
W, U, B = unpack_params(variational_params)
z0 = np.random.randn(num_samples, dim_z)
z_prev = z0
for k in range(K):
    w, u, b = W[k], U[k], B[k]
    u_hat = (m(np.dot(w, u)) - np.dot(w, u)) * (w / np.linalg.norm(w)) + u
    # Rebind rather than update in place so z0 keeps the original samples.
    z_prev = z_prev + np.outer(h(np.matmul(z_prev, w) + b), u_hat)
z_K = z_prev

# Plot the recorded objective values.
plt.figure(figsize=(10, 8))
plt.plot(objectives)
plt.show()

# fig,ax=plt.subplots(1,1,figsize = (10,8))
Exemple #10
0
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified


# Activation applied after each of the two hidden layers.
nonlinearity = np.tanh


def forward(params, inputs):
    """Evaluate the network: two activated hidden layers and a linear output.

    ``params`` maps ``'W1'``..``'W3'`` and ``'b1'``..``'b3'`` to the weights
    and biases of the corresponding layer.
    """
    hidden = inputs
    for layer in ('1', '2'):
        pre_activation = np.dot(hidden, params['W' + layer]) + params['b' + layer]
        hidden = nonlinearity(pre_activation)
    # The output layer is linear (no activation).
    return np.dot(hidden, params['W3']) + params['b3']


def loss(params, i=0):
    """Half mean-squared error of the network on the global ``inputs``/``t``.

    ``i`` is accepted so the function matches the optimizer's
    ``f(params, iteration)`` calling convention; it is not used.
    """
    predictions = forward(params, inputs)
    n_samples = inputs.shape[0]
    # Flatten the column-shaped output so it lines up with t.
    residuals = predictions.reshape(predictions.shape[0]) - t
    return (1.0 / (2 * n_samples)) * np.sum(residuals**2)


# Report the loss of the initial parameters.
print(loss(params))
# Minimize `loss` with sgd, starting from `params`.
optimized_params = sgd(grad(loss), params, step_size=0.01, num_iters=5000)
print(optimized_params)
print(loss(optimized_params))
final_y = forward(optimized_params, inputs)
# Targets as red dots, network output as a blue line.
plt.plot(x, t, 'r.')
plt.plot(x, final_y, 'b-')
plt.show()
    # Define training objective
    def objective(params, iter):
        """Negative log-posterior on the minibatch selected for ``iter``."""
        batch = batch_indices(iter)
        batch_data, batch_labels = train_data[batch], train_labels[batch]
        return -log_posterior(params, batch_data, batch_labels, L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        "     Epoch     |     Loss        |   Train accuracy  |       Test accuracy  "
    )

    def print_perf(params, iter, gradient):
        """Optimizer callback: print one table row every ``num_batches`` iterations.

        The row holds the epoch number, the minibatch objective, and the
        accuracy on the full training and test sets.
        """
        if iter % num_batches != 0:
            return
        train_acc = accuracy(params, train_data, train_labels)
        test_acc = accuracy(params, test_data, test_labels)
        epoch = iter // num_batches
        batch_loss = objective(params, iter)
        print("{:15}|{:20}|{:20}|{:20}".format(epoch, batch_loss, train_acc,
                                               test_acc))

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    optimized_params = sgd(objective_grad,
                           init_params,
                           step_size=step_size,
                           num_iters=num_epochs * num_batches,
                           callback=print_perf)
Exemple #12
0
def experiment(train_data,
               valid_data,
               test_data,
               init_scale,
               batch_size,
               num_iters_hypernet,
               step_size_hypernet,
               num_iters_hyper,
               step_size_hyper,
               num_iters,
               graph_mod,
               global_seed=0):
    """Run the second experiment, which consists of fitting a hypernetwork, which outputs neural network parameters.
    These neural network parameters try to fit the training data with some additional loss for the hyperparameters.
    We try to optimize the hyperparameters given the learned neural network response through the hypernetwork.
    We observe how the hypernetwork performs on the training and testing, by graphing it against the true loss.
    The true loss is found by training a neural network to convergence at a discrete number of points.

    The optimizer callbacks record their statistics in module-level globals
    (log_likelihoods, train_performance, valid_loss, test_loss,
    grad_norms_hyper, grad_norms_hypernet, train_hypers,
    global_opt_iteration, global_hyperopt_iteration); these must exist before
    this function is called. The global hyper_cur is (re)initialized here.

    :param train_data: The training data which is a tuple of (train_input, train_target).
    :param valid_data: The validation data which is a tuple of (valid_input, valid_target).
    :param test_data: The testing data which is a tuple of (test_input, test_target).
    :param init_scale: The scale (positive float) for the hypernetwork initialization.
    :param batch_size: The number of hyperparameters to sample for each iteration.
    :param num_iters_hypernet: The number of iterations (integer) to run the hypernetwork optimizer for.
    :param step_size_hypernet: The step size (positive float) for the hypernetwork optimizer.
    :param num_iters_hyper: The number of iterations (integer) to run the hyperparameter optimizer for.
    :param step_size_hyper: The step size (positive float) for the hyperparameter optimizer.
    :param num_iters: The number of iterations (integer) to run the optimization for.
    :param graph_mod: How many iterations (integer) to wait between each graph of the loss.
    :param global_seed: The seed (integer) to use when choosing a constant seed.
    :return: None.
    """
    assert init_scale > 0
    assert step_size_hypernet > 0 and step_size_hyper > 0
    assert num_iters > 0 and num_iters_hypernet > 0 and num_iters_hyper > 0
    global hyper_cur
    hyper_cur = -3.5  # Initialize the hyperparameter (float).

    # Define information about hyper loss and how hyper parameters are sampled.
    hyper_sample_var = 0  # 10e-4  # The variance to use when sampling hyperparameters from a Gaussian distribution.

    def sample_hypers(hyper, rs):
        """Sample a hyperparameter.

        :param hyper: The current hyperparameter ([float]).
        :param rs: A numpy randomstate.
        :return: A sampled hyperparameter (float).
        """
        return np.array([rs.randn() * hyper_sample_var + hyper]).reshape(1, -1)

    def hyper_loss(weights, hyper):
        """Find the loss for neural network that is dependent on the hyperparameter.

        :param weights: The weights ([[float]]) of the neural network.
        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :return: The loss (float) of network dependent on the hyperparameter.
        """
        return -log_gaussian(weights, np.exp(hyper))

    example_hyper = sample_hypers(
        hyper_cur, npr.RandomState(global_seed))  # Test the sample function.
    assert example_hyper is not None

    train_inputs, train_targets = train_data
    valid_inputs, valid_targets = valid_data
    test_inputs, test_targets = test_data
    batch_ind, feature_ind = 0, 1
    elementary_input_size = np.shape(train_inputs)[feature_ind]
    elementary_output_size = np.shape(train_targets)[feature_ind]
    elementary_layer_sizes = [elementary_input_size, elementary_output_size]
    # The dimensionality of the hyperparameter space (integer).
    num_hypers = example_hyper.shape[feature_ind]

    # Define neural network and function to turn a vector into its weight structure.
    example_elementary_params = init_random_params(
        init_scale, elementary_layer_sizes, npr.RandomState(global_seed))
    flat_elementary_params, unflatten_vector_to_network_weights = flatten(
        example_elementary_params)
    assert hyper_loss(example_elementary_params, example_hyper) is not None
    num_elementary_params = len(flat_elementary_params)

    # Define a hypernetwork parametrized by some hyperparameters.
    # Note that there are no hidden units.
    hypernet_layer_sizes = [num_hypers, num_elementary_params]

    objective_functions = get_loss_functions(
        unflatten_vector_to_network_weights, sample_hypers, hyper_loss,
        batch_size, train_inputs, train_targets, test_inputs, test_targets,
        valid_inputs, valid_targets, global_seed)
    (hypernet, train_objective, valid_objective,
     test_objective) = objective_functions[:4]
    (hyper_train_objective, hyper_valid_objective,
     hyper_test_objective) = objective_functions[4:-1]
    hyper_train_stochastic_objective = objective_functions[-1]

    # Next, train a neural network from scratch with different hyperparameter values.
    real_step_size = 0.0001  # The step size to use to find the real loss (float).
    real_num_iters = 1000  # The number of iterations to use to find the real loss (integer).
    range_min = -2.0  # The min log variance for the hyper parameter of the variance of weight distribution to graph.
    range_max = 4.0  # The max log variance for the hyper parameter of the variance of weight distribution to graph.
    num_visual_points = 10  # The number of points to test the real loss of - expensive (integer).
    real_hyper_range = np.linspace(range_min + 1.0, range_max - 1.0,
                                   num_visual_points)
    real_train_loss = np.zeros(real_hyper_range.shape)
    real_train_performance = np.zeros(real_hyper_range.shape)
    real_valid_loss = np.zeros(real_hyper_range.shape)
    real_test_loss = np.zeros(real_hyper_range.shape)
    min_real_valid_loss, min_real_hyper = 10e32, 10e32
    for i, hypers in enumerate(real_hyper_range):
        print("Optimizing network parameters: ", i)
        init_params = init_random_params(init_scale, elementary_layer_sizes,
                                         npr.RandomState(global_seed))

        def cur_obj(w, seed):
            """The current objective function of the neural network.

            :param w: The weights ([float]) of the neural network.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The current objective value (float).
            """
            return train_objective(w, hypers, seed)

        # This adam returns (params, m, v, iteration); only params is needed.
        optimized_params, _, _, _ = adam(grad(cur_obj),
                                         init_params,
                                         step_size=real_step_size,
                                         num_iters=real_num_iters)
        real_train_loss[i] = train_objective(optimized_params, hypers,
                                             global_seed)
        real_train_performance[i] = real_train_loss[i] - hyper_loss(
            optimized_params, hypers)
        real_valid_loss[i] = valid_objective(optimized_params, hypers,
                                             global_seed)
        if real_valid_loss[i] < min_real_valid_loss:
            min_real_valid_loss = real_valid_loss[i]
            # Keep the arg-min in step with the min (it was never updated).
            min_real_hyper = hypers
            print("Best hyperparameter found = ", hypers)
        real_test_loss[i] = test_objective(optimized_params, hypers,
                                           global_seed)

    fig, axs = create_figure_and_axs()

    # Set up the arrays to store information for plotting.
    num_hyper_test_points = 200  # Test a large number of hyperparameters with the learned function - cheap (integer)!
    # Hyperparameters to test.
    learned_hyper_range = np.linspace(range_min, range_max,
                                      num_hyper_test_points)
    # Hypernetwork training loss per hyperparameter.
    hyper_train_loss = np.zeros(learned_hyper_range.shape)
    # Hypernetwork training performance per hyperparameter.
    # Note that performance is loss - regularization loss.
    hyper_train_performance = np.zeros(learned_hyper_range.shape)
    hyper_valid_loss = np.zeros(learned_hyper_range.shape)
    hyper_test_loss = np.zeros(learned_hyper_range.shape)

    def callback(hyper_weights, opt_iteration, g):
        """Do whatever work is desired on each optimization iteration.
        Draws graphs, prints information, and stores information.

        :param hyper_weights: The weights ([[float]]) of the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global log_likelihoods, valid_loss, test_loss, grad_norms_hyper, grad_norms_hypernet, global_opt_iteration
        global hyper_cur
        log_likelihood = hyper_train_objective(hyper_weights, hyper_cur)
        log_likelihoods[
            global_opt_iteration] = log_likelihood  # Store the training loss.
        weights_cur = hypernet(hyper_weights, hyper_cur)
        train_performance[global_opt_iteration] = log_likelihood - hyper_loss(
            weights_cur, hyper_cur)
        valid_loss[global_opt_iteration] = hyper_valid_objective(
            hyper_weights, hyper_cur)
        test_loss[global_opt_iteration] = hyper_test_objective(
            hyper_weights, hyper_cur)
        grad_norm = np.sum([
            np.sum(
                [np.sum(np.abs(weight_or_bias)) for weight_or_bias in layer])
            for layer in g
        ])
        grad_norms_hypernet[global_opt_iteration] = grad_norm
        # Carry the last hyperparameter gradient norm forward.
        grad_norms_hyper[global_opt_iteration] = grad_norms_hyper[
            global_opt_iteration - 1]
        global_opt_iteration += 1
        print("Iteration {} Loss {} Grad L1 Norm {}".format(
            opt_iteration, log_likelihood, grad_norm))

        if global_opt_iteration % graph_mod == 0:  # Only graph on every iteration that is a multiple of graph_mod.
            for ax in axs:  # Clear all of the axes.
                ax.cla()
            # Raw strings: '\l', '\m' and '\h' are not valid escape sequences.
            axs[0].set_xlabel(r'Hyperparameter $\lambda$')
            axs[0].set_ylabel(r'Loss $\mathcal{L}$')

            for cur, hyper in enumerate(learned_hyper_range):
                hyper_train_loss[cur] = hyper_train_objective(
                    hyper_weights, hyper)
                weights = hypernet(hyper_weights, hyper)
                hyper_train_performance[
                    cur] = hyper_train_loss[cur] - hyper_loss(weights, hyper)
                hyper_valid_loss[cur] = hyper_valid_objective(
                    hyper_weights, hyper)
                hyper_test_loss[cur] = hyper_test_objective(
                    hyper_weights, hyper)

            axs[0].plot(real_hyper_range,
                        real_train_loss,
                        'bx',
                        ms=28,
                        label='Train loss of optimized weights')
            axs[0].plot(learned_hyper_range,
                        hyper_train_loss,
                        'b-',
                        label='Train loss of hypernetwork weights')
            axs[0].set_ylim([-1.5, 3.8])

            axs[0].plot(real_hyper_range,
                        real_valid_loss,
                        'rx',
                        ms=28,
                        label='Valid. loss of optimized weights')
            axs[0].plot(learned_hyper_range,
                        hyper_valid_loss,
                        'r-',
                        label='Valid. loss of hypernetwork weights')
            min_hyper_found = 1.836  # Known minimum from doing a search with 1000 points over this range.
            axs[0].axvline(x=min_hyper_found,
                           c='k',
                           linestyle='dashed',
                           label=r'Optimal hyperparameter $\lambda$')

            pdf_range = np.linspace(hyper_cur - 0.5, hyper_cur + 0.5, 100)
            axs[0].plot(pdf_range,
                        norm.pdf(pdf_range, loc=hyper_cur, scale=0.06) / 4.0 +
                        axs[0].get_ylim()[0],
                        c='g',
                        label=r'$p (\lambda | \hat{\lambda})$')

            for ax in axs:  # Create a legend for all the axes.
                ax.legend(loc='upper center',
                          bbox_to_anchor=(0.5, 1.45),
                          borderaxespad=0.0,
                          fancybox=True,
                          framealpha=0.0,
                          fontsize=28)
            setup_ax_and_save(axs, fig, 'hypernets_local_small')

    def callback_outer(hyper, opt_iteration, g):
        """Do whatever work is desired on each outer optimization iteration.
        Stores information.

        :param hyper: The hyperparameter (float) input to the hypernetwork.
        :param opt_iteration: The current iteration of optimization.
        :param g: The gradient ([[float]]) of the optimizer.
        :return: None.
        """
        global grad_norms_hyper, train_hypers, global_hyperopt_iteration
        # global_opt_iteration has already been advanced by the inner
        # callback, so the most recent slot is at index - 1.
        last_slot = global_opt_iteration - 1
        grad_norms_hyper[last_slot] = np.abs(g)
        train_hypers[global_hyperopt_iteration] = hyper
        global_hyperopt_iteration += 1
        # Report the entry just written (previously the next, unwritten slot
        # was printed).
        print("Outer Iteration {} Hyper {} Grad L1 Norm {}".format(
            global_hyperopt_iteration, hyper, grad_norms_hyper[last_slot]))

    init_hypernet_params = init_random_params(init_scale, hypernet_layer_sizes,
                                              npr.RandomState(global_seed))
    m_hyper = None  # A record of the current m for re-starting the Adam optimizer.
    v_hyper = None  # A record of the current v for re-starting the Adam optimizer
    cur_iter_hyper = None  # A record of the current iteration for re-starting the Adam optimizer.
    for _ in range(num_iters):

        def hyper_train_stochastic_objective_current(hyper_weights, seed):
            """The objective for the hypernetwork, with a fixed hyperparameter.

            :param hyper_weights: The weights ([[float]]) of the hypernetwork.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The hypernetwork's loss (float).
            """
            return hyper_train_stochastic_objective(hyper_cur, hyper_weights,
                                                    seed)

        # Inner step: fit the hypernetwork around the current hyperparameter.
        init_hypernet_params = sgd(
            grad(hyper_train_stochastic_objective_current),
            init_hypernet_params,
            step_size=step_size_hypernet,
            num_iters=num_iters_hypernet,
            callback=callback,
            mass=0)

        def valid_objective_current(hyper, seed):
            """The objective for the hyperparameter, with a fixed hypernetwork.

            :param hyper: The hyperparameter (float) input to the hypernetwork.
            :param seed: The seed (integer) for sampling a hyperparameter.
            :return: The validation loss (float).
            """
            return valid_objective(hypernet(init_hypernet_params, hyper),
                                   hyper, seed)

        # Outer step: move the hyperparameter, resuming Adam from its
        # previous moment estimates and iteration count.
        hyper_cur, m_hyper, v_hyper, cur_iter_hyper = adam(
            grad(valid_objective_current),
            hyper_cur,
            step_size=step_size_hyper,
            num_iters=num_iters_hyper,
            callback=callback_outer,
            m=m_hyper,
            v=v_hyper,
            offset=cur_iter_hyper)
        print("The current hyperparameter is:", hyper_cur)