Exemple #1
0
    def fit(self,
            target_log_q,
            n_iter=1000,
            n_samples=1,
            n_samples_per_report=10,
            report_interval=10,
            annealed=True,
            step_size=0.01,
            l2_penalty=1.0):
        """Optimize the parameters of ``self.model`` with Adam.

        The objective is the variational free energy between ``target_log_q``
        and the distribution of ``y = model.transform(x)``, ``x ~ N(0, 1)``,
        plus an L2 penalty on the parameters.

        Args:
            target_log_q: target log-density, forwarded to
                ``self.reparameterization_gradient`` and to the progress
                logger.
            n_iter: number of Adam iterations (also the length of the
                ``beta`` schedule).
            n_samples: sample count forwarded to
                ``self.reparameterization_gradient``.
            n_samples_per_report: forwarded to ``self.progress_logger_factory``.
            report_interval: forwarded to ``self.progress_logger_factory``.
            annealed: if true, ``beta`` ramps linearly from 0.01 to 1.0 over
                the run; otherwise it is 1.0 throughout.
            step_size: Adam step size.
            l2_penalty: weight of the squared-magnitude penalty on the
                parameters.

        The optimized parameters are stored in ``self.params``; nothing is
        returned.
        """
        # One beta value per iteration, looked up by iteration index below.
        beta = np.linspace(0.01, 1.0, n_iter) if annealed else np.ones(n_iter)

        self.optimization_history = []
        report_progress = self.progress_logger_factory(
            target_log_q, n_samples_per_report, report_interval)

        def l2_term(params):
            """Weighted sum of squared parameter magnitudes."""
            return l2_penalty * np.sum(np.abs(params)**2)

        def penalized_gradient(params, i):
            """Clipped reparameterization gradient plus the L2-penalty gradient."""
            # The second argument of clip_gradients is passed through as 1;
            # presumably the clipping threshold - confirm against its definition.
            clipped = clip_gradients(
                self.reparameterization_gradient(params, target_log_q,
                                                 n_samples, beta[i]), 1)
            return clipped + grad(l2_term)(params)

        self.params = adam(grad=penalized_gradient,
                           init_params=self.model.params,
                           step_size=step_size,
                           callback=report_progress,
                           num_iters=n_iter)
    def train(self,
              X_train,
              F_train,
              y_train,
              batch_size=32,
              num_iters=1000,
              lr=1e-3,
              param_scale=0.01,
              log_every=100,
              init_weights=None):
        """Optimise the model weights with Adam on mini-batched gradients.

        Args:
            X_train, F_train, y_train: training arrays, forwarded unchanged to
                ``build_batched_grad_fences``, ``self.objective`` and
                ``self.average_path_length`` (F_train presumably holds the
                sequence "fence" offsets - confirm against the caller).
            batch_size: mini-batch size handed to ``build_batched_grad_fences``.
            num_iters: number of Adam iterations.
            lr: Adam step size.
            param_scale: scale passed to ``self.init_weights`` when
                ``init_weights`` is not supplied.
            log_every: progress is printed on iterations divisible by this.
            init_weights: optional starting weights.

        Returns:
            The optimised weights, also stored on ``self.weights``.  The
            weights passed to the callback at every iteration are stored
            row-by-row in ``self.saved_weights``.
        """
        grad_fun = build_batched_grad_fences(grad(self.objective), batch_size,
                                             X_train, F_train, y_train)
        print('Batched gradient fences building completed')
        if init_weights is None:
            init_weights = self.init_weights(param_scale)
        saved_weights = np.zeros((num_iters, self.num_weights))

        def callback(weights, i, gradients):
            # Original author's note: computing the full-sample
            # log-likelihood is too slow, so it was abandoned.
            saved_weights[i, :] = weights
            if i % log_every != 0:
                return
            # The full-data loss and the average path length are only needed
            # for the log line, so they are evaluated on logging iterations
            # only instead of on every optimiser step.
            apl = self.average_path_length(weights, X_train, F_train, y_train)
            loss_train = self.objective(weights, X_train, F_train, y_train)
            print('model: gru | iter: {} | loss: {:.2f} | apl: {:.2f}'.
                  format(i, loss_train, apl))

        print('Optimization started.')
        print(self.num_weights)
        optimized_weights = adam(grad_fun,
                                 init_weights,
                                 num_iters=num_iters,
                                 step_size=lr,
                                 callback=callback)
        self.saved_weights = saved_weights
        self.weights = optimized_weights
        return optimized_weights
    def train(self,
              X_train,
              y_train,
              batch_size=32,
              num_iters=1000,
              lr=1e-3,
              param_scale=0.01,
              log_every=100,
              init_weights=None):
        """Optimise the model weights with Adam on mini-batched gradients.

        Args:
            X_train, y_train: training arrays, forwarded unchanged to
                ``build_batched_grad`` and ``self.objective``.
            batch_size: mini-batch size handed to ``build_batched_grad``.
            num_iters: number of Adam iterations.
            lr: Adam step size.
            param_scale: scale passed to ``self.init_weights`` when
                ``init_weights`` is not supplied.
            log_every: progress is printed on iterations divisible by this.
            init_weights: optional starting weights.

        Returns:
            The optimised weights, also stored on ``self.weights``.
        """
        grad_fun = build_batched_grad(grad(self.objective), batch_size,
                                      X_train, y_train)
        if init_weights is None:
            init_weights = self.init_weights(param_scale)

        def callback(weights, i, gradients):
            if i % log_every != 0:
                return
            # The full-data objective is only needed for the log line, so it
            # is evaluated on logging iterations only instead of on every
            # optimiser step.
            loss_train = self.objective(weights, X_train, y_train)
            print('model: mlp | iter: {} | loss: {:.2f}'.format(
                i, loss_train))

        optimized_weights = adam(grad_fun,
                                 init_weights,
                                 num_iters=num_iters,
                                 step_size=lr,
                                 callback=callback)
        self.weights = optimized_weights
        return optimized_weights
  def fit(self, inputs, targets, A=None, num_epochs=64, batch_size=256,
      step_size=0.001, rs=npr, nonlinearity=relu, verbose=False, normalize=False,
      always_include=None,
      **input_grad_kwargs):
    """Train the network with Adam on cross-entropy plus two L2 penalties.

    The per-batch objective is the sum of
      * the cross-entropy of ``feed_forward`` against the one-hot targets,
      * ``self.l2_grads`` times the L2 norm of the input gradients selected
        by the mask ``A``, and
      * ``self.l2_params`` times the L2 norm of the parameters.

    Args:
      inputs: training inputs; cast to float32.
      targets: labels, converted with ``one_hot``.
      A: boolean mask with the same shape as the inputs selecting which
        input-gradient entries are penalised.  Defaults to all-False.
      num_epochs: number of passes over the data.
      batch_size: mini-batch size (capped at the number of rows).
      step_size: Adam step size.
      rs: random source handed to ``init_random_params``.
      nonlinearity: activation handed to ``feed_forward``.
      verbose: controls logging.  ``False``/``0``/``None`` disables it,
        ``True`` logs every iteration, an integer ``k`` logs every ``k``-th
        iteration, and a callable is used as a predicate on the iteration
        number.
      normalize: if true, divide the cross-entropy by the batch row count and
        the input-gradient penalty by the number of selected mask entries
        (each floored at 1).
      always_include: optional index into the data whose rows are prepended
        to every batch.
      **input_grad_kwargs: forwarded to ``input_gradients``.

    The fitted parameters are stored in ``self.params``; nothing is returned.
    """
    X = inputs.astype(np.float32)
    y = one_hot(targets)
    if A is None: A = np.zeros_like(X).astype(bool)
    params = init_random_params(0.1, [X.shape[1]] + self.layers + [y.shape[1]], rs=rs)

    # Normalise `verbose` into a predicate (or None) once.  bool is tested
    # before int because bool is an int subclass: previously `verbose=True`
    # ended up being called like a function (TypeError) and `verbose=0`
    # produced `iteration % 0` (ZeroDivisionError).
    if callable(verbose):
      should_log = verbose
    elif isinstance(verbose, bool):
      should_log = (lambda iteration: True) if verbose else None
    elif isinstance(verbose, int) and verbose != 0:
      should_log = lambda iteration: iteration % verbose == 0
    else:
      should_log = None

    batch_size = min(batch_size, X.shape[0])
    num_batches = int(np.ceil(X.shape[0] / batch_size))

    def batch_indices(iteration):
      # Cycle through the batches in order; the last slice may be short.
      idx = iteration % num_batches
      return slice(idx * batch_size, (idx+1) * batch_size)

    def objective(params, iteration):
      idx = batch_indices(iteration)
      Ai = A[idx]
      Xi = X[idx]
      yi = y[idx]

      if always_include is not None:
        Ai = np.vstack((A[always_include], Ai))
        Xi = np.vstack((X[always_include], Xi))
        yi = np.vstack((y[always_include], yi))

      if normalize:
        sumA = max(1., float(Ai.sum()))
        lenX = max(1., float(len(Xi)))
      else:
        sumA = 1.
        lenX = 1.

      crossentropy = -np.sum(feed_forward(params, Xi, nonlinearity) * yi) / lenX
      rightreasons = self.l2_grads * l2_norm(input_gradients(params, **input_grad_kwargs)(Xi)[Ai]) / sumA
      smallparams = self.l2_params * l2_norm(params)

      if should_log is not None and should_log(iteration):
        # NOTE(review): `.value` relies on the traced-value attribute of the
        # autograd version in use - confirm it exists in the pinned version.
        print('Iteration={}, crossentropy={}, rightreasons={}, smallparams={}, sumA={}, lenX={}'.format(
          iteration, crossentropy.value, rightreasons.value, smallparams.value, sumA, lenX))

      return crossentropy + rightreasons + smallparams

    self.params = adam(grad(objective), params, step_size=step_size, num_iters=num_epochs*num_batches)
Exemple #5
0
    def fit(self, target_log_q, init_params=None, n_iter=1000, n_samples=1,
            n_samples_per_report=10, report_interval=10, step_size=0.01, annealed=True):
        """Fit normalizing flow to target_log_q by minimizing variational free energy.

        Args:
            target_log_q: target log-density, forwarded to
                ``self.reparameterization_gradient`` and the progress logger.
            init_params: optional starting parameter vector.  When omitted,
                ``self.n_params`` values are drawn as ``0.01 * randn``.
            n_iter: number of Adam iterations (also the length of the
                ``beta`` schedule).
            n_samples: sample count forwarded to
                ``self.reparameterization_gradient``.
            n_samples_per_report: forwarded to ``self.progress_logger_factory``.
            report_interval: forwarded to ``self.progress_logger_factory``.
            step_size: Adam step size.
            annealed: if true, ``beta`` ramps linearly from 0.001 to 1.0 over
                the run; otherwise it is 1.0 throughout.

        The optimized parameters are stored in ``self.params``; nothing is
        returned.
        """
        if annealed:
            beta = np.linspace(0.001, 1.0, n_iter)
        else:
            beta = np.ones(n_iter)

        # Identity test, not `== None`: comparing an ndarray with `==` is
        # element-wise, so a caller-supplied array made the `if` raise
        # "truth value of an array is ambiguous".
        if init_params is None:
            init_params = 0.01 * np.random.randn(self.n_params)

        self.optimization_history = []
        progress_log_callback = self.progress_logger_factory(target_log_q, n_samples_per_report, report_interval)

        # Gradient for iteration i, evaluated at that iteration's beta.
        reparam_gradient = lambda params, i: self.reparameterization_gradient(params, target_log_q, n_samples, beta[i])

        self.params = adam(grad=reparam_gradient, init_params=init_params, step_size=step_size,
                           callback=progress_log_callback, num_iters=n_iter)
    def fit(self,
            inputs,
            targets,
            A=None,
            num_epochs=64,
            batch_size=256,
            step_size=0.001,
            rs=npr,
            nonlinearity=relu,
            **input_grad_kwargs):
        """Train the network with Adam on a regularised cross-entropy.

        Each iteration uses one contiguous mini-batch and minimises the sum of
        the cross-entropy, an L2 penalty on the parameters (weight
        ``self.l2_params``) and an L2 penalty on the input gradients selected
        by the boolean mask ``A`` (weight ``self.l2_grads``).

        Args:
            inputs: training inputs; cast to float32.
            targets: labels, converted with ``one_hot``.
            A: boolean mask shaped like the inputs; defaults to all-False.
            num_epochs: number of passes over the data.
            batch_size: mini-batch size (capped at the number of rows).
            step_size: Adam step size.
            rs: random source handed to ``init_random_params``.
            nonlinearity: activation handed to ``feed_forward``.
            **input_grad_kwargs: forwarded to ``input_gradients``.

        The fitted parameters are stored in ``self.params``; nothing is
        returned.
        """
        X = inputs.astype(np.float32)
        y = one_hot(targets)
        if A is None:
            A = np.zeros_like(X).astype(bool)

        layer_sizes = [X.shape[1]] + self.layers + [y.shape[1]]
        params = init_random_params(0.1, layer_sizes, rs=rs)

        batch_size = min(batch_size, X.shape[0])
        num_batches = int(np.ceil(X.shape[0] / batch_size))

        def objective(params, iteration):
            # Batches are visited cyclically; the final slice may be short.
            which = iteration % num_batches
            rows = slice(which * batch_size, (which + 1) * batch_size)

            # Negated sum of the masked network outputs (cross-entropy).
            neg_log_lik = -np.sum(
                feed_forward(params, X[rows], nonlinearity) * y[rows])
            # L2 regularization on the parameters directly.
            param_penalty = self.l2_params * l2_norm(params)
            # "Explanation regularization": L2 norm of masked input gradients.
            masked_grads = input_gradients(
                params, **input_grad_kwargs)(X[rows])[A[rows]]
            grad_penalty = self.l2_grads * l2_norm(masked_grads)

            return neg_log_lik + param_penalty + grad_penalty

        self.params = adam(grad(objective),
                           params,
                           step_size=step_size,
                           num_iters=num_epochs * num_batches)
Exemple #7
0
def train(train_data, test_data, layer_widths, step_size, num_epochs, batch_size):
    """Train with Adam on mini-batches of ``auto_enc_loss`` and return the parameters.

    Args:
        train_data: training set; sliced along its first axis into batches.
        test_data: held-out set, used only for the per-epoch report.
        layer_widths: forwarded to ``init_params`` to build the initial
            parameters (with scale 0.1).
        step_size: Adam step size.
        num_epochs: number of passes over ``train_data``.
        batch_size: number of rows per mini-batch.

    Returns:
        The parameters returned by ``adam``.
    """
    num_batches = int(np.ceil(len(train_data) / batch_size))
    num_iters = num_batches * num_epochs
    print("num_batches: ", num_batches)
    print("num_iters: ", num_iters)

    def minibatch_loss(params, iteration):
        """Loss on the mini-batch that belongs to this iteration."""
        start = (iteration % num_batches) * batch_size
        return auto_enc_loss(train_data[start:start + batch_size], params)

    def report_epoch(params, iteration, gradient):
        """Print progress and full-set errors at the start of each epoch."""
        if iteration % num_batches != 0:
            return
        print(100.0 * iteration / num_iters, '% done')
        print('Training error: ', auto_enc_loss(train_data, params))
        print('Test error: ', auto_enc_loss(test_data, params))

    loss_grad = grad(minibatch_loss)
    initial = init_params(layer_widths, 0.1)

    # The original also carried a disabled variant calling `rmsprop` with
    # exactly these keyword arguments.
    return adam(
        grad=loss_grad,
        init_params=initial,
        step_size=step_size,
        num_iters=num_iters,
        callback=report_epoch
    )
            elbos.append(elbo_val)
            if t % 50 == 0:
                print("Iteration {} lower bound {}".format(t, elbo_val))

        init_mean    = -1 * np.ones(D)
        init_log_std = -5 * np.ones(D)
        init_var_params = np.concatenate([init_mean, init_log_std])
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [.1, .25, .5]
    for step_size in step_sizes:
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size,
                                                    num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size,
                                         num_iters=n, callback=cb, mass=.001)
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12,8))
    colors = ['b', 'k', 'g']
    for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
        plt.plot(np.arange(len(stand_lls)), stand_lls,
                 '--', label="standard (adam, step-size = %2.2f)"%ss, alpha=.5, c=col)
        plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
            L2_VAR_2, NUM_TRAIN, train_images, train_labels, C, D, L)

        # Build callback for ADAM optimizer
        def init_callback(params, t, g):
            """Optimizer callback: print the negated objective for iteration ``t``."""
            template = "Initialization iteration {} log-likelihood {}"
            print(template.format(t, -objective(params, t)))

        # initialize weights
        pre_init_weights = np.ones(L)

        # optimize weights
        print("Initializing weights...")
        init_weights = adam(init_gradient,
                            pre_init_weights,
                            step_size=0.1,
                            num_iters=INIT_ITERS,
                            callback=init_callback)

        # pickle processed data in /cache (if doesn't already exist)
        if not os.path.exists('cache'):
            print('creating cache folder')
            os.makedirs('cache')
        if not os.path.isfile(picklefilename):
            print('saving pickled regression initalization data')
            np.savez(picklefilename, init_weights=init_weights)
    ###############################################################

    ###############################################################
    # OPTIMIZE NOISE-AWARE LIKELIHOOD #
    ###############################################################
Exemple #10
0
        print("Iteration {} lower bound {}".format(
            t, -objective(params, t, 1000)))
        if t % 10 == 0:
            plt.cla()
            # plot target
            target_distribution = lambda x: np.exp(lnpdf(x, t))
            plot_isocontours(ax, target_distribution, fill=True)
            # plot approximate distribution
            plot_q_dist(ax, params)
            ax.set_xlim((-3, 3))
            ax.set_ylim((-4, 4))
            plt.draw()
            plt.pause(1.0 / 30.0)

    #####################
    # Run optimization  #
    #####################
    print("Optimizing variational parameters...")
    th = .5 * npr.randn(num_variational_params) - 3.

    num_objective_samps = 10

    def grad_wrap(th, t):
        """Two-argument gradient for the optimizer.

        Calls ``gradient`` with ``num_objective_samps`` fixed as its third
        argument.
        """
        grad_value = gradient(th, t, num_objective_samps)
        return grad_value

    variational_params = adam(grad_wrap,
                              th,
                              step_size=.02,
                              num_iters=10000,
                              callback=callback)
Exemple #11
0
        ax.set_xticks([])
        ax.set_ylim([-4, 1])
        ax.set_xlim([-2, 2])

    # Set up figure.
    fig = plt.figure(figsize=(8,8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.ion()
    plt.show(block=False)

    num_plotting_samples = 51

    def callback(params, t, g):
        """Optimizer callback: log the bound and redraw target, fit and samples."""
        bound = -objective(params, t)
        print("Iteration {} lower bound {}".format(t, bound))

        def target_distribution(x):
            return np.exp(log_posterior(x, t))

        def var_distribution(x):
            return np.exp(diag_gaussian_density_from_params(params, x))

        plt.cla()
        for density in (target_distribution, var_distribution):
            plot_isocontours(ax, density)

        # A freshly seeded generator gives the same draws on every redraw.
        plot_rng = npr.RandomState(0)
        drawn = iwae_sample(log_posterior, params, t, k,
                            num_plotting_samples, plot_rng)
        plt.plot(drawn[:, 0], drawn[:, 1], 'x')

        plt.draw()
        plt.pause(1.0/30.0)

    print("Optimizing variational parameters...")
    adam(grad(objective), init_gaussian_var_params(D), step_size=0.1, num_iters=2000, callback=callback)
        # log_weights = params[:10] - logsumexp(params[:10])
        print("Iteration {} lower bound {}".format(t, -objective(params, t)))
        # print (np.exp(log_weights))

        plt.cla()
        target_distribution = lambda x: np.exp(log_density(x))
        var_distribution    = lambda x: np.exp(variational_log_density(params, x))
        plot_isocontours(ax, target_distribution)
        plot_isocontours(ax, var_distribution, cmap=plt.cm.bone)
        ax.set_autoscale_on(False)


        # rs = npr.RandomState(0)
        # samples = variational_sampler(params, num_plotting_samples, rs)
        # plt.plot(samples[:, 0], samples[:, 1], 'x')

        plt.draw()
        plt.pause(1.0/30.0)

    print("Optimizing variational parameters...")
    variational_params = adam(grad(objective), init_var_params(D), step_size=0.1,
                              num_iters=2000, callback=callback)







            elbos.append(elbo_val)
            if t % 50 == 0:
                print("Iteration {} lower bound {}".format(t, elbo_val))

        init_mean = -1 * np.ones(D)
        init_log_std = -5 * np.ones(D)
        init_var_params = np.concatenate([init_mean, init_log_std])
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [.1, .25, .5]
    for step_size in step_sizes:
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(
            gradient, init, step_size=step_size, num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(natural_gradient,
                                         init,
                                         step_size=step_size,
                                         num_iters=n,
                                         callback=cb,
                                         mass=.001)
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12, 8))
    colors = ['b', 'k', 'g']

    def callback(params, t, g):
        """Optimizer callback: log the bound and plot functions sampled from the posterior."""
        bound = -objective(params, t)
        print("Iteration {} lower bound {}".format(t, bound))

        # Draw 10 weight vectors from the current Gaussian approximation.
        # The generator is re-seeded on every call, so the underlying noise
        # is the same from one redraw to the next.
        noise_rng = npr.RandomState(0)
        mean, log_std = unpack_params(params)
        weight_draws = noise_rng.randn(10, num_weights) * np.exp(log_std) + mean

        # Evaluate the sampled functions on a fixed 1-D grid (as a column).
        grid = np.linspace(-8, 8, num=400)
        curves = predictions(weight_draws, grid[:, np.newaxis])

        # Plot data and functions.
        plt.cla()
        ax.plot(inputs.ravel(), targets.ravel(), 'bx')
        ax.plot(grid, curves[:, :, 0].T)
        ax.set_ylim([-2, 3])
        plt.draw()
        plt.pause(1.0/60.0)

    # Initialize variational parameters
    rs = npr.RandomState(0)
    init_mean    = rs.randn(num_weights)
    init_log_std = -5 * np.ones(num_weights)
    init_var_params = np.concatenate([init_mean, init_log_std])

    print("Optimizing variational parameters...")
    variational_params = adam(gradient, init_var_params,
                              step_size=0.1, num_iters=1000, callback=callback)
Exemple #15
0
    inputs, targets = build_toy_dataset()

    def objective(weights, t):
        """Negated sum of the data ``logprob`` and the Gaussian weight prior.

        ``t`` is accepted for the optimizer's calling convention and unused.
        """
        data_term = logprob(weights, inputs, targets)
        prior_term = log_gaussian(weights, weight_prior_variance)
        return -data_term - prior_term

    print(grad(objective)(init_params, 0))

    # Set up figure.
    fig = plt.figure(figsize=(12,8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(params, t, g):
        """Optimizer callback: log the negated objective and redraw the fit."""
        print("Iteration {} log likelihood {}".format(t, -objective(params, t)))

        # Network prediction on a fixed column of 300 inputs.
        grid = np.linspace(-7, 7, num=300).reshape(300, 1)
        fitted = nn_predict(params, grid)

        # Plot data and functions.
        plt.cla()
        ax.plot(inputs.ravel(), targets.ravel(), 'bx', ms=12)
        ax.plot(grid, fitted, 'r', lw=3)
        ax.set_ylim([-1, 1])
        plt.draw()
        plt.pause(1.0/60.0)

    print("Optimizing network parameters...")
    optimized_params = adam(grad(objective), init_params,
                            step_size=0.01, num_iters=1000, callback=callback)
Exemple #16
0
    log_likelihoods = []

    print("     Epoch     |    params   ")

    def print_logLikelihood(params, iter, gradient):
        """Optimizer callback: scatter-plot the flow outputs and record the log-likelihood."""
        current_ll = logLikelihood(params, iter)

        # feed_forward returns three values; only the first two are plotted.
        h1, h2, _ = feed_forward(params, iter)
        plt.scatter(h1, h2)
        plt.show()

        log_likelihoods.append(current_ll)
        row = "{:15}|{:20}".format(iter, current_ll)
        print(row)
        # A disabled variant sampled random h1, h2, mapped them through
        # inverse_flow and scatter-plotted the result here.

    optimized_params = adam(grad_logLikelihood,
                            init_params,
                            step_size=learning_rate,
                            num_iters=num_epoch,
                            callback=print_logLikelihood)
    x_axis = np.linspace(0, num_epoch, num_epoch)
    plt.plot(x_axis, log_likelihoods)
    plt.show()
    h1 = np.random.randn(batch_size, 1)
    h2 = np.random.randn(batch_size, 1)
    x1, x2 = inverse_flow(h1, h2, optimized_params)
    plt.scatter(x1, x2)
    plt.show()
Exemple #17
0
            training_text  = one_hot_to_string(train_inputs[:,t,:])
            predicted_text = one_hot_to_string(logprobs[:,t,:])
            print(training_text.replace('\n', ' ') + "|" +
                  predicted_text.replace('\n', ' '))

    def training_loss(params, iter):
        """Negative RNN log-likelihood with ``train_inputs`` as both inputs and targets.

        ``iter`` is accepted for the optimizer's calling convention and unused.
        """
        log_lik = rnn_log_likelihood(params, train_inputs, train_inputs)
        return -log_lik

    def callback(weights, iter, gradient):
        """Every 10th iteration, print the training loss and sample predictions."""
        if iter % 10 != 0:
            return
        current_loss = training_loss(weights, 0)
        print("Iteration", iter, "Train loss:", current_loss)
        print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_grad = grad(training_loss)

    print("Training RNN...")
    trained_params = adam(training_loss_grad, init_params, step_size=0.1,
                          num_iters=1000, callback=callback)

    print()
    print("Generating text from RNN...")
    num_letters = 30
    for t in range(20):
        text = ""
        for i in range(num_letters):
            seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
            logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
            text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
        print(text)
# Testing the mapping mechanism
#
# Script outline: build a 1-D grid and a random set of sigmoid parameters,
# evaluate the reparameterisation and its derivative on the grid, then fit the
# parameters with Adam using the KL gradient and plot the result.

# Grid handed to the reparameterisation below (presumably 10 points spanning
# [-1, +1] - confirm against utils.spacing_gen).
w_spc = utils.spacing_gen(10, -1, +1, dim=1)
#A,B,C,L,P = sig.params_init(10,mode='linear')
params = sig.params_init(num_sig=50, mode='random')

# Map the grid through the reparameterisation and evaluate df/dw on it.
z_spc = sig.reparam(w_spc, params, indep=False)
dzdw = sig.df_dw(w_spc, params)
#z_spc = sig.reparam(w_spc, A,B,C,L,P, indep=True)

#disply.line_2d(w_spc, z_spc )

# Sample count handed to sig.grad_kl_init.
SAMPLING_SIZE = 1000

# Base distribution q(w): log-density and sampler from utils.uniform_init.
log_qw, w_gen = utils.uniform_init(-1, +1, dim=1)
# Target p(z): log-density and sampler from utils.gaussian_mix_init.
# NOTE(review): the three arrays look like component means, scales and
# weights (the last sums to 1) - confirm the argument order.
log_pz, pz_gen = utils.gaussian_mix_init(np.array([1.0, -0.5, -2.]),
                                         np.array([0.1, 0.2, 0.05]),
                                         np.array([0.3, 0.3, 0.4]))

#sig.plot_qz(params,log_qw,target=log_qw, testing=True)
#w_samples = w_gen(SAMPLING_SIZE)

# Gradient function passed to Adam as its first argument.
grad_kl = sig.grad_kl_init(log_pz, log_qw, params, w_gen, SAMPLING_SIZE)

trained_params = adam(grad_kl, params, step_size=0.1, num_iters=500)

#sig.plot_qz(params,log_qw,target=log_pz, testing=True)
# Plot the fitted q(z) against the target density.
sig.plot_qz(trained_params, log_qw, target=log_pz, testing=True)
#grad_A = sig.grad_kl(w_samples,log_pz, log_qw, params)
#print
#print grad_A
print('Done')
Exemple #19
0
        g - ignore
    '''

    def callback(params, t, g):
        """Optimizer callback: print the bound plus test and train accuracy."""
        lower_bound = -objective(params, t)
        test_acc = accuracy(test_images, test_labels, params)
        train_acc = accuracy(train_images, train_labels, params)
        template = "i {}, lower bound {}, test {}, train {} "
        print(template.format(t, lower_bound, test_acc, train_acc))

    print("Optimizing variational parameters...")
    init_mean = 0 * np.ones(D)
    init_log_std = 0 * np.ones(D)
    init_var_params = np.concatenate([init_mean, init_log_std])
    variational_params = adam(gradient,
                              init_var_params,
                              step_size=learning_rate,
                              num_iters=train_iters,
                              callback=callback)

    # ---------------- STOCHASTIC VARIATIONAL INFERENCE DONE ---------------
    # now get Monte Carlo estimate p(t | x) over the test and training set

    print('TRAIN set accuracy: ',
          accuracy(train_images, train_labels, variational_params))
    print('TEST set accuracy: ',
          accuracy(test_images, test_labels, variational_params))

    means = variational_params[:D].reshape(784, 10).T
    std = np.exp(variational_params[D:]).reshape(784, 10).T

    save_images(means, 'svi_means_sigma_%.5f.png' % sigma_prior, ims_per_row=5)
Exemple #20
0
                predicted_text.replace('\n', ' '))

    def training_loss(params, iter):
        """Return the negated ``rnn_log_likelihood`` of ``train_inputs`` against itself.

        The ``iter`` argument is not used; it is there because the optimizer
        passes the iteration number.
        """
        likelihood = rnn_log_likelihood(params, train_inputs, train_inputs)
        return -likelihood

    def callback(weights, iter, gradient):
        """Report the training loss and a sample prediction on every 10th iteration."""
        is_report_step = iter % 10 == 0
        if not is_report_step:
            return
        loss_now = training_loss(weights, 0)
        print("Iteration", iter, "Train loss:", loss_now)
        print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_grad = grad(training_loss)

    print("Training RNN...")
    trained_params = adam(training_loss_grad,
                          init_params,
                          step_size=0.1,
                          num_iters=1000,
                          callback=callback)

    print()
    print("Generating text from RNN...")
    num_letters = 30
    for t in range(20):
        text = ""
        for i in range(num_letters):
            seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
            logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
            text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
        print(text)
                Z,
                zdir='z',
                offset=-100,
                cmap=cm.coolwarm,
                zorder=0,
                levels=np.linspace(0, 30, 30))

    a = Arrow(params[0], params[1], -g[0], -g[1], width=0.5, zorder=2)
    ax2.add_patch(a)
    art3d.pathpatch_2d_to_3d(a, z=-100, zdir="z")
    # ax2.plot([params[0], params[0]],
    #          [params[1], params[1]],
    #          [-50, elbo(params, 0)], '--', linewidth=2.0, zorder=5)
    # ax2.scatter(params[0], params[1], elbo(params, 0), marker='o', s=100)
    plt.draw()
    plt.pause(1.0 / 30.0)


# Gradient of `elbo` with respect to its first argument.
gradient = grad(elbo)

# Initial variational parameters: a length-1 mean of 4 followed by a length-1
# log standard deviation of -5, concatenated into one vector.
init_mean = 4 * np.ones(1)
init_log_std = -5 * np.ones(1)
init_var_params = np.concatenate([init_mean, init_log_std])
# Run 400 Adam steps; `callback` is invoked by the optimizer during the run.
variational_params = adam(gradient,
                          init_var_params,
                          step_size=0.1,
                          num_iters=400,
                          callback=callback)

# Show the figure once optimisation has finished.
plt.show()
Exemple #22
0
def run_variational_inference_gumbel(Ys,
                                     A,
                                     W_true,
                                     Ps_true,
                                     Cs,
                                     etasq,
                                     stepsize=0.1,
                                     init_with_true=True,
                                     num_iters=250,
                                     temp_prior=0.1,
                                     num_sinkhorn=20,
                                     num_mcmc_samples=500,
                                     temp=1):
    """Fit a variational posterior over W and the matrices Ps with Adam.

    The variational family is a Gaussian over the flattened W (mean and log
    variance) and, for each of the M matrices, Gumbel-perturbed log-means that
    are pushed through ``sinkhorn_logspace``.  The negative ELBO is minimised
    with ``adam`` and statistics are recorded at every iteration.

    Args:
        Ys: array of shape (M, T, N) (unpacked as ``M, T, N = Ys.shape``).
        A: array of shape (N, N) (asserted inside ``elbo``).
        W_true: reference W, used for the MSE statistic and, when
            ``init_with_true`` is set, passed as ``map_W`` to the initialiser.
        Ps_true: reference matrices, used for the "num correct" statistic.
        Cs: one mask per matrix; used to index ``log_mu_P[C]`` and as
            ``1 - C`` penalties.
        etasq: forwarded to ``log_likelihood``.
        stepsize: Adam step size.
        init_with_true: whether to initialise with ``map_W=W_true``.
        num_iters: number of Adam iterations.
        temp_prior: prior temperature used in ``gumbel_distance``.
        num_sinkhorn: iteration count passed to ``sinkhorn_logspace``.
        num_mcmc_samples: number of samples averaged in the ELBO estimate.
        temp: temperature dividing the Gumbel-perturbed log-means.

    Returns:
        Tuple ``(times, elbos, lls, mses, num_corrects, Ps_samples,
        W_samples, A, W_true)`` where ``times`` is relative to the first
        recorded time and the next four entries are arrays with one entry per
        callback invocation.
    """
    def sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn, temp):
        """Draw one sample ``(W, Ps)`` from the variational distribution."""

        # Sample W
        mu_W, log_sigmasq_W, log_mu_Ps = params
        W_flat = mu_W + np.sqrt(np.exp(log_sigmasq_W)) * npr.randn(*mu_W.shape)

        W = unpack_W(W_flat)
        #W = W_true
        # Sample Ps: run sinkhorn to move mu close to Birkhoff
        Ps = []
        for log_mu_P , unpack_P, C in \
                zip(log_mu_Ps,  unpack_Ps, Cs):
            # Unpack the mean, run sinkhorn, then pack it again
            log_mu_P = unpack_P(log_mu_P)
            a = log_mu_P.shape
            # Add Gumbel noise, -log(-log(U)) with U ~ Uniform(0, 1), and
            # divide by the temperature.
            log_mu_P = (
                log_mu_P +
                -np.log(-np.log(np.random.uniform(0, 1, (a[0], a[1]))))) / temp

            # Entries where C is 0 get a -1e8 offset before Sinkhorn.
            log_mu_P = sinkhorn_logspace(log_mu_P - 1e8 * (1 - C),
                                         num_sinkhorn)
            log_mu_P = log_mu_P[C]

            ##Notice how we limit the variance
            P = np.exp(log_mu_P)
            P = unpack_P(P)

            Ps.append(P)

        Ps = np.array(Ps)
        return W, Ps

    def elbo(params, unpack_W, unpack_Ps, Ys, A, Cs, etasq, num_sinkhorn,
             num_mcmc_samples, temp_prior, temp):
        """
        Provides a stochastic estimate of the variational lower bound,
        normalised by T * M * N.

        params is the tuple (mu_W, log_sigmasq_W, log_mu_Ps); the remaining
        arguments are forwarded to sample_q, log_likelihood and
        gumbel_distance.
        """
        def gumbel_distance(log_mu_Ps, temp_prior, temperature, Cs):
            """Sum the Gumbel prior/posterior term over the masked entries of every matrix."""
            arr = 0
            for n in range(len(log_mu_Ps)):
                log_mu_P = unpack_Ps[n](log_mu_Ps[n])
                C = Cs[n]
                log_mu_P = log_mu_P[C]
                # NOTE(review): full slice; has no effect on the values.
                log_mu_P = log_mu_P[:]
                # 0.5772156649 is the Euler-Mascheroni constant.
                arr += np.sum(
                    np.log(temp_prior) -
                    0.5772156649 * temp_prior / temperature -
                    log_mu_P * temp_prior / temperature - np.exp(
                        gammaln(1 + temp_prior / temperature) -
                        log_mu_P * temp_prior / temperature) -
                    (np.log(temperature) - 1 - 0.5772156649))
            return arr

        M, T, N = Ys.shape
        assert A.shape == (N, N)
        assert len(unpack_Ps) == M

        mu_W, log_sigmasq_W, log_mu_Ps = params

        L = 0

        for smpl in range(num_mcmc_samples):
            W, Ps = sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn,
                             temp)

            # Compute the ELBO
            L += log_likelihood(Ys, A, W, Ps, etasq) / num_mcmc_samples

            # NOTE(review): this term does not depend on the sample and is
            # added once per sample without the 1 / num_mcmc_samples factor
            # used on the likelihood - confirm this weighting is intended.
            L += gumbel_distance(log_mu_Ps, temp_prior, temp, Cs)
        # Add the entropy terms

        L += gaussian_entropy(log_sigmasq_W)
        fac = 1000
        ## This term adds the KL divergence between the W prior and posterior, with entries of W having a prior variance
        # sigma = 1/fac; for details see the appendix of the VAE paper.

        L += - 0.5 * log_sigmasq_W.size * (np.log(2 * np.pi)) -\
             0.5 * fac* np.sum(np.exp(log_sigmasq_W)) - 0.5 * fac * np.sum(
            np.power(mu_W, 2))
        # Normalize objective

        L /= (T * M * N)

        return L

    M, T, N = Ys.shape
    # Initialize variational parameters
    if init_with_true:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps,  unpack_Ps = \
            initialize_params_gumbel(A, Cs,  map_W=W_true)
    else:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps, unpack_Ps = \
            initialize_params_gumbel(A, Cs)

    # Make a function to convert a flat array of params back into
    # the tuple (mu_W, log_sigmasq_W, log_mu_Ps)
    flat_params, unflatten = \
        flatten((mu_W, log_sigmasq_W, log_mu_Ps ))

    # Negative ELBO as a function of the flat parameters; t is unused.
    objective = \
        lambda flat_params, t: \
            -1 * elbo(unflatten(flat_params), unpack_W, unpack_Ps, Ys, A, Cs, etasq,
                      num_sinkhorn, num_mcmc_samples, temp_prior, temp)

    # Define a callback to monitor optimization progress
    elbos = []
    lls = []
    mses = []

    num_corrects = []
    times = []

    W_samples = []
    Ps_samples = []

    def collect_stats(params, t):
        """Append timing, ELBO, log-likelihood, MSE and match counts for ``params``."""

        # Every 10th iteration, store 100 posterior samples of (W, Ps).
        if t % 10 == 0:
            W_samples.append([])
            Ps_samples.append([])
            for i in range(100):
                W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs,
                                 num_sinkhorn, temp)
                W_samples[-1].append(W)
                Ps_samples[-1].append(Ps)

        times.append(time.time())
        # A fresh stochastic ELBO estimate (objective is the negative ELBO).
        elbos.append(-1 * objective(params, 0))

        # Sample the variational posterior and compute num correct matches
        # NOTE(review): the three names unpacked here are not used below.
        mu_W, log_sigmasq_W, log_mu_Ps = unflatten(params)

        # This sample uses 10 Sinkhorn iterations and temperature 1.0 rather
        # than num_sinkhorn / temp.
        W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs, 10, 1.0)

        # NOTE(review): `list` shadows the builtin, and the indices gathered
        # by this loop are never read afterwards.
        list = []
        for i in range(A.shape[0]):
            list.extend(np.where(Ps[0, i, :] + Ps_true[0, i, :] == 1)[0])

        # Mean squared error between W * A and W_true * A.
        mses.append(np.mean((W * A - W_true * A)**2))

        # Round doubly stochastic matrix P to the nearest permutation matrix
        num_correct = np.zeros(M)
        Ps2 = np.zeros((Ps.shape[0], A.shape[0], A.shape[0]))
        for m, P in enumerate(Ps):
            # Entries where Cs[m] is 0 get a 1e8 cost in the assignment.
            row, col = linear_sum_assignment(-P + 1e8 * (1 - Cs[m]))
            Ps2[m] = perm_to_P(col)
            num_correct[m] = n_correct(perm_to_P(col), Ps_true[m])
        num_corrects.append(num_correct)

        # Log-likelihood under the rounded matrices, normalised by M * T * N.
        lls.append(log_likelihood(Ys, A, W, Ps2, etasq) / (M * T * N))

    def callback(params, t, g):
        """Adam callback: collect statistics and print the latest values."""
        collect_stats(params, t)
        print(
            "Iteration {}.  ELBO: {:.4f} LL: {:.4f} MSE(W): {:.4f}, Num Correct: {}"
            .format(t, elbos[-1], lls[-1], mses[-1], num_corrects[-1]))

    # Run optimizer

    # Record statistics for the initial parameters (t = -1) before optimising.
    callback(flat_params, -1, None)
    variational_params = adam(grad(objective),
                              flat_params,
                              step_size=stepsize,
                              num_iters=num_iters,
                              callback=callback)

    # Make the timestamps relative to the first recorded one.
    times = np.array(times)
    times -= times[0]


    return times, np.array(elbos), np.array(lls), np.array(mses), \
           np.array(num_corrects), Ps_samples, W_samples, A, W_true
Exemple #23
0
    # construct recognition and decoder networks and initialize them
    # (presumably each tuple is (layer width, activation) and the first
    # argument is the input dimension -- TODO confirm against init_gresnet)
    recognize, recogn_params = \
        init_gresnet(P, [(40, np.tanh), (40, np.tanh), (2*N, gaussian_info)])
    decode,   loglike_params = \
        init_gresnet(N, [(40, np.tanh), (40, np.tanh), (2*P, gaussian_mean)])
    loglike = make_loglike(decode)

    # initialize gmm parameters
    pgm_params = init_pgm_param(T, N, alpha=1., niw_conc=1., random_scale=3.)
    # All trainable parameters travel together as one tuple.
    params = pgm_params, loglike_params, recogn_params

    # set up encoder/decoder and plotting
    # NOTE(review): encode_mean and decode_mean are not used again in this
    # excerpt; data and num_clusters come from outside it.
    encode_mean, decode_mean = make_encoder_decoder(recognize, decode)
    plot = make_plotter_2d(recognize,
                           decode,
                           data,
                           num_clusters,
                           params,
                           plot_every=100,
                           plot_every_density=500)

    # instantiate svae gradient function
    # NOTE(review): this is given pgm_prior_params, not the pgm_params built
    # above; pgm_prior_params is not defined in this excerpt -- confirm it is
    # set up earlier.
    gradfun = make_gradfun(run_inference, recognize, loglike, pgm_prior_params,
                           data)

    # optimize
    # The plotter built above is passed as the callback of the gradient
    # function; the optimised values replace params.
    params = adam(gradfun(batch_size=50, num_samples=1, callback=plot),
                  params,
                  num_iters=1000)
    # Fixed-seed random state shared by objective and print_perf below.
    seed = npr.RandomState(0)

    def objective(combined_params, iter):
        """Return the negated VAE lower bound on one minibatch, divided by data_dim.

        ``combined_params`` is a ``(gen_params, rec_params)`` pair and ``iter``
        selects the minibatch through ``batch_indices``.
        """
        minibatch_rows = batch_indices(iter)
        generator, recognizer = combined_params
        minibatch = train_images[minibatch_rows]
        lower_bound = vae_lower_bound(generator, recognizer, minibatch, seed)
        return -lower_bound / data_dim

    # Get gradients of objective using autograd.
    objective_grad = grad(objective)

    # Header row for the progress log.
    # NOTE(review): four column titles are printed here, while print_perf
    # below emits only two values per row.
    print(
        "     Epoch     |    Objective  |       Fake probability | Real Probability  "
    )

    def print_perf(combined_params, iter, grad):
        """Every 10th iteration, log the objective and save images drawn from the prior.

        ``grad`` is accepted to match the callback signature but is not used.
        """
        if iter % 10 != 0:
            return

        gen_params, _ = combined_params
        bound = np.mean(objective(combined_params, iter))
        epoch = iter // num_batches
        print("{:15}|{:20}".format(epoch, bound))

        # Draw 20 samples from the prior and write them out as an image file.
        fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
        save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    # Runs num_epochs * num_batches iterations in total and reports progress
    # through print_perf.
    optimized_params = adam(objective_grad,
                            combined_init_params,
                            step_size=step_size,
                            num_iters=num_epochs * num_batches,
                            callback=print_perf)
Exemple #25
0
            annealing = (.999**iter)

            # fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
            # save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)
            # epoch = iter//num_batches
            # batch_size = original_batch_size * (.5**epoch)
            # if batch_size < 1: batch_size =1
            # print ('batch size', batch_size)

            print("{}|{:15}|{:20}|{:25}".format(iter, iter // num_batches,
                                                bound, annealing))

        # if new_epoch != epoch:
        #     batch_size = batch_size / 2
        #     if batch_size < 1: batch_size =1
        #     print ('batch size', batch_size)
        #     epoch = new_epoch

    # # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    # optimized_params = adam(objective_grad, combined_init_params, step_size=step_size,
    #                         num_iters=num_epochs * num_batches, callback=print_perf)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    # NOTE(review): the iteration count is fixed at 2000 here, whereas the
    # commented-out call above used num_epochs * num_batches.
    optimized_params = adam(objective_grad,
                            params,
                            step_size=step_size,
                            num_iters=2000,
                            callback=print_perf)

    # Keep the optimised values under the name params.
    params = optimized_params
    # Bundle the initial generator and recognition parameters into the pair
    # that objective unpacks.
    combined_init_params = (init_gen_params, init_rec_params)

    # Minibatches per pass over train_images; ceil counts a final partial batch.
    # NOTE(review): relies on `/` being true division.
    num_batches = int(np.ceil(len(train_images) / batch_size))
    def batch_indices(iter):
        """Return the slice of rows for the minibatch used at step ``iter``.

        Steps wrap around modulo ``num_batches``; each slice spans
        ``batch_size`` consecutive indices.
        """
        start = (iter % num_batches) * batch_size
        return slice(start, start + batch_size)

    # Define training objective
    # Fixed-seed random state shared by objective and print_perf below.
    seed = npr.RandomState(0)
    def objective(combined_params, iter):
        """Return the negated VAE lower bound on one minibatch, divided by data_dim.

        ``combined_params`` is a ``(gen_params, rec_params)`` pair and ``iter``
        selects the minibatch through ``batch_indices``.
        """
        minibatch_rows = batch_indices(iter)
        generator, recognizer = combined_params
        minibatch = train_images[minibatch_rows]
        lower_bound = vae_lower_bound(generator, recognizer, minibatch, seed)
        return -lower_bound / data_dim

    # Get gradients of objective using autograd.
    objective_grad = grad(objective)

    # Header row for the progress log.
    # NOTE(review): four column titles are printed here, while print_perf
    # below emits only two values per row.
    print("     Epoch     |    Objective  |       Fake probability | Real Probability  ")
    def print_perf(combined_params, iter, grad):
        """Every 10th iteration, log the objective and save images drawn from the prior.

        ``grad`` is accepted to match the callback signature but is not used.
        """
        if iter % 10 != 0:
            return

        gen_params, _ = combined_params
        bound = np.mean(objective(combined_params, iter))
        epoch = iter // num_batches
        print("{:15}|{:20}".format(epoch, bound))

        # Draw 20 samples from the prior and write them out as an image file.
        fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
        save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)

    # The optimizers provided can optimize lists, tuples, or dicts of parameters.
    # Runs num_epochs * num_batches iterations in total and reports progress
    # through print_perf.
    optimized_params = adam(objective_grad, combined_init_params, step_size=step_size,
                            num_iters=num_epochs * num_batches, callback=print_perf)
Exemple #27
0
        if iter % 10 == 0:
            bound = training_loss_noAnneal(params)
            print("{:15}|{:20}".format(iter, bound))
        with open(r'dkfTrainTest.csv', 'a') as f:
            btrain = np.mean(training_loss_noAnneal(params))
            if btrain > 1:
                btrain = 1
            if btrain < -1:
                btrain = -1
            writer = csv.writer(f)
            writer.writerow([btrain])

    # Gradient of the training loss with respect to its first argument.
    # NOTE(review): the optimizer differentiates training_loss, while the
    # print_perf callback above reports training_loss_noAnneal.
    training_loss_grad = grad(training_loss)
    #pdb.set_trace()
    # Fixed schedule: 1000 iterations at step size 0.05, logging via print_perf.
    trained_params = adam(training_loss_grad,
                          params,
                          step_size=0.05,
                          num_iters=1000,
                          callback=print_perf)

    def plotTrainingCurve():
        """Plot the values logged in ``dkfTrainTest.csv`` and save the figure.

        Reads the CSV, plots every logged value against its row index on a
        cleared figure, and writes the result to ``trainingCurvedkf.jpg``.
        """
        # genfromtxt collapses a file holding a single value to a 0-d array,
        # which has no shape[0]; atleast_1d keeps that case plottable.
        X = np.atleast_1d(np.genfromtxt(r'dkfTrainTest.csv', delimiter=','))
        t = np.arange(X.shape[0])
        plt.clf()
        plt.plot(t, X)
        #plt.plot(t,X[:,1])
        #plt.legend(['Train', 'Test'])
        plt.savefig('trainingCurvedkf.jpg')

    # Render the logged curve after the optimizer call above has returned.
    plotTrainingCurve()
Exemple #28
0
        ax.set_xticks([])

    # Interactive, non-blocking figure so the callback below can redraw it
    # while the optimizer is running.
    fig = plt.figure(figsize=(8,8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.ion()
    plt.show(block=False)

    # Number of variational samples scattered on the plot by callback.
    num_plotting_samples = 51

    def callback(params, t, g):
        """Print the current lower bound and parameters, then redraw the plot.

        Draws isocontours of the target and variational densities and
        scatters samples from the variational distribution.  ``g`` is accepted
        to match the callback signature but is not used.
        """
        lower_bound = -objective(params, t)
        print("Iteration {} lower bound {}".format(t, lower_bound))
        print(params)

        plt.cla()

        def target_distribution(x):
            return np.exp(log_density(x, t))

        def var_distribution(x):
            return np.exp(variational_log_density(params, x))

        plot_isocontours(ax, target_distribution)
        plot_isocontours(ax, var_distribution, cmap=plt.cm.bone)
        ax.set_autoscale_on(False)

        # The sampler's random state is seeded with an integer drawn from 0..5.
        sampler_seed = np.random.randint(0,6)
        rs = npr.RandomState(sampler_seed)
        samples = variational_sampler(params, num_plotting_samples, rs)
        xs, ys = samples[:, 0], samples[:, 1]
        plt.plot(xs, ys, 'x')

        plt.draw()
        plt.pause(1.0/30.0)

    print("Optimizing variational parameters...")
    # 2000 iterations at step size 0.1, starting from init_var_params(D);
    # callback is passed so progress is reported during the run.
    variational_params = adam(grad(objective), init_var_params(D), step_size=0.1,
                              num_iters=2000, callback=callback)
    # Interactive, non-blocking plotting for the callback defined below.
    plt.ion()
    plt.show(block=False)

    # Number of samples scattered on the plot by callback.
    num_plotting_samples = 51

    def callback(params, t, g):
        """Print the current lower bound, then redraw the plot.

        Draws isocontours of the target posterior and of a diagonal Gaussian
        for each of the ``k`` equal chunks of ``params``, and scatters samples
        from ``iwae_qf_sample``.  ``g`` is accepted to match the callback
        signature but is not used.
        """
        lower_bound = -objective(params, t)
        print("Iteration {} lower bound {}".format(t, lower_bound))

        plt.cla()

        def target_distribution(x):
            return np.exp(log_posterior(x, t))

        plot_isocontours(ax, target_distribution)

        for inner_params in np.split(params, k):

            def var_distribution(x):
                log_density = diag_gaussian_density_from_params(inner_params, x)
                return np.exp(log_density)

            plot_isocontours(ax, var_distribution)

        # Same seed on every call, so the random state starts identically.
        rs = npr.RandomState(0)
        samples = iwae_qf_sample(log_posterior, params, t, k,
                                 num_plotting_samples, rs)
        xs, ys = samples[:, 0], samples[:, 1]
        plt.plot(xs, ys, 'x')

        plt.draw()
        plt.pause(1.0 / 30.0)

    print("Optimizing variational parameters...")
    # 2000 iterations at step size 0.1, starting from init_qf_params(k, D).
    # NOTE(review): the value returned by adam is discarded here.
    adam(grad(objective),
         init_qf_params(k, D),
         step_size=0.1,
         num_iters=2000,
         callback=callback)