Example #1
def make_gradfun(run_inference, recognize, loglike, pgm_prior, data,
                 batch_size, num_samples, natgrad_scale=1., callback=None):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None  # cheap mutable namespace for stashing stats computed inside mc_elbo

    def mc_elbo(pgm_params, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        def objective(loglike_recogn_params):
            loglike_params, recogn_params = loglike_recogn_params
            return -mc_elbo(pgm_params, loglike_params, recogn_params, i)
        val, (loglike_grad, recogn_grad) = vgrad(objective)((loglike_params, recogn_params))
        # this expression for pgm_natgrad drops a term that can be computed using
        # the function autograd.misc.fixed_points.fixed_point
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*flat(saved.stats) - flat(pgm_params))
        grad = unflat(pgm_natgrad), loglike_grad, recogn_grad
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
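The pgm_natgrad expression above is the conjugate natural-gradient formula written in flat coordinates: prior natural parameters plus the minibatch-rescaled expected sufficient statistics, minus the current natural parameters. A minimal, self-contained NumPy sketch of just that line, with all values made up for illustration:

import numpy as np

num_datapoints, num_batches, natgrad_scale = 1000, 10, 1.0
pgm_prior = np.array([1.0, -0.5])      # toy flat prior natural parameters
pgm_params = np.array([2.0, -1.0])     # toy flat current natural parameters
batch_stats = np.array([0.15, -0.07])  # toy expected sufficient statistics (saved.stats)

# gradient of the *negative* ELBO, hence the leading minus sign
pgm_natgrad = -natgrad_scale / num_datapoints * (
    pgm_prior + num_batches * batch_stats - pgm_params)
print(pgm_natgrad)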
Example #2
    def adadelta(allparams,
                 nat_stepsize,
                 num_epochs,
                 seq_len,
                 num_seqs=None,
                 rho=0.95,
                 epsilon=1e-6,
                 num_samples=1,
                 permute=True):
        natparams, params = allparams[:1], allparams[1:]
        sum_gsq = zeros_like(params)  # accumulated sq. grads
        sum_usq = zeros_like(params)  # accumulated sq. updates
        accumulate = lambda a, b: add(scale(rho, a), scale(1 - rho, b))

        for epoch in range(num_epochs):
            vals = []
            batches, num_batches = split_into_batches(data, seq_len, num_seqs)
            for y in batches:
                val, grad = scale(
                    1. / num_datapoints,
                    val_and_grad(y, num_batches, num_samples, *allparams))
                natgrad, grad = grad[:1], grad[1:]
                sum_gsq = accumulate(sum_gsq, square(grad))
                diag_scaling = div(sqrt(add_scalar(epsilon, sum_usq)),
                                   sqrt(add_scalar(epsilon, sum_gsq)))
                update = mul(diag_scaling, grad)
                sum_usq = accumulate(sum_usq, square(update))

                natparams = add(natparams, scale(nat_stepsize, natgrad))
                params = add(params, update)
                allparams = concat(natparams, params)
                vals.append(val)

                if callback: callback(epoch, vals, natgrad, allparams)
        return allparams
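The helpers (add, scale, mul, div, square, sqrt, add_scalar) act elementwise over the parameter containers, so the update above is ordinary Adadelta applied per coordinate. A minimal NumPy sketch of one step, assuming flat arrays and the same ascent convention (params + update) as the code above:

import numpy as np

rho, epsilon = 0.95, 1e-6

def adadelta_step(params, grad, sum_gsq, sum_usq):
    sum_gsq = rho * sum_gsq + (1 - rho) * grad**2    # accumulated sq. grads
    update = np.sqrt(sum_usq + epsilon) / np.sqrt(sum_gsq + epsilon) * grad
    sum_usq = rho * sum_usq + (1 - rho) * update**2  # accumulated sq. updates
    return params + update, sum_gsq, sum_usq

params = np.zeros(3)
sum_gsq, sum_usq = np.zeros_like(params), np.zeros_like(params)
grad = np.array([0.1, -0.2, 0.05])                   # toy gradient
params, sum_gsq, sum_usq = adadelta_step(params, grad, sum_gsq, sum_usq)
print(params)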
Example #3
    def adam(allparams,
             nat_stepsize,
             stepsize,
             num_epochs,
             seq_len,
             num_seqs=None,
             b1=0.9,
             b2=0.999,
             eps=1e-8,
             num_samples=1):
        natparams, params = allparams[:1], allparams[1:]
        m = zeros_like(params)
        v = zeros_like(params)
        i = 0
        accumulate = lambda rho, a, b: add(scale(1 - rho, a), scale(rho, b))

        for epoch in range(num_epochs):
            vals = []
            batches, num_batches = split_into_batches(data, seq_len, num_seqs)
            for y in batches:
                val, grad = scale(
                    1. / num_datapoints,
                    val_and_grad(y, num_batches, num_samples, *allparams))
                natgrad, grad = grad[:1], grad[1:]

                m = accumulate(b1, grad, m)  # first moment estimate
                v = accumulate(b2, square(grad), v)  # second moment estimate
                mhat = scale(1. / (1 - b1**(i + 1)), m)  # bias correction
                vhat = scale(1. / (1 - b2**(i + 1)), v)
                update = scale(stepsize, div(mhat, add_scalar(eps,
                                                              sqrt(vhat))))

                natparams = add(natparams, scale(nat_stepsize, natgrad))
                params = add(params, update)
                allparams = concat(natparams, params)
                vals.append(val)
                i += 1

                if callback: callback(epoch, vals, natgrad, allparams)

        return allparams
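Here accumulate(rho, a, b) = (1 - rho) * a + rho * b, so m and v are the standard Adam exponential moving averages, with bias correction by 1 - rho**(i + 1). A minimal NumPy sketch of one step under the same ascent convention, with toy values:

import numpy as np

b1, b2, eps, stepsize = 0.9, 0.999, 1e-8, 1e-3

def adam_step(params, grad, m, v, i):
    m = b1 * m + (1 - b1) * grad             # first moment estimate
    v = b2 * v + (1 - b2) * grad**2          # second moment estimate
    mhat = m / (1 - b1**(i + 1))             # bias correction
    vhat = v / (1 - b2**(i + 1))
    update = stepsize * mhat / (np.sqrt(vhat) + eps)
    return params + update, m, v

params = np.zeros(3)
m, v = np.zeros_like(params), np.zeros_like(params)
grad = np.array([0.1, -0.2, 0.05])           # toy gradient
params, m, v = adam_step(params, grad, m, v, i=0)
print(params)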
Example #4
File: svae.py  Project: lfywork/svae
def make_gradfun(run_inference,
                 recognize,
                 loglike,
                 pgm_prior,
                 pgm_expectedstats,
                 data,
                 batch_size,
                 num_samples,
                 natgrad_scale=1.,
                 callback=None):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, pgm_stats, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, pgm_stats, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i)) -
                global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        def objective(stats_and_params):
            pgm_stats, loglike_params, recogn_params = stats_and_params
            return -mc_elbo(pgm_params, pgm_stats, loglike_params, recogn_params, i)
        pgm_stats = pgm_expectedstats(pgm_params)
        val, (pgm_stats_grad, loglike_grad, recogn_grad) = vgrad(objective)(
            (pgm_stats, loglike_params, recogn_params))
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*(flat(saved.stats) + flat(pgm_stats_grad)) - flat(pgm_params))
        grad = unflat(pgm_natgrad), loglike_grad, recogn_grad
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
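Compared with Example #1, the expected statistics of pgm_params are computed outside the differentiated objective and passed in as pgm_stats, so autograd also returns pgm_stats_grad and the natural gradient gains an extra term inside the num_batches factor. A toy NumPy sketch of just that modified line (values made up for illustration):

import numpy as np

num_datapoints, num_batches, natgrad_scale = 1000, 10, 1.0
pgm_prior = np.array([1.0, -0.5])
pgm_params = np.array([2.0, -1.0])
batch_stats = np.array([0.15, -0.07])     # saved.stats from run_inference (toy)
pgm_stats_grad = np.array([0.02, 0.01])   # gradient w.r.t. pgm_stats (toy)

pgm_natgrad = -natgrad_scale / num_datapoints * (
    pgm_prior + num_batches * (batch_stats + pgm_stats_grad) - pgm_params)
print(pgm_natgrad)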
Example #5
    def train(self, X, Y, sig2, rff_dim=1200, batch_size=16, epochs=16):

        model_graph = tf.Graph()
        model_sess = tf.Session(graph=model_graph)

        with model_graph.as_default():
            X_tr = tf.placeholder(dtype=tf.float64, shape=[None, self.dim_in])
            Y_true = tf.placeholder(dtype=tf.float64, shape=[None, 1])
            H_inv = tf.placeholder(dtype=tf.float64, shape=[rff_dim, rff_dim])
            Phi_y = tf.placeholder(dtype=tf.float64, shape=[rff_dim, 1])

            rff_layer = kernel_layers.RandomFourierFeatures(
                output_dim=rff_dim,
                kernel_initializer='gaussian',
                trainable=True)

            ## define model
            rff_output = tf.cast(rff_layer(X_tr) * np.sqrt(2. / rff_dim),
                                 dtype=tf.float64)

            weight_cov = util.minibatch_woodbury_update(rff_output, H_inv)

            covl_xy = util.minibatch_interaction_update(
                Phi_y, rff_output, Y_true)

            random_feature_weight = rff_layer.kernel

            random_feature_bias = rff_layer.bias

        ### Training and Evaluation ###
        X_batches = util.split_into_batches(X, batch_size) * epochs
        Y_batches = util.split_into_batches(Y, batch_size) * epochs

        num_steps = len(X_batches)
        num_batch = num_steps // epochs

        with model_sess as sess:
            sess.run(tf.global_variables_initializer())

            rff_1 = sess.run(rff_output, feed_dict={X_tr: X_batches[0]})
            weight_cov_val = util.compute_inverse(rff_1, sig_sq=sig2**2)
            covl_xy_val = np.matmul(rff_1.T, Y_batches[0])

            rff_weight, rff_bias = sess.run(
                [random_feature_weight, random_feature_bias])

            for batch_id in range(1, num_batch):
                X_batch = X_batches[batch_id]
                Y_batch = Y_batches[batch_id]

                ## update posterior mean/covariance
                try:
                    weight_cov_val, covl_xy_val = sess.run(
                        [weight_cov, covl_xy],
                        feed_dict={
                            X_tr: X_batch,
                            Y_true: Y_batch,
                            H_inv: weight_cov_val,
                            Phi_y: covl_xy_val
                        })
                except Exception:
                    print("\n================================\n"
                          "Problem occurred at Step {}\n"
                          "================================".format(batch_id))

        self.beta = np.matmul(weight_cov_val, covl_xy_val)[:, 0]

        self.Sigma_beta = weight_cov_val * sig2**2

        self.RFF_weight = rff_weight  # (d, D)

        self.RFF_bias = rff_bias  # (D, )
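The training loop maintains two streaming quantities across minibatches: an inverse feature second-moment matrix (updated by util.minibatch_woodbury_update) and the accumulated feature/target product (util.minibatch_interaction_update), from which the posterior mean beta and covariance Sigma_beta are read off at the end. A self-contained NumPy sketch of the same bookkeeping, assuming a ridge-style posterior with noise variance sig2**2; the exact conventions of the util helpers may differ:

import numpy as np

rng = np.random.default_rng(0)
rff_dim, sig2, batch_size = 8, 0.1, 16

def woodbury_update(H_inv, Phi_b):
    # (H + Phi_b.T @ Phi_b)^{-1} computed from H^{-1} via the Woodbury identity
    S = np.eye(Phi_b.shape[0]) + Phi_b @ H_inv @ Phi_b.T
    return H_inv - H_inv @ Phi_b.T @ np.linalg.solve(S, Phi_b @ H_inv)

# initialize from the first batch, as in the code above
Phi0 = rng.normal(size=(batch_size, rff_dim))   # stand-in for rff_output
y0 = rng.normal(size=(batch_size, 1))
H_inv = np.linalg.inv(Phi0.T @ Phi0 + sig2**2 * np.eye(rff_dim))
Phi_y = Phi0.T @ y0

# stream the remaining batches
for _ in range(5):
    Phi_b = rng.normal(size=(batch_size, rff_dim))
    y_b = rng.normal(size=(batch_size, 1))
    H_inv = woodbury_update(H_inv, Phi_b)
    Phi_y = Phi_y + Phi_b.T @ y_b

beta = (H_inv @ Phi_y)[:, 0]     # posterior mean of the feature weights
Sigma_beta = H_inv * sig2**2     # posterior covariance, matching the code above
print(beta.shape, Sigma_beta.shape)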