Example #1
def test_elbo():
    mu0 = 1.5
    sigma = 1.0
    y_obs = np.array([1.6, 1.4])
        
    # Create a model for test
    with Model() as model:
        mu = Normal('mu', mu=mu0, sd=sigma)
        y = Normal('y', mu=mu, sd=1, observed=y_obs)

    vars = inputvars(model.vars)

    # Create variational gradient tensor
    grad, elbo, shared, uw = variational_gradient_estimate(
        vars, model, n_mcsamples=10000, random_seed=1)

    # Variational posterior parameters
    uw_ = np.array([1.88, np.log(1)])

    # Calculate elbo computed with MonteCarlo
    f = function([uw], elbo)
    elbo_mc = f(uw_)

    # Exact value
    elbo_true = (-0.5 * (
        3 + 3 * uw_[0]**2 - 2 * (y_obs[0] + y_obs[1] + mu0) * uw_[0] +
        y_obs[0]**2 + y_obs[1]**2 + mu0**2 + 3 * np.log(2 * np.pi)) +
        0.5 * (np.log(2 * np.pi) + 1))

    np.testing.assert_allclose(elbo_mc, elbo_true, rtol=0, atol=1e-1)
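
Note on the exact value above (the same expression reappears in Examples #3 and #4): for the
conjugate model mu ~ N(mu0, 1), y_i ~ N(mu, 1) with two observations, and a mean-field
posterior q(mu) = N(m, s^2) parameterized as uw_ = [m, log(s)], the ELBO is
E_q[log p(y, mu)] plus the entropy of q. A sketch of the derivation, under that
parameterization:

.. math::

    \mathbb{E}_q[\log p(y, \mu)] = -\frac{1}{2}\Big[\sum_{i=1}^{2}\big((y_i - m)^2 + s^2\big)
        + (m - \mu_0)^2 + s^2 + 3\log 2\pi\Big],
    \qquad
    H[q] = \frac{1}{2}\big(\log 2\pi + 1\big) + \log s.

With s = 1 the variance terms contribute the leading 3, expanding the squares gives the
quadratic in uw_[0], and log(s) vanishes from the entropy, reproducing elbo_true.
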
Example #2
    def __init__(
        self,
        draws=10000,
        model=None,
        random_seed=-1,
        chain=0,
        frac_validate=0.8,
        alpha=(0,0),
        rho=0.01,
        verbose=False,
    ):

        self.draws = draws
        self.model = model
        self.random_seed = random_seed
        self.chain = chain
        self.frac_validate = frac_validate
        self.alpha = alpha
        self.rho = rho
        self.verbose = verbose
        
        self.model = modelcontext(model)

        if self.random_seed != -1:
            np.random.seed(self.random_seed)

        self.variables = inputvars(self.model.vars)
        self.log_marginal_likelihood = 0
        self.log_volume_factor = np.zeros(1)
        self.prior_weight = np.ones(self.draws) / self.draws
        self.posterior_weights = np.array([])
        self.log_evidences = np.array([])
        self.cumul_evidences = np.zeros(1)
        self.likelihood_logp_thresh = np.array([-np.inf])
        self.posterior_logp_thresh = np.array([])
Example #3
def test_elbo():
    mu0 = 1.5
    sigma = 1.0
    y_obs = np.array([1.6, 1.4])

    # Create a model for test
    with Model() as model:
        mu = Normal('mu', mu=mu0, sd=sigma)
        Normal('y', mu=mu, sd=1, observed=y_obs)

    vars = inputvars(model.vars)

    # Create variational gradient tensor
    elbo, _ = _calc_elbo(vars, model, n_mcsamples=10000, random_seed=1)

    # Variational posterior parameters
    uw_ = np.array([1.88, np.log(1)])

    # Calculate elbo computed with MonteCarlo
    uw_shared = shared(uw_, 'uw_shared')
    elbo = CallableTensor(elbo)(uw_shared)
    f = function([], elbo)
    elbo_mc = f()

    # Exact value
    elbo_true = (-0.5 * (3 + 3 * uw_[0]**2 - 2 *
                         (y_obs[0] + y_obs[1] + mu0) * uw_[0] + y_obs[0]**2 +
                         y_obs[1]**2 + mu0**2 + 3 * np.log(2 * np.pi)) + 0.5 *
                 (np.log(2 * np.pi) + 1))

    np.testing.assert_allclose(elbo_mc, elbo_true, rtol=0, atol=1e-1)
Example #4
    def test_elbo(self):
        mu0 = 1.5
        sigma = 1.0
        y_obs = np.array([1.6, 1.4])

        # Create a model for test
        with Model() as model:
            mu = Normal('mu', mu=mu0, sd=sigma)
            Normal('y', mu=mu, sd=1, observed=y_obs)

        model_vars = inputvars(model.vars)

        # Create variational gradient tensor
        elbo, _ = _calc_elbo(model_vars, model, n_mcsamples=10000, random_seed=self.random_seed)

        # Variational posterior parameters
        uw_ = np.array([1.88, np.log(1)])

        # Calculate elbo computed with MonteCarlo
        uw_shared = shared(uw_, 'uw_shared')
        elbo = CallableTensor(elbo)(uw_shared)
        f = function([], elbo)
        elbo_mc = f()

        # Exact value
        elbo_true = (-0.5 * (
            3 + 3 * uw_[0]**2 - 2 * (y_obs[0] + y_obs[1] + mu0) * uw_[0] +
            y_obs[0]**2 + y_obs[1]**2 + mu0**2 + 3 * np.log(2 * np.pi)) +
            0.5 * (np.log(2 * np.pi) + 1))

        np.testing.assert_allclose(elbo_mc, elbo_true, rtol=0, atol=1e-1)
Example #5
    def __init__(self,
                 vars=None,
                 num_particles=10,
                 max_stages=5000,
                 chunk="auto",
                 model=None):
        _log.warning("The BART model is experimental. Use with caution.")
        model = modelcontext(model)
        vars = inputvars(vars)
        self.bart = vars[0].distribution

        self.tune = True
        self.idx = 0
        self.iter = 0
        self.sum_trees = []
        self.chunk = chunk

        if chunk == "auto":
            self.chunk = max(1, int(self.bart.m * 0.1))
        self.bart.chunk = self.chunk
        self.num_particles = num_particles
        self.log_num_particles = np.log(num_particles)
        self.indices = list(range(1, num_particles))
        self.max_stages = max_stages
        self.old_trees_particles_list = []
        for i in range(self.bart.m):
            p = ParticleTree(self.bart.trees[i],
                             self.bart.prior_prob_leaf_node)
            self.old_trees_particles_list.append(p)

        shared = make_shared_replacements(vars, model)
        self.likelihood_logp = logp([model.datalogpt], vars, shared)
        super().__init__(vars, shared)
Example #6
    def __init__(
        self,
        draws=1000,
        kernel="metropolis",
        n_steps=25,
        parallel=False,
        start=None,
        cores=None,
        tune_steps=True,
        p_acc_rate=0.99,
        threshold=0.5,
        epsilon=1.0,
        dist_func="absolute_error",
        sum_stat=False,
        progressbar=False,
        model=None,
        random_seed=-1,
    ):

        self.draws = draws
        self.kernel = kernel
        self.n_steps = n_steps
        self.parallel = parallel
        self.start = start
        self.cores = cores
        self.tune_steps = tune_steps
        self.p_acc_rate = p_acc_rate
        self.threshold = threshold
        self.epsilon = epsilon
        self.dist_func = dist_func
        self.sum_stat = sum_stat
        self.progressbar = progressbar
        self.model = model
        self.random_seed = random_seed

        self.model = modelcontext(model)

        if self.random_seed != -1:
            np.random.seed(self.random_seed)

        if self.cores is None:
            self.cores = _cpu_count()

        self.beta = 0
        self.max_steps = n_steps
        self.proposed = draws * n_steps
        self.acc_rate = 1
        self.acc_per_chain = np.ones(self.draws)
        self.model.marginal_log_likelihood = 0
        self.variables = inputvars(self.model.vars)
        dimension = sum(v.dsize for v in self.variables)
        self.scalings = np.ones(self.draws) * min(1, 2.38 ** 2 / dimension)
        self.discrete = np.concatenate(
            [[v.dtype in discrete_types] * (v.dsize or 1) for v in self.variables]
        )
        self.any_discrete = self.discrete.any()
        self.all_discrete = self.discrete.all()
Example #7
    def __init__(self, vars=None, scaling=None, step_scale=0.25, is_cov=False,
                 model=None, blocked=True, use_single_leapfrog=False,
                 potential=None, integrator="leapfrog", **theano_kwargs):
        """Superclass to implement Hamiltonian/hybrid monte carlo

        Parameters
        ----------
        vars : list of theano variables
        scaling : array_like, ndim = {1,2}
            Scaling for momentum distribution. 1d arrays are interpreted as the matrix diagonal.
        step_scale : float, default=0.25
            Size of steps to take, automatically scaled down by 1/n**(1/4)
        is_cov : bool, default=False
            Treat scaling as a covariance matrix/vector if True, else treat it as a
            precision matrix/vector
        model : pymc3 Model instance.  default=Context model
        blocked: Boolean, default True
        use_single_leapfrog: Boolean, default=False
            Whether leapfrog steps take a single step at a time.
        potential : Potential, optional
            An object that represents the Hamiltonian with `velocity`,
            `energy`, and `random` methods.
        **theano_kwargs: passed to theano functions
        """
        model = modelcontext(model)

        if vars is None:
            vars = model.cont_vars
        vars = inputvars(vars)

        if scaling is None and potential is None:
            scaling = model.test_point

        if isinstance(scaling, dict):
            scaling = guess_scaling(Point(scaling, model=model), model=model, vars=vars)

        if scaling is not None and potential is not None:
            raise ValueError("Can not specify both potential and scaling.")

        self.step_size = step_scale / (model.ndim ** 0.25)
        if potential is not None:
            self.potential = potential
        else:
            self.potential = quad_potential(scaling, is_cov, as_cov=False)

        shared = make_shared_replacements(vars, model)
        if theano_kwargs is None:
            theano_kwargs = {}

        self.H, self.compute_energy, self.compute_velocity, self.leapfrog, self.dlogp = get_theano_hamiltonian_functions(
            vars, shared, model.logpt, self.potential, use_single_leapfrog, integrator, **theano_kwargs)

        super(BaseHMC, self).__init__(vars, shared, blocked=blocked)
Example #8
    def __init__(self, vars=None, scaling=None, step_scale=0.25, is_cov=False,
                 model=None, blocked=True, potential=None,
                 integrator="leapfrog", dtype=None, **theano_kwargs):
        """Set up Hamiltonian samplers with common structures.

        Parameters
        ----------
        vars : list of theano variables
        scaling : array_like, ndim = {1,2}
            Scaling for momentum distribution. 1d arrays are interpreted as
            the matrix diagonal.
        step_scale : float, default=0.25
            Size of steps to take, automatically scaled down by 1/n**(1/4)
        is_cov : bool, default=False
            Treat scaling as a covariance matrix/vector if True, else treat
            it as a precision matrix/vector
        model : pymc3 Model instance
        blocked: bool, default=True
        potential : Potential, optional
            An object that represents the Hamiltonian with `velocity`,
            `energy`, and `random` methods.
        **theano_kwargs: passed to theano functions
        """
        model = modelcontext(model)

        if vars is None:
            vars = model.cont_vars
        vars = inputvars(vars)

        super(BaseHMC, self).__init__(vars, blocked=blocked, model=model,
                                      dtype=dtype, **theano_kwargs)

        size = self._logp_dlogp_func.size

        if scaling is None and potential is None:
            mean = floatX(np.zeros(size))
            var = floatX(np.ones(size))
            potential = QuadPotentialDiagAdapt(size, mean, var, 10)

        if isinstance(scaling, dict):
            point = Point(scaling, model=model)
            scaling = guess_scaling(point, model=model, vars=vars)

        if scaling is not None and potential is not None:
            raise ValueError("Can not specify both potential and scaling.")

        self.step_size = step_scale / (size ** 0.25)
        if potential is not None:
            self.potential = potential
        else:
            self.potential = quad_potential(scaling, is_cov)

        self.integrator = integration.CpuLeapfrogIntegrator(self.potential, self._logp_dlogp_func)
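
The BaseHMC constructors in Examples #7 and #8 are superclass plumbing: concrete step methods
such as HamiltonianMC and NUTS subclass them and forward keyword arguments like step_scale.
A minimal sketch of how this surface is reached from user code (toy model for illustration,
standard pymc3 API assumed):

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0.0, sd=1.0)
    pm.Normal('y', mu=mu, sd=1.0, observed=np.random.randn(20))

    # HamiltonianMC builds on BaseHMC; step_scale is divided by
    # size ** 0.25 (the flattened parameter dimension) to set step_size.
    step = pm.HamiltonianMC(step_scale=0.25)
    trace = pm.sample(1000, tune=1000, step=step)
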
Example #9
    def __init__(self,
                 vars=None,
                 prior_cov=None,
                 prior_chol=None,
                 model=None,
                 **kwargs):
        self.model = modelcontext(model)
        chol = get_chol(prior_cov, prior_chol)
        self.prior_chol = tt.as_tensor_variable(chol)

        if vars is None:
            vars = self.model.cont_vars
        vars = inputvars(vars)

        super().__init__(vars, [self.model.fastlogp], **kwargs)
Example #10
    def __init__(self, vars=None, scaling=None, step_scale=0.25, is_cov=False,
                 model=None, blocked=True, use_single_leapfrog=False, **theano_kwargs):
        """Superclass to implement Hamiltonian/hybrid monte carlo

        Parameters
        ----------
        vars : list of theano variables
        scaling : array_like, ndim = {1,2}
            Scaling for momentum distribution. 1d arrays are interpreted as the matrix diagonal.
        step_scale : float, default=0.25
            Size of steps to take, automatically scaled down by 1/n**(1/4)
        is_cov : bool, default=False
            Treat scaling as a covariance matrix/vector if True, else treat it as a
            precision matrix/vector
        model : pymc3 Model instance.  default=Context model
        blocked: Boolean, default True
        use_single_leapfrog: Boolean, default=False
            Whether leapfrog steps take a single step at a time.
        **theano_kwargs: passed to theano functions
        """
        model = modelcontext(model)

        if vars is None:
            vars = model.cont_vars
        vars = inputvars(vars)

        if scaling is None:
            scaling = model.test_point

        if isinstance(scaling, dict):
            scaling = guess_scaling(Point(scaling, model=model), model=model, vars=vars)

        n = scaling.shape[0]
        self.step_size = step_scale / (n ** 0.25)
        self.potential = quad_potential(scaling, is_cov, as_cov=False)

        shared = make_shared_replacements(vars, model)
        if theano_kwargs is None:
            theano_kwargs = {}

        self.H, self.compute_energy, self.leapfrog, self._vars = get_theano_hamiltonian_functions(
            vars, shared, model.logpt, self.potential, use_single_leapfrog, **theano_kwargs)

        super(BaseHMC, self).__init__(vars, shared, blocked=blocked)
Example #11
    def __init__(self,
                 vars=None,
                 w=1.0,
                 tune=True,
                 model=None,
                 iter_limit=np.inf,
                 **kwargs):
        self.model = modelcontext(model)
        self.w = w
        self.tune = tune
        self.n_tunes = 0.0
        self.iter_limit = iter_limit

        if vars is None:
            vars = self.model.cont_vars
        vars = inputvars(vars)

        super().__init__(vars, [self.model.fastlogp], **kwargs)
Example #12
    def __new__(cls, *args, **kwargs):
        blocked = kwargs.get("blocked")
        if blocked is None:
            # Try to look up default value from class
            blocked = getattr(cls, "default_blocked", True)
            kwargs["blocked"] = blocked

        model = modelcontext(kwargs.get("model"))
        kwargs.update({"model": model})

        # vars can either be first arg or a kwarg
        if "vars" not in kwargs and len(args) >= 1:
            vars = args[0]
            args = args[1:]
        elif "vars" in kwargs:
            vars = kwargs.pop("vars")
        else:  # Assume all model variables
            vars = model.vars

        # get the actual inputs from the vars
        vars = inputvars(vars)

        if len(vars) == 0:
            raise ValueError("No free random variables to sample.")

        if not blocked and len(vars) > 1:
            # In this case we create a separate sampler for each var
            # and append them to a CompoundStep
            steps = []
            for var in vars:
                step = super().__new__(cls)
                # If we don't return the instance we have to manually
                # call __init__
                step.__init__([var], *args, **kwargs)
                # Hack for creating the class correctly when unpickling.
                step.__newargs = ([var], ) + args, kwargs
                steps.append(step)

            return CompoundStep(steps)
        else:
            step = super().__new__(cls)
            # Hack for creating the class correctly when unpickling.
            step.__newargs = (vars, ) + args, kwargs
            return step
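
The __new__ above is what turns blocked=False plus several free variables into a CompoundStep
of single-variable samplers rather than one joint sampler. A small sketch of the observable
behaviour, using Metropolis as an illustrative subclass built on this machinery:

import pymc3 as pm

with pm.Model():
    a = pm.Normal('a', mu=0.0, sd=1.0)
    b = pm.Normal('b', mu=0.0, sd=1.0)

    # One blocked sampler updating [a, b] jointly...
    blocked_step = pm.Metropolis([a, b], blocked=True)
    # ...versus one Metropolis instance per variable, wrapped in a CompoundStep.
    compound_step = pm.Metropolis([a, b], blocked=False)

print(type(blocked_step).__name__)   # Metropolis
print(type(compound_step).__name__)  # CompoundStep
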
Example #13
    def __init__(
        self,
        draws=2000,
        kernel="metropolis",
        n_steps=25,
        start=None,
        tune_steps=True,
        p_acc_rate=0.85,
        threshold=0.5,
        save_sim_data=False,
        save_log_pseudolikelihood=True,
        model=None,
        random_seed=-1,
        chain=0,
    ):

        self.draws = draws
        self.kernel = kernel.lower()
        self.n_steps = n_steps
        self.start = start
        self.tune_steps = tune_steps
        self.p_acc_rate = p_acc_rate
        self.threshold = threshold
        self.save_sim_data = save_sim_data
        self.save_log_pseudolikelihood = save_log_pseudolikelihood
        self.model = model
        self.random_seed = random_seed
        self.chain = chain

        self.model = modelcontext(model)

        if self.random_seed != -1:
            np.random.seed(self.random_seed)

        self.beta = 0
        self.max_steps = n_steps
        self.proposed = draws * n_steps
        self.acc_rate = 1
        self.variables = inputvars(self.model.vars)
        self.weights = np.ones(self.draws) / self.draws
        self.log_marginal_likelihood = 0
        self.sim_data = []
        self.log_pseudolikelihood = []
Example #14
def fixed_hessian(point, vars=None, model=None):
    """
    Returns a fixed Hessian for any chain location.

    Parameters
    ----------
    model: Model (optional if in `with` context)
    point: dict
    vars: list
        Variables for which Hessian is to be calculated.
    """

    model = modelcontext(model)
    if vars is None:
        vars = model.cont_vars
    vars = inputvars(vars)

    point = Point(point, model=model)

    bij = DictToArrayBijection(ArrayOrdering(vars), point)
    rval = np.ones(bij.map(point).size) / 10
    return rval
Example #15
    def __init__(self, vars=None, model=None, point=None):
        self.model = pm.modelcontext(model)

        # Work out the full starting coordinates
        if point is None:
            point = self.model.test_point
        else:
            pm.util.update_start_vals(point, self.model.test_point, self.model)

        # Fit all the parameters by default
        if vars is None:
            vars = self.model.cont_vars
        self.vars = inputvars(vars)
        allinmodel(self.vars, self.model)

        # Work out the relevant bijection map
        point = Point(point, model=self.model)
        self.bijection = DictToArrayBijection(ArrayOrdering(self.vars), point)

        # Pre-compile the theano model and gradient
        nlp = -self.model.logpt
        grad = theano.grad(nlp, self.vars, disconnected_inputs="ignore")
        self.func = get_theano_function_for_var([nlp] + grad, model=self.model)
Example #16
def advi_minibatch(vars=None,
                   start=None,
                   model=None,
                   n=5000,
                   n_mcsamples=1,
                   minibatch_RVs=None,
                   minibatch_tensors=None,
                   minibatches=None,
                   global_RVs=None,
                   local_RVs=None,
                   observed_RVs=None,
                   encoder_params=None,
                   total_size=None,
                   optimizer=None,
                   learning_rate=.001,
                   epsilon=.1,
                   random_seed=None,
                   mode=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch automatic differentiation variational
    inference (ADVI; Kucukelbir et al., 2015) with the meanfield
    approximation. Autoencoding variational Bayes (AEVB; Kingma and Welling,
    2014) is also supported.

    For explanation, we classify random variables in probabilistic models into
    three types. Observed random variables
    :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations.
    Each :math:`\mathbf{y}_{i}` can be a set of observed random variables,
    i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where
    :math:`V_{o}` is the number of types of observed random variables
    in the model.

    The next ones are global random variables
    :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
    the probabilities for all observed samples.

    The last ones are local random variables
    :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where
    :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`.
    These RVs are used only in AEVB.

    The goal of ADVI is to approximate the posterior distribution
    :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior
    :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms
    are normal distributions (mean-field approximation).

    :math:`q(\Theta)` is parametrized with its means and standard deviations.
    These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
    a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
    each observation. Therefore these parameters are denoted as
    :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters
    of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a
    multilayer perceptron or convolutional neural network.

    In addition to :math:`\\xi(\cdot)`, we can also include deterministic
    mappings for the likelihood of observations. We denote the parameters of
    the deterministic mappings as :math:`\eta`. An example of such mappings is
    the deconvolutional neural network used in the convolutional VAE example
    in the PyMC3 notebook directory.

    This function maximizes the evidence lower bound (ELBO)
    :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows:

    .. math::

        {\cal L}(\gamma,\\nu,\eta) & =
        \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
        \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
        \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
        \\right]\\right] \\\\ &
        - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right]
        - \mathbf{c}_{l}\sum_{i=1}^{N}
            KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right],

    where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence

    .. math::

        KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv,

    :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
    More precisely, we can write each of the terms in ELBO as follows:

    .. math::

        \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
        \sum_{k=1}^{V_{o}}c_{o}^{k}
            \log p(\mathbf{y}_{i}^{k}|
                   {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\
        \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = &
        \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
            q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\
        \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right] & = &
        \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
            q(\mathbf{z}_{i}^{k})||
            p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right],

    where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v`
    in the directed acyclic graph of the model.

    When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
    set to :math:`N/M`, where :math:`M` is the number of observations in each
    mini-batch. Another weighting scheme was proposed in
    (Blundell et al., 2015) for accelerating model fitting.

    For working with ADVI, we need to give the probabilistic model
    (:code:`model`), the three types of RVs (:code:`observed_RVs`,
    :code:`global_RVs` and :code:`local_RVs`), the tensors to which
    mini-batched samples are supplied (:code:`minibatches`) and
    parameters of deterministic mappings :math:`\\xi` and :math:`\eta`
    (:code:`encoder_params`) as input arguments.

    :code:`observed_RVs` is an :code:`OrderedDict` of the form
    :code:`{y_k: c_k}`, where :code:`y_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{o}^{k}`) and it can be a
    shared variable.

    :code:`global_RVs` is an :code:`OrderedDict` of the form
    :code:`{t_k: c_k}`, where :code:`t_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{g}^{k}`) and it can be a
    shared variable.

    :code:`local_RVs` is an :code:`OrderedDict` of the form
    :code:`{z_k: ((m_k, s_k), c_k)}`, where :code:`z_k` is a random variable
    defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{l}^{k}`)
    and it can be a shared variable. :code:`(m_k, s_k)` is a pair of tensors
    of means and log standard deviations of the variational distribution;
    samples drawn from the variational distribution replaces :code:`z_k`.
    It should be noted that if :code:`z_k` has a transformation that changes
    the dimension (e.g., StickBreakingTransform), the variational distribution
    must have the same dimension. For example, if :code:`z_k` is distributed
    with a Dirichlet distribution with :code:`p` choices, :code:`m_k` and
    :code:`s_k` have the shape :code:`(n_samples_in_minibatch, p - 1)`.

    :code:`minibatch_tensors` is a list of tensors (can be shared variables)
    to which mini-batch samples are set during the optimization.
    These tensors are observations (:code:`obs=`) in :code:`observed_RVs`.

    :code:`minibatches` is a generator of a list of :code:`numpy.ndarray`.
    Each item of the list will be set to tensors in :code:`minibatch_tensors`.

    :code:`encoder_params` is a list of shared variables of the parameters
    :math:`\\nu` and :math:`\eta`. We do not need to include the variational
    parameters of the global variables, :math:`\gamma`, because these are
    automatically created and updated in this function.

    The following is a list of example notebooks using advi_minibatch:

    - docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb
    - docs/source/notebooks/bayesian_neural_network_advi.ipynb
    - docs/source/notebooks/convolutional_vae_keras_advi.ipynb
    - docs/source/notebooks/gaussian-mixture-model-advi.ipynb
    - docs/source/notebooks/lda-advi-aevb.ipynb

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of the arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same with the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description.
    global_RVs : Ordered dict or None
        Include a scaling constant for the corresponding RV. See the above
        description. If :code:`None`, it is set to
        :code:`{v: 1 for v in grvs}`, where :code:`grvs` is
        :code:`list(set(vars) - set(list(local_RVs) + list(observed_RVs)))`.
    local_RVs : Ordered dict or None
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    References
    ----------
    - Kingma, D. P., & Welling, M. (2014).
      Auto-Encoding Variational Bayes. stat, 1050, 1.
    - Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015).
      Automatic variational inference in Stan. In Advances in neural
      information processing systems (pp. 568-576).
    - Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015).
      Weight Uncertainty in Neural Network. In Proceedings of the 32nd
      International Conference on Machine Learning (ICML-15) (pp. 1613-1622).
    """
    if encoder_params is None:
        encoder_params = []

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    _check_minibatches(minibatch_tensors, minibatches)

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    def get_transformed(v):
        if hasattr(v, 'transformed'):
            return v.transformed
        return v

    local_RVs = OrderedDict([(get_transformed(v), (uw, s))
                             for v, (uw, s) in local_RVs.items()])

    # Get global variables
    grvs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))
    if global_RVs is None:
        global_RVs = OrderedDict({v: 1 for v in grvs})
    elif len(grvs) != len(global_RVs):
        _value_error('global_RVs ({}) must have all global RVs: {}'.format(
            [v for v in global_RVs], grvs))

    # ELBO wrt variational parameters
    elbo, uw_l, uw_g = _make_elbo_t(observed_RVs, global_RVs, local_RVs,
                                    model.potentials, n_mcsamples, random_seed)

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)

    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_
         for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)})
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates, mode=mode)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        if np.isnan(e):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        elbos[i] = e
        if n < 10:
            progress.set_description('ELBO = {:,.2f}'.format(elbos[i]))
        elif i % (n // 10) == 0 and i > 0:
            avg_elbo = infmean(elbos[i - n // 10:i])
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
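
For reference, this is roughly how the arguments described in the docstring fit together in
the plain (non-AEVB) case, calling the advi_minibatch above. The data, sizes, and variable
names are made up for illustration; only the keyword arguments come from the signature:

import numpy as np
import theano
import pymc3 as pm

data = np.random.randn(1000)               # full data set, total_size = 1000
x_minibatch = theano.shared(data[:100])    # tensor that receives each mini-batch

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0.0, sd=10.0)
    x = pm.Normal('x', mu=mu, sd=1.0, observed=x_minibatch)

def create_minibatches():
    while True:
        idx = np.random.randint(0, len(data), 100)
        yield [data[idx]]                  # one array per entry of minibatch_tensors

# Backward-compatible calling convention: minibatch_RVs / minibatch_tensors / total_size
# instead of an explicit observed_RVs = OrderedDict([(x, N / M)]).
v_params = advi_minibatch(
    model=model, n=1000, minibatch_RVs=[x], minibatch_tensors=[x_minibatch],
    minibatches=create_minibatches(), total_size=len(data))
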
Example #17
    def __init__(self,
                 vars=None,
                 out_vars=None,
                 covariance=None,
                 scale=1.,
                 n_chains=100,
                 tune=True,
                 tune_interval=100,
                 model=None,
                 check_bound=True,
                 likelihood_name='like',
                 proposal_name='MultivariateNormal',
                 coef_variation=1.,
                 **kwargs):

        model = modelcontext(model)

        if vars is None:
            vars = model.vars

        vars = inputvars(vars)

        if out_vars is None:
            out_vars = model.unobserved_RVs

        out_varnames = [out_var.name for out_var in out_vars]

        self.scaling = np.atleast_1d(scale)

        if covariance is None and proposal_name == 'MultivariateNormal':
            self.covariance = np.eye(sum(v.dsize for v in vars))
            scale = self.covariance

        self.tune = tune
        self.check_bnd = check_bound
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval

        self.proposal_name = proposal_name
        self.proposal_dist = choose_proposal(self.proposal_name, scale=scale)

        self.proposal_samples_array = self.proposal_dist(n_chains)

        self.stage_sample = 0
        self.accepted = 0

        self.beta = 0
        self.stage = 0
        self.chain_index = 0
        self.resampling_indexes = np.arange(n_chains)

        self.coef_variation = coef_variation
        self.n_chains = n_chains
        self.likelihoods = np.zeros(n_chains)

        self.likelihood_name = likelihood_name
        self._llk_index = out_varnames.index(likelihood_name)
        self.discrete = np.concatenate(
            [[v.dtype in discrete_types] * (v.dsize or 1) for v in vars])
        self.any_discrete = self.discrete.any()
        self.all_discrete = self.discrete.all()

        # create initial population
        self.population = []
        self.array_population = np.zeros(n_chains)
        for i in range(self.n_chains):
            dummy = pm.Point({v.name: v.random() for v in vars}, model=model)
            self.population.append(dummy)

        self.population[0] = model.test_point

        self.chain_previous_lpoint = copy.deepcopy(self.population)

        shared = make_shared_replacements(vars, model)
        self.logp_forw = logp_forw(out_vars, vars, shared)
        self.check_bnd = logp_forw([model.varlogpt], vars, shared)

        super(ATMCMC, self).__init__(vars, out_vars, shared)
Example #18
def optimize(start=None,
             vars=None,
             model=None,
             return_info=False,
             verbose=True,
             **kwargs):
    """Maximize the log prob of a PyMC3 model using scipy

    All extra arguments are passed directly to the ``scipy.optimize.minimize``
    function.

    Args:
        start: The PyMC3 coordinate dictionary of the starting position
        vars: The variables to optimize
        model: The PyMC3 model
        return_info: Return both the coordinate dictionary and the result of
            ``scipy.optimize.minimize``
        verbose: Print the success flag and log probability to the screen

    """
    from scipy.optimize import minimize

    model = pm.modelcontext(model)

    # Work out the full starting coordinates
    if start is None:
        start = model.test_point
    else:
        update_start_vals(start, model.test_point, model)

    # Fit all the parameters by default
    if vars is None:
        vars = model.cont_vars
    vars = inputvars(vars)
    allinmodel(vars, model)

    # Work out the relevant bijection map
    start = Point(start, model=model)
    bij = DictToArrayBijection(ArrayOrdering(vars), start)

    # Pre-compile the theano model and gradient
    nlp = -model.logpt
    grad = theano.grad(nlp, vars, disconnected_inputs="ignore")
    func = get_theano_function_for_var([nlp] + grad, model=model)

    if verbose:
        names = [
            get_untransformed_name(v.name)
            if is_transformed_name(v.name) else v.name for v in vars
        ]
        sys.stderr.write("optimizing logp for variables: [{0}]\n".format(
            ", ".join(names)))
        bar = tqdm.tqdm()

    # This returns the objective function and its derivatives
    def objective(vec):
        res = func(*get_args_for_theano_function(bij.rmap(vec), model=model))
        d = dict(zip((v.name for v in vars), res[1:]))
        g = bij.map(d)
        if verbose:
            bar.set_postfix(logp="{0:e}".format(-res[0]))
            bar.update()
        return res[0], g

    # Optimize using scipy.optimize
    x0 = bij.map(start)
    initial = objective(x0)[0]
    kwargs["jac"] = True
    info = minimize(objective, x0, **kwargs)

    # Only accept the output if it is better than it was
    x = info.x if (np.isfinite(info.fun) and info.fun < initial) else x0

    # Coerce the output into the right format
    vars = get_default_varnames(model.unobserved_RVs, True)
    point = {
        var.name: value
        for var, value in zip(vars,
                              model.fastfn(vars)(bij.rmap(x)))
    }

    if verbose:
        bar.close()
        sys.stderr.write("message: {0}\n".format(info.message))
        sys.stderr.write("logp: {0} -> {1}\n".format(-initial, -info.fun))
        if not np.isfinite(info.fun):
            logger.warning("final logp not finite, returning initial point")
            logger.warning(
                "this suggests that something is wrong with the model")
            logger.debug("{0}".format(info))

    if return_info:
        return point, info
    return point
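
A short usage sketch for the optimize() helper above; the toy model and variable names are
illustrative only:

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    m = pm.Normal('m', mu=0.0, sd=10.0)
    b = pm.Normal('b', mu=0.0, sd=10.0)
    x = np.linspace(0, 1, 50)
    pm.Normal('obs', mu=m * x + b, sd=0.1, observed=0.5 * x + 0.1)

    # Optimize all continuous variables starting from the test point...
    map_soln = optimize(start=model.test_point)
    # ...then refine a subset, also returning the scipy result object.
    map_soln, info = optimize(start=map_soln, vars=[m], return_info=True)
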
Example #19
def advi_minibatch(vars=None,
                   start=None,
                   model=None,
                   n=5000,
                   n_mcsamples=1,
                   minibatch_RVs=None,
                   minibatch_tensors=None,
                   minibatches=None,
                   local_RVs=None,
                   observed_RVs=None,
                   encoder_params=[],
                   total_size=None,
                   optimizer=None,
                   learning_rate=.001,
                   epsilon=.1,
                   random_seed=None,
                   verbose=1,
                   dp_par=None):
    """Perform mini-batch ADVI.
    This function implements a mini-batch ADVI with the meanfield 
    approximation. Autoencoding variational inference is also supported. 
    The log probability terms for mini-batches, corresponding to RVs in 
    minibatch_RVs, are scaled to (total_size) / (the number of samples in each 
    mini-batch), where total_size is an argument for the total data size. 
    minibatch_tensors is a list of tensors (can be shared variables) to which 
    mini-batch samples are set during the optimization. In most cases, these 
    tensors are observations for RVs in the model. 
    local_RVs and observed_RVs are used for autoencoding variational Bayes. 
    Both of these RVs are associated with each of given samples. 
    The difference is that local_RVs are unknown and their posterior
    distributions are approximated. 
    local_RVs are Ordered dict, whose keys and values are RVs and a tuple of 
    two objects. The first is the theano expression of variational parameters 
    (mean and log of std) of the approximate posterior, which are encoded from 
    given samples by an arbitrary deterministic function, e.g., MLP. The other 
    one is a scaling constant to be multiplied to the log probability term 
    corresponding to the RV. 
    observed_RVs are also Ordered dict with RVs as the keys, but whose values 
    are only the scaling constant as in local_RVs. In this case, total_size is 
    ignored. 
    If local_RVs is None (thus not using autoencoder), the following two 
    settings are equivalent: 
    - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)])
    - minibatch_RVs=[rv], total_size=total_size
    where minibatch_size is minibatch_tensors[0].shape[0]. 
    The variational parameters and the parameters of the autoencoder are 
    simultaneously optimized with given optimizer, which is a function that 
    returns a dictionary of parameter updates as provided to Theano function. 
    See the docstring of pymc3.variational.advi(). 
    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal 
        distribution) are fit for all RVs in the given model. 
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set. 
        When this argument is given, both of the arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same with the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the 
        log likelihood terms corresponding to mini-batches in ELBO. 
    local_RVs : Ordered dict
        Include encoded variational parameters and a scaling constant for 
        the corresponding RV. See the above description. 
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above 
        description
    encoder_params : list of theano shared variables
        Parameters of encoder. 
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        an optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.
    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    """
    theano.config.compute_test_value = 'ignore'

    model = modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point
    check_discrete_rvs(vars)
    _check_minibatches(minibatch_tensors, minibatches)

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    ds = model.deterministics

    def get_transformed(v):
        if v in ds:
            return v.transformed
        return v

    local_RVs = OrderedDict([(get_transformed(v), (uw, s))
                             for v, (uw, s) in local_RVs.items()])

    # Get global variables
    global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))

    # Ordering for concatenation of random variables
    global_order = ArrayOrdering([v for v in global_RVs])
    local_order = ArrayOrdering([v for v in local_RVs])

    # ELBO wrt variational parameters
    inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order)
    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model)
    replace = replace_g
    if replace_l is not None:
        replace.update(replace_l)
    logp = theano.clone(logpt, replace, strict=False)
    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples,
                   random_seed)
    del logpt

    # Variational parameters for global RVs
    uw_global_shared, bij = _init_uw_global_shared(start, global_RVs,
                                                   global_order)

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])

    # Replace tensors in ELBO
    updates = {uw_g: uw_global_shared, uw_l: uw_local_encoded} \
        if 0 < len(local_RVs) else \
              {uw_g: uw_global_shared}
    elbo = theano.clone(elbo, updates, strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)

    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_
         for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)})
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = [uw_global_shared] + encoder_params
    updates = OrderedDict()
    for param in params:
        # g = tt.grad(elbo, wrt=param)
        # updates.update(adagrad(g, param, learning_rate, epsilon, n=10))
        updates.update(
            optimizer(likeloss=-1 * elbo[0],
                      entroloss=-1 * elbo[1],
                      param=param,
                      dp_par=dp_par,
                      n_par=len(vars)))
    f = theano.function(tensors,
                        tt.add(elbo[1], tt.sum(elbo[0], axis=0)),
                        updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    l = int(uw_global_shared.get_value(borrow=True).size / 2)
    for i in range(n):
        u_old = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w_old = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        e = f(*next(minibatches))
        if np.isnan(e):
            print('NaNs produced at iteration {}'.format(i))
            for var in w_old.keys():
                w_old[var] = np.exp(w_old[var])
            return ADVIFit(u_old, w_old, elbos[:i])
        elbos[i] = e
        if verbose and not i % (n // 10):
            if not i:
                print('Iteration {0} [{1}%]: ELBO = {2}'.format(
                    i, 100 * i // n, e.round(2)))
            else:
                avg_elbo = elbos[i - n // 10:i].mean()
                print('Iteration {0} [{1}%]: Average ELBO = {2}'.format(
                    i, 100 * i // n, avg_elbo.round(2)))

    if verbose:
        print('Finished [100%]: ELBO = {}'.format(elbos[-1].round(2)))

    u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
    w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])
    return ADVIFit(u, w, elbos)
Example #20
    def __init__(self,
                 n0=10,
                 init_samples=None,
                 k_trunc=np.inf,
                 eps_z=.01,
                 nf_iter=2,
                 N=10,
                 t_ess=0.5,
                 beta_max=1,
                 model=None,
                 random_seed=-1,
                 chain=0,
                 frac_validate=0.0,
                 iteration=None,
                 alpha_w=(0, 0),
                 alpha_uw=(0, 0),
                 verbose=False,
                 n_component=None,
                 interp_nbin=None,
                 KDE=True,
                 bw_factor_min=1.0,
                 bw_factor_max=1.0,
                 bw_factor_num=1,
                 rel_bw=1,
                 edge_bins=None,
                 ndata_wT=None,
                 MSWD_max_iter=None,
                 NBfirstlayer=True,
                 logit=False,
                 Whiten=False,
                 trainable_qw=False,
                 sgd_steps=0,
                 knots_trainable=5,
                 batchsize=None,
                 nocuda=False,
                 patch=False,
                 shape=[28, 28, 1],
                 bounds=None):
        self.N = N
        self.n0 = n0

        self.model = model
        self.chain = chain

        # Init method params.
        self.init_samples = init_samples

        self.random_seed = random_seed

        # Set the torch seed.
        if self.random_seed != -1:
            np.random.seed(self.random_seed)
            torch.manual_seed(self.random_seed)

        # Separating out so I can keep track. These are SINF params.
        assert 0.0 <= frac_validate <= 1.0
        self.frac_validate = frac_validate
        self.iteration = iteration
        self.alpha_uw = alpha_uw
        self.alpha_w = alpha_w
        self.k_trunc = k_trunc
        self.verbose = verbose
        self.n_component = n_component
        self.interp_nbin = interp_nbin
        self.KDE = KDE
        self.bw_factors = np.linspace(bw_factor_min, bw_factor_max,
                                      bw_factor_num)
        self.edge_bins = edge_bins
        self.ndata_wT = ndata_wT
        self.MSWD_max_iter = MSWD_max_iter
        self.NBfirstlayer = NBfirstlayer
        self.logit = logit
        self.Whiten = Whiten
        self.batchsize = batchsize
        self.nocuda = nocuda
        self.patch = patch
        self.shape = shape

        # Convert bounds passed in as [[x1min, x2min, ...], [x1max, x2max, ...]]
        # to what SINF wants: [[x1min, x1max], [x2min, x2max], ...]
        if (bounds is not None):
            bounds_sinf = list([list(b) for b in bounds.T])
        else:
            bounds_sinf = [
                [None, None] for i in range(init_samples.shape[1])
            ]  #get the dimensionality from initial samples assuming (N,d) shape
        self.bounds = bounds_sinf

        #trainable sinf
        self.trainable_qw = trainable_qw
        self.sgd_steps = sgd_steps
        self.knots_trainable = knots_trainable

        #nfo
        self.t_ess = t_ess
        self.beta_max = beta_max
        self.beta = 0  #initial value of beta before iterating, match smc
        self.rel_bw = rel_bw

        self.model = modelcontext(model)
        self.variables = inputvars(self.model.vars)
Example #21
    def __init__(self,
                 vars=None,
                 scaling=None,
                 step_scale=0.25,
                 is_cov=False,
                 model=None,
                 blocked=True,
                 use_single_leapfrog=False,
                 potential=None,
                 integrator="leapfrog",
                 **theano_kwargs):
        """Superclass to implement Hamiltonian/hybrid monte carlo

        Parameters
        ----------
        vars : list of theano variables
        scaling : array_like, ndim = {1,2}
            Scaling for momentum distribution. 1d arrays are interpreted as the matrix diagonal.
        step_scale : float, default=0.25
            Size of steps to take, automatically scaled down by 1/n**(1/4)
        is_cov : bool, default=False
            Treat scaling as a covariance matrix/vector if True, else treat it as a
            precision matrix/vector
        model : pymc3 Model instance.  default=Context model
        blocked: Boolean, default True
        use_single_leapfrog: Boolean, default=False
            Whether leapfrog steps take a single step at a time.
        potential : Potential, optional
            An object that represents the Hamiltonian with `velocity`,
            `energy`, and `random` methods.
        **theano_kwargs: passed to theano functions
        """
        model = modelcontext(model)

        if vars is None:
            vars = model.cont_vars
        vars = inputvars(vars)

        if scaling is None and potential is None:
            size = sum(np.prod(var.dshape, dtype=int) for var in vars)
            mean = floatX(np.zeros(size))
            var = floatX(np.ones(size))
            potential = QuadPotentialDiagAdapt(size, mean, var, 10)

        if isinstance(scaling, dict):
            point = Point(scaling, model=model)
            scaling = guess_scaling(point, model=model, vars=vars)

        if scaling is not None and potential is not None:
            raise ValueError("Can not specify both potential and scaling.")

        self.step_size = step_scale / (model.ndim**0.25)
        if potential is not None:
            self.potential = potential
        else:
            self.potential = quad_potential(scaling, is_cov)

        shared = make_shared_replacements(vars, model)
        if theano_kwargs is None:
            theano_kwargs = {}

        self.H, self.compute_energy, self.compute_velocity, self.leapfrog, self.dlogp = get_theano_hamiltonian_functions(
            vars, shared, model.logpt, self.potential, use_single_leapfrog,
            integrator, **theano_kwargs)

        super(BaseHMC, self).__init__(vars, shared, blocked=blocked)
Example #22
def find_MAP(start=None,
             vars=None,
             method="L-BFGS-B",
             return_raw=False,
             include_transformed=True,
             progressbar=True,
             maxeval=5000,
             model=None,
             *args,
             **kwargs):
    """
    Finds the local maximum a posteriori point given a model.

    find_MAP should not be used to initialize the NUTS sampler. Simply call pymc3.sample() and it will automatically initialize NUTS in a better way.

    Parameters
    ----------
    start: `dict` of parameter values (Defaults to `model.test_point`)
    vars: list
        List of variables to optimize and set to optimum (Defaults to all continuous).
    method: string or callable
        Optimization algorithm (Defaults to 'L-BFGS-B' unless
        discrete variables are specified in `vars`, then
        `Powell` which will perform better).  For instructions on use of a callable,
        refer to SciPy's documentation of `optimize.minimize`.
    return_raw: bool
        Whether to return the full output of scipy.optimize.minimize (Defaults to `False`)
    include_transformed: bool, optional defaults to True
        Flag for reporting automatically transformed variables in addition
        to original variables.
    progressbar: bool, optional defaults to True
        Whether or not to display a progress bar in the command line.
    maxeval: int, optional, defaults to 5000
        The maximum number of times the posterior distribution is evaluated.
    model: Model (optional if in `with` context)
    *args, **kwargs
        Extra args passed to scipy.optimize.minimize

    Notes
    -----
    Older code examples used find_MAP() to initialize the NUTS sampler,
    but this is not an effective way of choosing starting values for sampling.
    As a result, we have greatly enhanced the initialization of NUTS and
    wrapped it inside pymc3.sample() and you should thus avoid this method.
    """
    model = modelcontext(model)
    if start is None:
        start = model.test_point
    else:
        update_start_vals(start, model.test_point, model)

    check_start_vals(start, model)

    if vars is None:
        vars = model.cont_vars
    vars = inputvars(vars)
    disc_vars = list(typefilter(vars, discrete_types))
    allinmodel(vars, model)

    start = Point(start, model=model)
    bij = DictToArrayBijection(ArrayOrdering(vars), start)
    logp_func = bij.mapf(model.fastlogp_nojac)
    x0 = bij.map(start)

    try:
        dlogp_func = bij.mapf(model.fastdlogp_nojac(vars))
        compute_gradient = True
    except (AttributeError, NotImplementedError, tg.NullTypeGradError):
        compute_gradient = False

    if disc_vars or not compute_gradient:
        pm._log.warning(
            "Warning: gradient not available "
            "(e.g. vars contains discrete variables). MAP "
            "estimates may not be accurate for the default "
            "parameters. Defaulting to non-gradient minimization "
            "'Powell'.")
        method = "Powell"

    if "fmin" in kwargs:
        fmin = kwargs.pop("fmin")
        warnings.warn(
            "In future versions, set the optimization algorithm with a string. "
            'For example, use `method="L-BFGS-B"` instead of '
            '`fmin=sp.optimize.fmin_l_bfgs_b`.')

        cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)

        # Check to see if minimization function actually uses the gradient
        if "fprime" in getargspec(fmin).args:

            def grad_logp(point):
                return nan_to_num(-dlogp_func(point))

            opt_result = fmin(cost_func, x0, fprime=grad_logp, *args, **kwargs)
        else:
            # Check to see if minimization function uses a starting value
            if "x0" in getargspec(fmin).args:
                opt_result = fmin(cost_func, x0, *args, **kwargs)
            else:
                opt_result = fmin(cost_func, *args, **kwargs)

        if isinstance(opt_result, tuple):
            mx0 = opt_result[0]
        else:
            mx0 = opt_result
    else:
        # TODO: once the deprecated `fmin` argument is removed, keep only
        # this branch.
        if compute_gradient:
            cost_func = CostFuncWrapper(maxeval, progressbar, logp_func,
                                        dlogp_func)
        else:
            cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)

        try:
            opt_result = minimize(cost_func,
                                  x0,
                                  method=method,
                                  jac=compute_gradient,
                                  *args,
                                  **kwargs)
            mx0 = opt_result["x"]  # optimized parameter array
        except (KeyboardInterrupt, StopIteration) as e:
            mx0, opt_result = cost_func.previous_x, None
            if isinstance(e, StopIteration):
                pm._log.info(e)
        finally:
            last_v = cost_func.n_eval
            if progressbar:
                assert isinstance(cost_func.progress, ProgressBar)
                cost_func.progress.total = last_v
                cost_func.progress.update(last_v)
                print()

    vars = get_default_varnames(model.unobserved_RVs, include_transformed)
    mx = {
        var.name: value
        for var, value in zip(vars,
                              model.fastfn(vars)(bij.rmap(mx0)))
    }

    if return_raw:
        return mx, opt_result
    else:
        return mx
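# Usage sketch (model and data are illustrative, not part of the original
# snippet) showing the two return conventions of find_MAP defined above.
import numpy as np
import pymc3 as pm

y = np.array([1.6, 1.4])
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=y)

    # Default: a dict mapping variable names to their MAP values.
    map_estimate = pm.find_MAP()

    # return_raw=True additionally returns the scipy OptimizeResult.
    map_estimate, opt_result = pm.find_MAP(return_raw=True)

print(map_estimate['mu'])  # close to the posterior mode of mu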
Example #23
0
    def __init__(
        self,
        draws=2000,
        start=None,
        threshold=0.5,
        model=None,
        random_seed=-1,
        chain=0,
        frac_validate=0.1,
        iteration=None,
        alpha=(0, 0),
        k_trunc=0.5,
        pareto=False,
        epsilon=1e-3,
        local_thresh=3,
        local_step_size=0.1,
        local_grad=True,
        nf_local_iter=0,
        max_line_search=2,
        verbose=False,
        n_component=None,
        interp_nbin=None,
        KDE=True,
        bw_factor=0.5,
        edge_bins=None,
        ndata_wT=None,
        MSWD_max_iter=None,
        NBfirstlayer=True,
        logit=False,
        Whiten=False,
        batchsize=None,
        nocuda=False,
        patch=False,
        shape=[28, 28, 1],
    ):

        self.draws = draws
        self.start = start
        self.threshold = threshold
        self.model = model
        self.random_seed = random_seed
        self.chain = chain
        self.frac_validate = frac_validate
        self.iteration = iteration
        self.alpha = alpha
        self.k_trunc = k_trunc
        self.pareto = pareto
        self.epsilon = epsilon

        self.local_thresh = local_thresh
        self.local_step_size = local_step_size
        self.local_grad = local_grad
        self.nf_local_iter = nf_local_iter
        self.max_line_search = max_line_search

        self.verbose = verbose
        self.n_component = n_component
        self.interp_nbin = interp_nbin
        self.KDE = KDE
        self.bw_factor = bw_factor
        self.edge_bins = edge_bins
        self.ndata_wT = ndata_wT
        self.MSWD_max_iter = MSWD_max_iter
        self.NBfirstlayer = NBfirstlayer
        self.logit = logit
        self.Whiten = Whiten
        self.batchsize = batchsize
        self.nocuda = nocuda
        self.patch = patch
        self.shape = shape

        self.model = modelcontext(model)

        if self.random_seed != -1:
            np.random.seed(self.random_seed)

        self.beta = 0
        self.variables = inputvars(self.model.vars)
        self.weights = np.ones(self.draws) / self.draws
        #self.sinf_logq = np.array([])
        self.log_marginal_likelihood = 0
Example #24
0
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
                   minibatch_RVs=None, minibatch_tensors=None,
                   minibatches=None, local_RVs=None, observed_RVs=None,
                   encoder_params=None, total_size=None, optimizer=None,
                   learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch ADVI with the meanfield
    approximation. Autoencoding variational inference is also supported.

    The log probability terms for mini-batches, corresponding to RVs in
    minibatch_RVs, are scaled to (total_size) / (the number of samples in each
    mini-batch), where total_size is an argument for the total data size.

    minibatch_tensors is a list of tensors (can be shared variables) to which
    mini-batch samples are set during the optimization. In most cases, these
    tensors are observations for RVs in the model.

    local_RVs and observed_RVs are used for autoencoding variational Bayes.
    Both of these RVs are associated with each of the given samples.
    The difference is that local_RVs are unknown and their posterior
    distributions are approximated.

    local_RVs is an OrderedDict whose keys are RVs and whose values are
    tuples of two objects. The first is the Theano expression of the
    variational parameters (mean and log of the standard deviation) of the
    approximate posterior, which are encoded from the given samples by an
    arbitrary deterministic function, e.g., an MLP. The other is a scaling
    constant to be multiplied with the log probability term corresponding
    to the RV.

    observed_RVs is also an OrderedDict with RVs as keys, but its values
    are only the scaling constants, as in local_RVs. In this case,
    total_size is ignored.

    If local_RVs is None (thus not using autoencoder), the following two
    settings are equivalent:

    - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)])
    - minibatch_RVs=[rv], total_size=total_size

    where minibatch_size is minibatch_tensors[0].shape[0].

    The variational parameters and the parameters of the autoencoder are
    simultaneously optimized with given optimizer, which is a function that
    returns a dictionary of parameter updates as provided to Theano function.
    See the docstring of pymc3.variational.advi().

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same with the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    local_RVs : Ordered dict
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        an optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    """
    theano.config.compute_test_value = 'ignore'

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point
    check_discrete_rvs(vars)
    _check_minibatches(minibatch_tensors, minibatches)
    
    if encoder_params is None:
        encoder_params = []

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    ds = model.deterministics

    def get_transformed(v):
        if v in ds:
            return v.transformed
        return v
    local_RVs = OrderedDict(
        [(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()]
    )

    # Get global variables
    global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))

    # Ordering for concatenation of random variables
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])

    # ELBO wrt variational parameters
    inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order)
    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model)
    replace = replace_g
    replace.update(replace_l)
    logp = theano.clone(logpt, replace, strict=False)
    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l,
                   n_mcsamples, random_seed)
    del logpt

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs,
                                                       global_order)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)
    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)}
    )
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        elbos[i] = e
        if n < 10:
            # Guard against n // 10 == 0 for very short runs.
            progress.set_description('ELBO = {:,.2f}'.format(elbos[i]))
        elif i % (n // 10) == 0 and i > 0:
            avg_elbo = elbos[i - n // 10:i].mean()
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
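# Hedged usage sketch (data, shapes, and the generator are illustrative
# assumptions) of the backward-compatible calling convention described in
# the docstring above: `minibatch_RVs` together with `total_size` stands in
# for an explicit `observed_RVs` scaling of total_size / minibatch_size.
import numpy as np
import theano
import pymc3 as pm

total_size, minibatch_size = 10000, 100
data = np.random.randn(total_size)

# Shared variable that receives a fresh mini-batch on every iteration.
data_t = theano.shared(data[:minibatch_size])

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    y = pm.Normal('y', mu=mu, sd=1., observed=data_t)

def minibatch_gen():
    while True:
        idx = np.random.randint(0, total_size, minibatch_size)
        yield [data[idx]]

fit = advi_minibatch(
    model=model, n=1000,
    minibatch_RVs=[y], minibatch_tensors=[data_t],
    minibatches=minibatch_gen(), total_size=total_size)
# ADVIFit named tuple: variational means, stds, and per-iteration ELBOs.
print(fit.means['mu'], fit.stds['mu'])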
Example #25
0
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
                   minibatch_RVs=None, minibatch_tensors=None,
                   minibatches=None, global_RVs=None, local_RVs=None,
                   observed_RVs=None, encoder_params=None, total_size=None,
                   optimizer=None, learning_rate=.001, epsilon=.1,
                   random_seed=None, mode=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch automatic differentiation variational
    inference (ADVI; Kucukelbir et al., 2015) with the meanfield
    approximation. Autoencoding variational Bayes (AEVB; Kingma and Welling,
    2014) is also supported.

    For explanation, we classify random variables in probabilistic models into
    three types. Observed random variables
    :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations.
    Each :math:`\mathbf{y}_{i}` can be a set of observed random variables,
    i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where
    :math:`V_{o}` is the number of the types of observed random variables
    in the model.

    The next ones are global random variables
    :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
    the probabilities for all observed samples.

    The last ones are local random variables
    :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where
    :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`.
    These RVs are used only in AEVB.

    The goal of ADVI is to approximate the posterior distribution
    :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior
    :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms
    are normal distributions (mean-field approximation).

    :math:`q(\Theta)` is parametrized with its means and standard deviations.
    These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
    a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
    each observation. Therefore these parameters are denoted as
    :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters
    of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a
    multilayer perceptron or convolutional neural network.

    In addition to :math:`\\xi(\cdot)`, we can also include deterministic
    mappings for the likelihood of observations. We denote the parameters of
    the deterministic mappings as :math:`\eta`. An example of such mappings is
    the deconvolutional neural network used in the convolutional VAE example
    in the PyMC3 notebook directory.

    This function maximizes the evidence lower bound (ELBO)
    :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows:

    .. math::

        {\cal L}(\gamma,\\nu,\eta) & =
        \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
        \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
        \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
        \\right]\\right] \\\\ &
        - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right]
        - \mathbf{c}_{l}\sum_{i=1}^{N}
            KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right],

    where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence

    .. math::

        KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv,

    :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
    More precisely, we can write each of the terms in ELBO as follows:

    .. math::

        \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
        \sum_{k=1}^{V_{o}}c_{o}^{k}
            \log p(\mathbf{y}_{i}^{k}|
                   {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\
        \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = &
        \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
            q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\
        \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\\right] & = &
        \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
            q(\mathbf{z}_{i}^{k})||
            p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right],

    where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v`
    in the directed acyclic graph of the model.

    When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
    set to :math:`N/M`, where :math:`M` is the number of observations in each
    mini-batch. Another weighting scheme was proposed in
    (Blundell et al., 2015) for accelerating model fitting.

    For working with ADVI, we need to give the probabilistic model
    (:code:`model`), the three types of RVs (:code:`observed_RVs`,
    :code:`global_RVs` and :code:`local_RVs`), the tensors to which
    mini-batched samples are supplied (:code:`minibatches`) and
    parameters of deterministic mappings :math:`\\xi` and :math:`\eta`
    (:code:`encoder_params`) as input arguments.

    :code:`observed_RVs` is an :code:`OrderedDict` of the form
    :code:`{y_k: c_k}`, where :code:`y_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{o}^{k}`) and it can be a
    shared variable.

    :code:`global_RVs` is an :code:`OrderedDict` of the form
    :code:`{t_k: c_k}`, where :code:`t_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{g}^{k}`) and it can be a
    shared variable.

    :code:`local_RVs` is an :code:`OrderedDict` of the form
    :code:`{z_k: ((m_k, s_k), c_k)}`, where :code:`z_k` is a random variable
    defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{l}^{k}`)
    and it can be a shared variable. :code:`(m_k, s_k)` is a pair of tensors
    of means and log standard deviations of the variational distribution;
    samples drawn from the variational distribution replaces :code:`z_k`.
    It should be noted that if :code:`z_k` has a transformation that changes
    the dimension (e.g., StickBreakingTransform), the variational distribution
    must have the same dimension. For example, if :code:`z_k` follows a
    Dirichlet distribution with :code:`p` choices, :code:`m_k` and
    :code:`s_k` have the shape :code:`(n_samples_in_minibatch, p - 1)`.

    :code:`minibatch_tensors` is a list of tensors (can be shared variables)
    to which mini-batch samples are set during the optimization.
    These tensors are observations (:code:`obs=`) in :code:`observed_RVs`.

    :code:`minibatches` is a generator of a list of :code:`numpy.ndarray`.
    Each item of the list will be set to tensors in :code:`minibatch_tensors`.

    :code:`encoder_params` is a list of shared variables of the parameters
    :math:`\\nu` and :math:`\eta`. We do not need to include the variational
    parameters of the global variables, :math:`\gamma`, because these are
    automatically created and updated in this function.

    The following is a list of example notebooks using advi_minibatch:

    - docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb
    - docs/source/notebooks/bayesian_neural_network_advi.ipynb
    - docs/source/notebooks/convolutional_vae_keras_advi.ipynb
    - docs/source/notebooks/gaussian-mixture-model-advi.ipynb
    - docs/source/notebooks/lda-advi-aevb.ipynb

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same with the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description.
    global_RVs : Ordered dict or None
        Include a scaling constant for the corresponding RV. See the above
        description. If :code:`None`, it is set to
        :code:`{v: 1 for v in grvs}`, where :code:`grvs` is
        :code:`list(set(vars) - set(list(local_RVs) + list(observed_RVs)))`.
    local_RVs : Ordered dict or None
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    References
    ----------
    - Kingma, D. P., & Welling, M. (2014).
      Auto-Encoding Variational Bayes. stat, 1050, 1.
    - Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015).
      Automatic variational inference in Stan. In Advances in neural
      information processing systems (pp. 568-576).
    - Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015).
      Weight Uncertainty in Neural Network. In Proceedings of the 32nd
      International Conference on Machine Learning (ICML-15) (pp. 1613-1622).
    """
    import warnings
    warnings.warn('The old ADVI interface is deprecated and will be removed '
                  'in a future release; use pm.ADVI instead.',
                  DeprecationWarning, stacklevel=2)
    if encoder_params is None:
        encoder_params = []

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    _check_minibatches(minibatch_tensors, minibatches)

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    def get_transformed(v):
        if hasattr(v, 'transformed'):
            return v.transformed
        return v
    local_RVs = OrderedDict(
        [(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()]
    )

    # Get global variables
    grvs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))
    if global_RVs is None:
        global_RVs = OrderedDict({v: 1 for v in grvs})
    _value_error(len(grvs) == len(global_RVs),
                 'global_RVs ({}) must have all global RVs: {}'.format(
                     [v for v in global_RVs], grvs)
    )

    # ELBO wrt variational parameters
    elbo, uw_l, uw_g = _make_elbo_t(observed_RVs, global_RVs, local_RVs,
                                    model.potentials, n_mcsamples, random_seed)

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)
    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)}
    )
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates, mode=mode)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        if np.isnan(e):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        elbos[i] = e
        if n < 10:
            progress.set_description('ELBO = {:,.2f}'.format(elbos[i]))
        elif i % (n // 10) == 0 and i > 0:
            avg_elbo = infmean(elbos[i - n // 10:i])
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
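# For comparison, a hedged sketch (again with illustrative data) of the
# explicit OrderedDict form documented above, where the observation term is
# scaled by c_o = N / M and the single global RV by c_g = 1.
from collections import OrderedDict

import numpy as np
import theano
import pymc3 as pm

N, M = 10000, 100
data = np.random.randn(N)
data_t = theano.shared(data[:M])

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    y = pm.Normal('y', mu=mu, sd=1., observed=data_t)

def minibatch_gen():
    while True:
        yield [data[np.random.randint(0, N, M)]]

fit = advi_minibatch(
    model=model, n=1000,
    observed_RVs=OrderedDict([(y, N / M)]),   # c_o = N / M
    global_RVs=OrderedDict([(mu, 1)]),        # c_g = 1
    minibatch_tensors=[data_t],
    minibatches=minibatch_gen())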
Example #26
0
    def __init__(self, vars=None, out_vars=None, covariance=None, scale=1.,
                 n_chains=100, tune=True, tune_interval=100, model=None,
                 check_bound=True, likelihood_name='like', backend='csv',
                 proposal_name='MultivariateNormal', **kwargs):

        model = modelcontext(model)

        if vars is None:
            vars = model.vars

        vars = inputvars(vars)

        if out_vars is None:
            out_vars = model.unobserved_RVs

        out_varnames = [out_var.name for out_var in out_vars]

        self.scaling = utility.scalar2floatX(num.atleast_1d(scale))

        self.tune = tune
        self.check_bound = check_bound
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval

        self.stage_sample = 0
        self.cumulative_samples = 0
        self.accepted = 0

        self.beta = 1.
        self.stage = 0
        self.chain_index = 0

        # needed to use the same parallel implementation function as for SMC
        self.resampling_indexes = num.arange(n_chains)
        self.n_chains = n_chains

        self.likelihood_name = likelihood_name
        self._llk_index = out_varnames.index(likelihood_name)
        self.backend = backend
        self.discrete = num.concatenate(
            [[v.dtype in discrete_types] * (v.dsize or 1) for v in vars])
        self.any_discrete = self.discrete.any()
        self.all_discrete = self.discrete.all()

        # create initial population
        self.population = []
        self.array_population = num.zeros(n_chains)
        logger.info('Creating initial population for {}'
                    ' chains ...'.format(self.n_chains))
        for i in range(self.n_chains):
            self.population.append(
                Point({v.name: v.random() for v in vars}, model=model))

        self.population[0] = model.test_point

        shared = make_shared_replacements(vars, model)
        self.logp_forw = logp_forw(out_vars, vars, shared)
        self.check_bnd = logp_forw([model.varlogpt], vars, shared)

        super(Metropolis, self).__init__(vars, out_vars, shared)

        # init proposal
        if covariance is None and proposal_name in multivariate_proposals:
            t0 = time()
            self.covariance = init_proposal_covariance(
                bij=self.bij, vars=vars, model=model, pop_size=1000)
            t1 = time()
            logger.info('Time for proposal covariance init: %f' % (t1 - t0))
            scale = self.covariance
        elif covariance is None:
            scale = num.ones(sum(v.dsize for v in vars))
        else:
            scale = covariance

        self.proposal_name = proposal_name
        self.proposal_dist = choose_proposal(
            self.proposal_name, scale=scale)
        self.proposal_samples_array = self.proposal_dist(n_chains)

        self.chain_previous_lpoint = [[]] * self.n_chains
        self._tps = None