Example #1
    def test_nest_context_works(self):
        with pm.Model() as m:
            new = NewModel()
            with new:
                assert pm.modelcontext(None) is new
            assert pm.modelcontext(None) is m
        assert 'v1' in m.named_vars
        assert 'v2' in m.named_vars
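The test above relies on the context-stack behaviour of pm.modelcontext. As a quick orientation, here is a minimal, self-contained sketch (not taken from the test suite; the model and variable names are made up) showing how modelcontext(None) resolves the active model inside a with block, and why the model must be passed explicitly outside of one.

import pymc3 as pm

with pm.Model() as m:
    x = pm.Normal('x', mu=0.0, sigma=1.0)
    # Inside the context, modelcontext(None) returns the active model.
    assert pm.modelcontext(None) is m

# Outside the context there is no active model, so it must be passed explicitly.
assert pm.modelcontext(m) is m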
Example #2
    def __init__(self, vars=None, S=None, proposal_dist=NormalProposal, scaling=1.,
                 tune=True, tune_interval=100, model=None, **kwargs):

        model = pm.modelcontext(model)

        if vars is None:
            vars = model.vars
        vars = pm.inputvars(vars)

        if S is None:
            S = np.ones(sum(v.dsize for v in vars))
        self.proposal_dist = proposal_dist(S)
        self.scaling = np.atleast_1d(scaling)
        self.tune = tune
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval
        self.accepted = 0

        # Determine type of variables
        self.discrete = np.concatenate(
            [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
        self.any_discrete = self.discrete.any()
        self.all_discrete = self.discrete.all()

        shared = pm.make_shared_replacements(vars, model)
        self.delta_logp = delta_logp(model.logpt, vars, shared)
        super(Metropolis, self).__init__(vars, shared)
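Since super(Metropolis, self) identifies this as PyMC3's Metropolis step method, a typical call site looks like the hedged sketch below. The toy model is hypothetical, and model=None is resolved through pm.modelcontext exactly as in the __init__ above.

import numpy as np
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0.0, sd=10.0)
    pm.Normal('obs', mu=mu, sd=1.0, observed=np.random.randn(50))
    step = pm.Metropolis()               # model=None -> taken from the context
    trace = pm.sample(2000, tune=1000, step=step)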
Example #3
def get_citations_for_model(model=None, width=79):
    """Get the citations for the components used an exoplanet PyMC3

    Returns: The acknowledgement text for exoplanet and its dependencies and a
    string containing the BibTeX entries for the citations in the
    acknowledgement.

    """
    model = pm.modelcontext(model)
    if not hasattr(model, "__citations__"):
        logging.warning("no citations registered with model")
        return "", ""

    cite = list(CITATIONS["exoplanet"][0]) + \
        list(CITATIONS["pymc3"][0]) + \
        list(CITATIONS["theano"][0])
    bib = [CITATIONS["exoplanet"][1], CITATIONS["pymc3"][1],
           CITATIONS["theano"][1]]
    for k, v in model.__citations__.items():
        cite += list(v[0])
        bib.append(v[1])

    txt = (r"This research made use of \textsf{{exoplanet}} "
           r"\citep{{exoplanet}} and its dependencies \citep{{{0}}}.")
    txt = txt.format(", ".join(sorted(cite)))
    txt = textwrap.wrap(txt, width=width)

    return "\n".join(txt), "\n".join(bib)
Example #4
def init_nuts(init='advi', n_init=500000, model=None, **kwargs):
    """Initialize and sample from posterior of a continuous model.

    This is a convenience function. NUTS convergence and sampling speed is extremely
    dependent on the choice of mass/scaling matrix. In our experience, using ADVI
    to estimate a diagonal covariance matrix and using this as the scaling matrix
    produces robust results over a wide class of continuous models.

    Parameters
    ----------
    init : str {'advi', 'advi_map', 'map', 'nuts'}
        Initialization method to use.
        * advi : Run ADVI to estimate posterior mean and diagonal covariance matrix.
        * advi_map: Initialize ADVI with MAP and use MAP as starting point.
        * map : Use the MAP as starting point.
        * nuts : Run NUTS and estimate posterior mean and covariance matrix.
    n_init : int
        Number of iterations of initializer
        If 'advi', number of iterations, if 'nuts', number of draws.
    model : Model (optional if in `with` context)
    **kwargs : keyword arguments
        Extra keyword arguments are forwarded to pymc3.NUTS.

    Returns
    -------
    start, nuts_sampler

    start : pymc3.model.Point
        Starting point for sampler
    nuts_sampler : pymc3.step_methods.NUTS
        Instantiated and initialized NUTS sampler object
    """

    model = pm.modelcontext(model)

    pm._log.info('Initializing NUTS using {}...'.format(init))

    if init == 'advi':
        v_params = pm.variational.advi(n=n_init)
        start = pm.variational.sample_vp(v_params, 1, progressbar=False)[0]
        cov = np.power(model.dict_to_array(v_params.stds), 2)
    elif init == 'advi_map':
        start = pm.find_MAP()
        v_params = pm.variational.advi(n=n_init, start=start)
        cov = np.power(model.dict_to_array(v_params.stds), 2)
    elif init == 'map':
        start = pm.find_MAP()
        cov = pm.find_hessian(point=start)

    elif init == 'nuts':
        init_trace = pm.sample(step=pm.NUTS(), draws=n_init)
        cov = pm.trace_cov(init_trace[n_init//2:])

        start = {varname: np.mean(init_trace[varname]) for varname in init_trace.varnames}
    else:
        raise NotImplementedError('Initializer {} is not supported.'.format(init))

    step = pm.NUTS(scaling=cov, is_cov=True, **kwargs)

    return start, step
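A hedged sketch of how this (older-API) initializer is typically used: the returned starting point and step object are handed straight to pm.sample. The toy model is hypothetical and n_init is kept small for illustration.

import numpy as np
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0.0, sd=1.0)
    pm.Normal('obs', mu=mu, sd=1.0, observed=np.random.randn(100))

    start, step = init_nuts(init='advi', n_init=20000)
    trace = pm.sample(1000, step=step, start=start)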
Example #5
    def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001,
                 tune=True, tune_interval=100, model=None, mode=None, **kwargs):
        warnings.warn('Population based sampling methods such as DEMetropolis are experimental.' \
            ' Use carefully and be extra critical about their results!')

        model = pm.modelcontext(model)

        if vars is None:
            vars = model.cont_vars
        vars = pm.inputvars(vars)

        if S is None:
            S = np.ones(sum(v.dsize for v in vars))

        if proposal_dist is not None:
            self.proposal_dist = proposal_dist(S)
        else:
            self.proposal_dist = UniformProposal(S)

        self.scaling = np.atleast_1d(scaling).astype('d')
        if lamb is None:
            lamb = 2.38 / np.sqrt(2 * S.size)
        self.lamb = float(lamb)
        self.tune = tune
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval
        self.accepted = 0

        self.mode = mode

        shared = pm.make_shared_replacements(vars, model)
        self.delta_logp = delta_logp(model.logpt, vars, shared)
        super().__init__(vars, shared)
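DEMetropolis is a population sampler, so it needs several chains evolving together. A hedged sketch, assuming the class above is PyMC3's DEMetropolis (as the warning text suggests) and using a hypothetical toy model:

import pymc3 as pm

with pm.Model():
    x = pm.Normal('x', mu=0.0, sigma=1.0, shape=3)
    step = pm.DEMetropolis()             # model resolved from the context
    trace = pm.sample(draws=5000, tune=1000, step=step, chains=8)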
Example #6
def fit(n=10000, local_rv=None, method='advi', model=None, **kwargs):
    """
    Handy shortcut for using inference methods in a functional way

    Parameters
    ----------
    n : int
        number of iterations
    local_rv : dict
        mapping {model_variable -> local_variable}
        Local Vars are used for Autoencoding Variational Bayes
        See (AEVB; Kingma and Welling, 2014) for details
    method : str or Inference
        string name is case insensitive in {'advi', 'fullrank_advi', 'svgd', 'advi->fullrank_advi'}
    model : Model
    kwargs : kwargs for Inference.fit
    frac : float
        if method is 'advi->fullrank_advi', the fraction of iterations spent on the ADVI stage

    Returns
    -------
    Approximation
    """
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(
        advi=ADVI,
        fullrank_advi=FullRankADVI,
        svgd=SVGD
    )
    if isinstance(method, str) and method.lower() == 'advi->fullrank_advi':
        frac = kwargs.pop('frac', .5)
        if not 0. < frac < 1.:
            raise ValueError('frac should be in (0, 1)')
        n1 = int(n * frac)
        n2 = n-n1
        inference = ADVI(local_rv=local_rv, model=model)
        logger.info('fitting advi ...')
        inference.fit(n1, **kwargs)
        inference = FullRankADVI.from_advi(inference)
        logger.info('fitting fullrank advi ...')
        return inference.fit(n2, **kwargs)

    elif isinstance(method, str):
        try:
            inference = _select[method.lower()](
                local_rv=local_rv, model=model
            )
        except KeyError:
            raise KeyError('method should be one of %s '
                           'or Inference instance' %
                           set(_select.keys()))
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError('method should be one of %s '
                        'or Inference instance' %
                        set(_select.keys()))
    return inference.fit(n, **kwargs)
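A hedged sketch of the functional shortcut in use (hypothetical model): the model is resolved from the context, and the returned Approximation can then be sampled from.

import numpy as np
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0.0, sd=1.0)
    pm.Normal('obs', mu=mu, sd=1.0, observed=np.random.randn(200))
    approx = fit(n=10000, method='advi')

trace = approx.sample(1000)   # draws from the fitted variational posterior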
Example #7
def add_citations_to_model(citations, model=None):
    try:
        model = pm.modelcontext(model)
        if not hasattr(model, "__citations__"):
            model.__citations__ = dict()
        for k in citations:
            model.__citations__[k] = CITATIONS[k]

    except TypeError:
        pass
Example #8
    def __init__(self, name='', model=None):
        super().__init__(name, model)
        assert pm.modelcontext(None) is self
        # 1) init variables with the Var method
        self.Var('v1', pm.Normal.dist())
        self.v2 = pm.Normal('v2', mu=0, sigma=1)
        # 2) Potentials and Deterministic variables can be added with the pm
        #    functions too; make sure their names do not overlap with those of
        #    other models
        pm.Deterministic('d', tt.constant(1))
        pm.Potential('p', tt.constant(1))
Example #9
def run_ppc(trace, samples=100, model=None):
    """Generate Posterior Predictive samples from a model given a trace.
    """
    if model is None:
        model = pm.modelcontext(model)

    ppc = defaultdict(list)
    for idx in np.random.randint(0, len(trace), samples):
        param = trace[idx]
        for obs in model.observed_RVs:
            ppc[obs.name].append(round(obs.distribution.random(point=param)))

    return ppc
Example #10
def model_to_graphviz(model=None):
    """Produce a graphviz Digraph from a PyMC3 model.

    Requires graphviz, which may be installed most easily with
        conda install -c conda-forge python-graphviz

    Alternatively, you may install the `graphviz` binaries yourself,
    and then `pip install graphviz` to get the python bindings.  See
    http://graphviz.readthedocs.io/en/stable/manual.html
    for more information.
    """
    model = pm.modelcontext(model)
    return ModelGraph(model).make_graph()
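A hedged usage sketch (hypothetical model): the returned graphviz.Digraph can be rendered to a file, provided the graphviz binaries mentioned in the docstring are installed.

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0.0, sigma=1.0)
    pm.Normal('obs', mu=mu, sigma=1.0, observed=np.random.randn(10))

graph = model_to_graphviz(model)
graph.render('model_graph', format='png', cleanup=True)   # writes model_graph.png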
Example #11
    def __init__(self, vars=None, covariance=None, scaling=1., n_chains=100,
                 tune=True, tune_interval=100, model=None, check_bound=True,
                 likelihood_name='like', proposal_dist=MvNPd,
                 coef_variation=1., **kwargs):

        model = pm.modelcontext(model)

        if vars is None:
            vars = model.vars
        vars = pm.inputvars(vars)

        if covariance is None:
            self.covariance = np.eye(sum(v.dsize for v in vars))
        else:
            self.covariance = covariance
        self.scaling = np.atleast_1d(scaling)
        self.tune = tune
        self.check_bnd = check_bound
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval

        self.proposal_dist = proposal_dist(self.covariance)
        self.proposal_samples_array = self.proposal_dist(n_chains)

        self.stage_sample = 0
        self.accepted = 0

        self.beta = 0
        self.stage = 0
        self.coef_variation = coef_variation
        self.n_chains = n_chains
        self.likelihoods = []
        self.likelihood_name = likelihood_name
        self.discrete = np.concatenate(
            [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
        self.any_discrete = self.discrete.any()
        self.all_discrete = self.discrete.all()

        # create initial population
        self.population = []
        self.array_population = np.zeros(n_chains)
        for i in range(self.n_chains):
            dummy = pm.Point({v.name: v.random() for v in vars},
                             model=model)
            self.population.append(dummy)

        shared = make_shared_replacements(vars, model)
        self.logp_forw = logp_forw(model.logpt, vars, shared)
        self.check_bnd = logp_forw(model.varlogpt, vars, shared)
        self.delta_logp = pm.metropolis.delta_logp(model.logpt, vars, shared)

        super(ATMCMC, self).__init__(vars, shared)
Example #12
    def get_step_for_trace(self, trace=None, model=None,
                           regular_window=0, regular_variance=1e-3,
                           **kwargs):
        """Get a PyMC3 NUTS step tuned for a given burn-in trace

        Args:
            trace: The ``MultiTrace`` output from a previous run of
                ``pymc3.sample``.
            regular_window: The weight (in units of number of steps) to use
                when regularizing the mass matrix estimate.
            regular_variance: The amplitude of the regularization for the mass
                matrix. This will be added to the diagonal of the covariance
                matrix with weight given by ``regular_window``.

        """
        model = pm.modelcontext(model)

        # If not given, use the trivial metric
        if trace is None or model.ndim == 1:
            potential = quad.QuadPotentialDiag(np.ones(model.ndim))

        else:
            # Loop over samples and convert to the relevant parameter space;
            # I'm sure that there's an easier way to do this, but I don't know
            # how to make something work in general...
            N = len(trace) * trace.nchains
            samples = np.empty((N, model.ndim))
            i = 0
            for chain in trace._straces.values():
                for p in chain:
                    samples[i] = model.bijection.map(p)
                    i += 1

            if self.dense:
                # Compute the regularized sample covariance
                cov = np.cov(samples, rowvar=0)
                if regular_window > 0:
                    cov = cov * N / (N + regular_window)
                    cov[np.diag_indices_from(cov)] += \
                        regular_variance * regular_window / (N+regular_window)
                potential = quad.QuadPotentialFull(cov)
            else:
                var = np.var(samples, axis=0)
                if regular_window > 0:
                    var = var * N / (N + regular_window)
                    var += \
                        regular_variance * regular_window / (N+regular_window)
                potential = quad.QuadPotentialDiag(var)

        return pm.NUTS(potential=potential, **kwargs)
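The method above lives on a tuning helper class that is not shown here, so the standalone sketch below illustrates the same idea under stated assumptions: estimate a dense mass matrix from a short burn-in trace and hand it to NUTS. It assumes ``quad`` refers to pymc3.step_methods.hmc.quadpotential, mirrors the trace-to-array loop from the method, and uses a hypothetical toy model.

import numpy as np
import pymc3 as pm
from pymc3.step_methods.hmc import quadpotential as quad

with pm.Model() as model:
    x = pm.Normal('x', mu=0.0, sigma=1.0, shape=3)

    # Short burn-in run with the default settings.
    burnin = pm.sample(draws=500, tune=500, chains=2)

    # Map every posterior point to the transformed parameter vector,
    # exactly as in get_step_for_trace above.
    samples = np.array([model.bijection.map(p)
                        for chain in burnin._straces.values()
                        for p in chain])
    cov = np.cov(samples, rowvar=0)

    # Restart sampling with a dense mass matrix estimated from the burn-in.
    step = pm.NUTS(potential=quad.QuadPotentialFull(cov))
    trace = pm.sample(draws=2000, tune=1000, step=step)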
Example #13
    def tune(self, tune=1000, start=None, step_kwargs=None, **kwargs):
        """Run the full tuning run for the mass matrix

        This will run ``start`` steps of warmup followed by tuning windows of
        exponentially increasing length to tune the mass matrix.

        Args:
            tune (int): The total number of steps to run.

        """
        model = pm.modelcontext(kwargs.get("model", None))

        ntot = self.start + self.window + self.finish
        if tune < ntot:
            raise ValueError("'tune' must be at least {0}".format(ntot) +
                             "(start + window + finish)")

        self.count = 0
        self.warmup(start=start, step_kwargs=step_kwargs, **kwargs)
        steps = self.window
        trace = None
        while self.count < tune:
            trace = self.extend_tune(start=start, step_kwargs=step_kwargs,
                                     steps=steps, trace=trace, **kwargs)
            steps *= 2
            if self.count + steps + steps*2 > tune:
                steps = tune - self.count

        # Final tuning stage for step size
        self.extend_tune(start=start, step_kwargs=step_kwargs,
                         steps=self.finish, trace=trace, **kwargs)

        # Copy across the step size from the parallel runs
        self._current_step.stop_tuning()
        expected = []
        for chain in self._current_trace._straces.values():
            expected.append(chain.get_sampler_stats("step_size")[-1])

        step = self._current_step
        if step_kwargs is None:
            step_kwargs = dict()
        else:
            step_kwargs = dict(step_kwargs)
        step_kwargs["model"] = model
        step_kwargs["step_scale"] = np.mean(expected) * model.ndim ** 0.25
        step_kwargs["adapt_step_size"] = False
        step_kwargs["potential"] = step.potential
        self._current_step = pm.NUTS(**step_kwargs)
        return self._current_trace
Example #14
    def __init__(self, vars, scaling=1., tune=True, tune_interval=100, model=None):

        model = pm.modelcontext(model)

        self.scaling = scaling
        self.tune = tune
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval
        self.accepted = 0

        if not all([v.dtype in pm.discrete_types for v in vars]):
            raise ValueError(
                'All variables must be Bernoulli for BinaryMetropolis')

        super(BinaryMetropolis, self).__init__(vars, [model.fastlogp])
Example #15
    def _get_priors(self, model=None):
        """Return prior distributions of the likelihood.

        Returns
        -------
        dict : mapping name -> pymc3 distribution
        """
        model = pymc3.modelcontext(model)
        priors = {}
        for key, val in self.priors.items():
            if isinstance(val, numbers.Number):
                priors[key] = val
            else:
                priors[key] = model.Var(val[0], val[1])

        return priors
Example #16
    def __init__(self, vars, order='random', model=None):

        model = pm.modelcontext(model)

        self.dim = sum(v.dsize for v in vars)

        if order == 'random':
            self.shuffle_dims = True
            self.order = list(range(self.dim))
        else:
            if sorted(order) != list(range(self.dim)):
                raise ValueError('Argument \'order\' has to be a permutation')
            self.shuffle_dims = False
            self.order = order

        if not all([v.dtype in pm.discrete_types for v in vars]):
            raise ValueError(
                'All variables must be binary for BinaryGibbsMetropolis')

        super(BinaryGibbsMetropolis, self).__init__(vars, [model.fastlogp])
Example #17
    def __init__(self, vars, proposal='uniform', order='random', model=None):

        model = pm.modelcontext(model)
        vars = pm.inputvars(vars)

        dimcats = []
        # The above variable is a list of pairs (aggregate dimension, number
        # of categories). For example, if vars = [x, y] with x being a 2-D
        # variable with M categories and y being a 3-D variable with N
        # categories, we will have dimcats = [(0, M), (1, M), (2, N), (3, N), (4, N)].
        for v in vars:
            distr = getattr(v.distribution, 'parent_dist', v.distribution)
            if isinstance(distr, pm.Categorical):
                k = draw_values([distr.k])[0]
            elif isinstance(distr, pm.Bernoulli) or (v.dtype in pm.bool_types):
                k = 2
            else:
                raise ValueError('All variables must be categorical or binary '
                                 'for CategoricalGibbsMetropolis')
            start = len(dimcats)
            dimcats += [(dim, k) for dim in range(start, start + v.dsize)]

        if order == 'random':
            self.shuffle_dims = True
            self.dimcats = dimcats
        else:
            if sorted(order) != list(range(len(dimcats))):
                raise ValueError('Argument \'order\' has to be a permutation')
            self.shuffle_dims = False
            self.dimcats = [dimcats[j] for j in order]

        if proposal == 'uniform':
            self.astep = self.astep_unif
        elif proposal == 'proportional':
            # Use the optimized "Metropolized Gibbs Sampler" described in Liu96.
            self.astep = self.astep_prop
        else:
            raise ValueError('Argument \'proposal\' should either be ' +
                    '\'uniform\' or \'proportional\'')

        super(CategoricalGibbsMetropolis, self).__init__(vars, [model.fastlogp])
Example #18
    def __init__(self, vars=None, S=None, proposal_dist=None, scaling=1.,
                 tune=True, tune_interval=100, model=None, mode=None, **kwargs):

        model = pm.modelcontext(model)

        if vars is None:
            vars = model.vars
        vars = pm.inputvars(vars)

        if S is None:
            S = np.ones(sum(v.dsize for v in vars))

        if proposal_dist is not None:
            self.proposal_dist = proposal_dist(S)
        elif S.ndim == 1:
            self.proposal_dist = NormalProposal(S)
        elif S.ndim == 2:
            self.proposal_dist = MultivariateNormalProposal(S)
        else:
            raise ValueError("Invalid rank for variance: %s" % S.ndim)

        self.scaling = np.atleast_1d(scaling).astype('d')
        self.tune = tune
        self.tune_interval = tune_interval
        self.steps_until_tune = tune_interval
        self.accepted = 0

        # Determine type of variables
        self.discrete = np.concatenate(
            [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
        self.any_discrete = self.discrete.any()
        self.all_discrete = self.discrete.all()

        self.mode = mode

        shared = pm.make_shared_replacements(vars, model)
        self.delta_logp = delta_logp(model.logpt, vars, shared)
        super().__init__(vars, shared)
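The S.ndim dispatch above means that passing a full covariance matrix selects MultivariateNormalProposal. A hedged sketch, assuming this is PyMC3's Metropolis step and using a hypothetical two-dimensional model:

import numpy as np
import pymc3 as pm

with pm.Model():
    x = pm.Normal('x', mu=0.0, sigma=1.0, shape=2)
    S = np.array([[1.0, 0.5],
                  [0.5, 2.0]])           # 2-D S -> MultivariateNormalProposal
    step = pm.Metropolis(S=S)
    trace = pm.sample(3000, tune=1000, step=step)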
Example #19
def sample_numpyro_nuts(
    draws=1000,
    tune=1000,
    chains=4,
    target_accept=0.8,
    random_seed=10,
    model=None,
    progress_bar=True,
    keep_untransformed=False,
):
    model = modelcontext(model)

    seed = jax.random.PRNGKey(random_seed)

    rv_names = [rv.name for rv in model.value_vars]
    init_state = [model.initial_point[rv_name] for rv_name in rv_names]
    init_state_batched = jax.tree_map(
        lambda x: np.repeat(x[None, ...], chains, axis=0), init_state)
    init_state_batched_at = [at.as_tensor(v) for v in init_state_batched]

    nuts_inputs = sorted(
        [
            v for v in graph_inputs([model.logpt])
            if not isinstance(v, Constant)
        ],
        key=lambda x: isinstance(x, SharedVariable),
    )
    map_seed = jax.random.split(seed, chains)
    numpyro_samples = NumPyroNUTS(
        nuts_inputs,
        [model.logpt],
        target_accept=target_accept,
        draws=draws,
        tune=tune,
        chains=chains,
        seed=map_seed,
        progress_bar=progress_bar,
    )(*init_state_batched_at)

    # Un-transform the transformed variables in JAX
    sample_outputs = []
    for i, (value_var, rv_samples) in enumerate(
            zip(model.value_vars, numpyro_samples[:-1])):
        rv = model.values_to_rvs[value_var]
        transform = getattr(value_var.tag, "transform", None)
        if transform is not None:
            untrans_value_var = transform.backward(rv, rv_samples)
            untrans_value_var.name = rv.name
            sample_outputs.append(untrans_value_var)

            if keep_untransformed:
                rv_samples.name = value_var.name
                sample_outputs.append(rv_samples)
        else:
            rv_samples.name = rv.name
            sample_outputs.append(rv_samples)

    print("Compiling...")

    tic1 = pd.Timestamp.now()
    _sample = compile_rv_inplace(
        [],
        sample_outputs + [numpyro_samples[-1]],
        allow_input_downcast=True,
        on_unused_input="ignore",
        accept_inplace=True,
        mode="JAX",
    )
    tic2 = pd.Timestamp.now()

    print("Compilation time = ", tic2 - tic1)

    print("Sampling...")

    *mcmc_samples, leapfrogs_taken = _sample()
    tic3 = pd.Timestamp.now()

    print("Sampling time = ", tic3 - tic2)

    posterior = {k.name: v for k, v in zip(sample_outputs, mcmc_samples)}

    az_trace = az.from_dict(posterior=posterior)

    return az_trace
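A hedged usage sketch for the JAX-backed sampler above. The module path (pymc3.sampling_jax) and the exact return type depend on the PyMC3/PyMC version, since this comes from an experimental, fast-moving part of the library; the toy model is hypothetical.

import numpy as np
import pymc3 as pm
import pymc3.sampling_jax as sampling_jax   # experimental module; path may vary

with pm.Model():
    mu = pm.Normal("mu", mu=0.0, sigma=1.0)
    pm.Normal("obs", mu=mu, sigma=1.0, observed=np.random.randn(100))
    az_trace = sampling_jax.sample_numpyro_nuts(draws=1000, tune=1000, chains=2)

# az_trace is an ArviZ object built with az.from_dict, as in the code above.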
Example #20
def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None,
              random_seed=-1, progressbar=True, **kwargs):
    """Initialize and sample from posterior of a continuous model.

    This is a convenience function. NUTS convergence and sampling speed is extremely
    dependent on the choice of mass/scaling matrix. In our experience, using ADVI
    to estimate a diagonal covariance matrix and using this as the scaling matrix
    produces robust results over a wide class of continuous models.

    Parameters
    ----------
    init : str {'ADVI', 'ADVI_MAP', 'MAP', 'NUTS'}
        Initialization method to use.
        * ADVI : Run ADVI to estimate posterior mean and diagonal covariance matrix.
        * ADVI_MAP: Initialize ADVI with MAP and use MAP as starting point.
        * MAP : Use the MAP as starting point.
        * NUTS : Run NUTS and estimate posterior mean and covariance matrix.
    njobs : int
        Number of parallel jobs to start.
    n_init : int
        Number of iterations of initializer
        If 'ADVI', number of iterations, if 'NUTS', number of draws.
    model : Model (optional if in `with` context)
    progressbar : bool
        Whether or not to display a progressbar for advi sampling.
    **kwargs : keyword arguments
        Extra keyword arguments are forwarded to pymc3.NUTS.

    Returns
    -------
    start : pymc3.model.Point
        Starting point for sampler
    nuts_sampler : pymc3.step_methods.NUTS
        Instantiated and initialized NUTS sampler object
    """

    model = pm.modelcontext(model)

    pm._log.info('Initializing NUTS using {}...'.format(init))

    random_seed = int(np.atleast_1d(random_seed)[0])

    if init is not None:
        init = init.lower()
    cb = [
        pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff='absolute'),
        pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff='relative'),
    ]
    if init == 'advi':
        approx = pm.fit(
            random_seed=random_seed,
            n=n_init, method='advi', model=model,
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window
        )  # type: pm.MeanField
        start = approx.sample(draws=njobs)
        stds = approx.gbij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds) ** 2
        if njobs == 1:
            start = start[0]
    elif init == 'advi_map':
        start = pm.find_MAP()
        approx = pm.MeanField(model=model, start=start)
        pm.fit(
            random_seed=random_seed,
            n=n_init, method=pm.ADVI.from_mean_field(approx),
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window
        )
        start = approx.sample(draws=njobs)
        stds = approx.gbij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds) ** 2
        if njobs == 1:
            start = start[0]
    elif init == 'map':
        start = pm.find_MAP()
        cov = pm.find_hessian(point=start)
    elif init == 'nuts':
        init_trace = pm.sample(draws=n_init, step=pm.NUTS(),
                               tune=n_init // 2,
                               random_seed=random_seed)
        cov = np.atleast_1d(pm.trace_cov(init_trace))
        start = np.random.choice(init_trace, njobs)
        if njobs == 1:
            start = start[0]
    else:
        raise NotImplementedError('Initializer {} is not supported.'.format(init))

    step = pm.NUTS(scaling=cov, is_cov=True, **kwargs)

    return start, step
Example #21
def advi_minibatch(vars=None,
                   start=None,
                   model=None,
                   n=5000,
                   n_mcsamples=1,
                   minibatch_RVs=None,
                   minibatch_tensors=None,
                   minibatches=None,
                   local_RVs=None,
                   observed_RVs=None,
                   encoder_params=[],
                   total_size=None,
                   optimizer=None,
                   learning_rate=.001,
                   epsilon=.1,
                   random_seed=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch ADVI with the meanfield
    approximation. Autoencoding variational inference is also supported.

    The log probability terms for mini-batches, corresponding to RVs in
    minibatch_RVs, are scaled to (total_size) / (the number of samples in each
    mini-batch), where total_size is an argument for the total data size.

    minibatch_tensors is a list of tensors (can be shared variables) to which
    mini-batch samples are set during the optimization. In most cases, these
    tensors are observations for RVs in the model.

    local_RVs and observed_RVs are used for autoencoding variational Bayes.
    Both of these RVs are associated with each of the given samples.
    The difference is that local_RVs are unknown and their posterior
    distributions are approximated.

    local_RVs are Ordered dict, whose keys and values are RVs and a tuple of
    two objects. The first is the theano expression of variational parameters
    (mean and log of std) of the approximate posterior, which are encoded from
    given samples by an arbitrary deterministic function, e.g., MLP. The other
    one is a scaling constant to be multiplied to the log probability term
    corresponding to the RV.

    observed_RVs are also Ordered dict with RVs as the keys, but whose values
    are only the scaling constant as in local_RVs. In this case, total_size is
    ignored.

    If local_RVs is None (thus not using autoencoder), the following two
    settings are equivalent:

    - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)])
    - minibatch_RVs=[rv], total_size=total_size

    where minibatch_size is minibatch_tensors[0].shape[0].

    The variational parameters and the parameters of the autoencoder are
    simultaneously optimized with given optimizer, which is a function that
    returns a dictionary of parameter updates as provided to Theano function.
    See the docstring of pymc3.variational.advi().

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same as the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    local_RVs : Ordered dict
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        an optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    """
    theano.config.compute_test_value = 'ignore'

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point
    check_discrete_rvs(vars)
    _check_minibatches(minibatch_tensors, minibatches)

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    ds = model.deterministics

    def get_transformed(v):
        if v in ds:
            return v.transformed
        return v

    local_RVs = OrderedDict([(get_transformed(v), (uw, s))
                             for v, (uw, s) in local_RVs.items()])

    # Get global variables
    global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))

    # Ordering for concatenation of random variables
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])

    # ELBO wrt variational parameters
    inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order)
    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model)
    replace = replace_g
    replace.update(replace_l)
    logp = theano.clone(logpt, replace, strict=False)
    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples,
                   random_seed)
    del logpt

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs,
                                                       global_order)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)

    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_
         for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)})
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        elbos[i] = e
        if i % (n // 10) == 0 and i > 0:
            avg_elbo = elbos[i - n // 10:i].mean()
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
Example #22
def plot_map_model_and_residuals(ax,
                                 data,
                                 map_point,
                                 t_grid,
                                 prediction,
                                 gp_list=None,
                                 model=None,
                                 **kwargs):
    """
    Plots the MAP model in data space together with the data, and plots the
    residuals of the data with respect to the MAP model prediction. All extra
    keyword arguments are passed to the matplotlib plot function.
    
    Parameters
    ----------
    ax : matplotlib.axes 
        Needs to be of shape ``(2, 1)``.
    data : :func:`~caustic.data.Data`
        Microlensing event data. 
    model : pymc3.Model
        PyMC3 model object which was used to obtain posterior samples in the
        trace.
    map_point : dict 
        Point in the parameter space for which we want to evaluate the 
        prediction tensor.
    t_grid : theano.tensor
        Times at which we want to evaluate model predictions. Shape 
        ``(n_bands, n_pts)``.
    prediction : theano.tensor
        Model prediction evaluated at ``t_grid``.
    gp_list : list
        List of ``exoplanet.gp.GP`` objects, one per each band. If these
        are provided the likelihood which is computed is the GP marginal
        likelihood.
    """
    model = pm.modelcontext(model)

    # Load data
    if model.is_standardized is True:
        tables = data.get_standardized_data()

    else:
        tables = data.get_standardized_data(rescale=False)

    # Evaluate the MAP model prediction on a fine grid
    n_pts_dense = T.shape(t_grid)[1].eval()
    n_bands = len(data.light_curves)

    prediction_eval = np.zeros((n_bands, n_pts_dense))

    if gp_list is None:
        with model:
            prediction_eval = xo.eval_in_model(prediction, map_point)

    else:
        with model:
            for n in range(n_bands):
                prediction_eval[n] = xo.eval_in_model(
                    gp_list[n].predict(t_grid[n]), map_point)

            # Add mean model to GP prediction
            prediction_eval += xo.eval_in_model(prediction, map_point)

    # Plot the MAP model prediction for each band on the dense grid
    for n in range(n_bands):  # iterate over bands
        ax[0].plot(
            t_grid[n].eval(),
            prediction_eval[n, :],
            color="C" + str(n),
            **kwargs,
        )

    # Plot data
    data.plot_standardized_data(ax[0], rescale=model.is_standardized)
    ax[0].set_xlabel(None)
    ax[1].set_xlabel("HJD - 2450000")
    ax[1].set_ylabel("Residuals")
    ax[0].set_xlim(T.min(t_grid).eval(), T.max(t_grid).eval())

    # Compute residuals with respect to the MAP model
    for n in range(n_bands):
        # Interpolate the MAP prediction onto the grid of observed times
        map_prediction_interp = np.interp(tables[n]["HJD"], t_grid[n].eval(),
                                          prediction_eval[n])

        residuals = tables[n]["flux"] - map_prediction_interp

        ax[1].errorbar(
            tables[n]["HJD"],
            residuals,
            tables[n]["flux_err"],
            fmt="o",
            color="C" + str(n),
            alpha=0.2,
            **kwargs,
        )
        ax[1].grid(True)
Example #23
def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
              random_seed=-1, progressbar=True, **kwargs):
    """Set up the mass matrix initialization for NUTS.

    NUTS convergence and sampling speed is extremely dependent on the
    choice of mass/scaling matrix. This function implements different
    methods for choosing or adapting the mass matrix.

    Parameters
    ----------
    init : str
        Initialization method to use.

        * auto : Choose a default initialization method automatically.
          Currently, this is `'jitter+adapt_diag'`, but this can change in
          the future. If you depend on the exact behaviour, choose an
          initialization method explicitly.
        * adapt_diag : Start with an identity mass matrix and then adapt
          a diagonal based on the variance of the tuning samples. All
          chains use the test value (usually the prior mean) as starting
          point.
        * jitter+adapt_diag : Same as `adapt_diag`, but add uniform jitter
          in [-1, 1] to the starting point in each chain.
        * advi+adapt_diag : Run ADVI and then adapt the resulting diagonal
          mass matrix based on the sample variance of the tuning samples.
        * advi+adapt_diag_grad : Run ADVI and then adapt the resulting
          diagonal mass matrix based on the variance of the gradients
          during tuning. This is **experimental** and might be removed
          in a future release.
        * advi : Run ADVI to estimate posterior mean and diagonal mass
          matrix.
        * advi_map: Initialize ADVI with MAP and use MAP as starting point.
        * map : Use the MAP as starting point. This is discouraged.
        * nuts : Run NUTS and estimate posterior mean and mass matrix from
          the trace.
    njobs : int
        Number of parallel jobs to start.
    n_init : int
        Number of iterations of initializer
        If 'ADVI', number of iterations, if 'nuts', number of draws.
    model : Model (optional if in `with` context)
    progressbar : bool
        Whether or not to display a progressbar for advi sampling.
    **kwargs : keyword arguments
        Extra keyword arguments are forwarded to pymc3.NUTS.

    Returns
    -------
    start : pymc3.model.Point
        Starting point for sampler
    nuts_sampler : pymc3.step_methods.NUTS
        Instantiated and initialized NUTS sampler object
    """
    model = pm.modelcontext(model)

    vars = kwargs.get('vars', model.vars)
    if set(vars) != set(model.vars):
        raise ValueError('Must use init_nuts on all variables of a model.')
    if not pm.model.all_continuous(vars):
        raise ValueError('init_nuts can only be used for models with only '
                         'continuous variables.')

    if not isinstance(init, str):
        raise TypeError('init must be a string.')

    if init is not None:
        init = init.lower()

    if init == 'auto':
        init = 'jitter+adapt_diag'

    pm._log.info('Initializing NUTS using {}...'.format(init))

    random_seed = int(np.atleast_1d(random_seed)[0])

    cb = [
        pm.callbacks.CheckParametersConvergence(
            tolerance=1e-2, diff='absolute'),
        pm.callbacks.CheckParametersConvergence(
            tolerance=1e-2, diff='relative'),
    ]

    if init == 'adapt_diag':
        start = [model.test_point] * njobs
        mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
        var = np.ones_like(mean)
        potential = quadpotential.QuadPotentialDiagAdapt(
            model.ndim, mean, var, 10)
        if njobs == 1:
            start = start[0]
    elif init == 'jitter+adapt_diag':
        start = []
        for _ in range(njobs):
            mean = {var: val.copy() for var, val in model.test_point.items()}
            for val in mean.values():
                val[...] += 2 * np.random.rand(*val.shape) - 1
            start.append(mean)
        mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
        var = np.ones_like(mean)
        potential = quadpotential.QuadPotentialDiagAdapt(
            model.ndim, mean, var, 10)
        if njobs == 1:
            start = start[0]
    elif init == 'advi+adapt_diag_grad':
        approx = pm.fit(
            random_seed=random_seed,
            n=n_init, method='advi', model=model,
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window,
        )  # type: pm.MeanField
        start = approx.sample(draws=njobs)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds) ** 2
        mean = approx.bij.rmap(approx.mean.get_value())
        mean = model.dict_to_array(mean)
        weight = 50
        potential = quadpotential.QuadPotentialDiagAdaptGrad(
            model.ndim, mean, cov, weight)
        if njobs == 1:
            start = start[0]
    elif init == 'advi+adapt_diag':
        approx = pm.fit(
            random_seed=random_seed,
            n=n_init, method='advi', model=model,
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window,
        )  # type: pm.MeanField
        start = approx.sample(draws=njobs)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds) ** 2
        mean = approx.bij.rmap(approx.mean.get_value())
        mean = model.dict_to_array(mean)
        weight = 50
        potential = quadpotential.QuadPotentialDiagAdapt(
            model.ndim, mean, cov, weight)
        if njobs == 1:
            start = start[0]
    elif init == 'advi':
        approx = pm.fit(
            random_seed=random_seed,
            n=n_init, method='advi', model=model,
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window
        )  # type: pm.MeanField
        start = approx.sample(draws=njobs)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds) ** 2
        potential = quadpotential.QuadPotentialDiag(cov)
        if njobs == 1:
            start = start[0]
    elif init == 'advi_map':
        start = pm.find_MAP()
        approx = pm.MeanField(model=model, start=start)
        pm.fit(
            random_seed=random_seed,
            n=n_init, method=pm.KLqp(approx),
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window
        )
        start = approx.sample(draws=njobs)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds) ** 2
        potential = quadpotential.QuadPotentialDiag(cov)
        if njobs == 1:
            start = start[0]
    elif init == 'map':
        start = pm.find_MAP()
        cov = pm.find_hessian(point=start)
        start = [start] * njobs
        potential = quadpotential.QuadPotentialFull(cov)
        if njobs == 1:
            start = start[0]
    elif init == 'nuts':
        init_trace = pm.sample(draws=n_init, step=pm.NUTS(),
                               tune=n_init // 2,
                               random_seed=random_seed)
        cov = np.atleast_1d(pm.trace_cov(init_trace))
        start = list(np.random.choice(init_trace, njobs))
        potential = quadpotential.QuadPotentialFull(cov)
        if njobs == 1:
            start = start[0]
    else:
        raise NotImplementedError('Initializer {} is not supported.'.format(init))

    step = pm.NUTS(potential=potential, **kwargs)

    return start, step
Example #24
def advi(vars=None,
         start=None,
         model=None,
         n=5000,
         accurate_elbo=False,
         optimizer=None,
         learning_rate=.001,
         epsilon=.1,
         random_seed=None,
         verbose=1,
         dp_par=None):
    """Perform automatic differentiation variational inference (ADVI).
    This function implements the meanfield ADVI, where the variational
    posterior distribution is assumed to be spherical Gaussian without
    correlation of parameters and fit to the true posterior distribution.
    The means and standard deviations of the variational posterior are referred
    to as variational parameters.
    The return value of this function is an :code:`ADVIfit` object, which has
    variational parameters. If you want to draw samples from the variational
    posterior, you need to pass the :code:`ADVIfit` object to
    :code:`pymc3.variational.sample_vp()`.
    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.
    The variational parameters are optimized with given optimizer, which is a
    function that returns a dictionary of parameter updates as provided to
    Theano function. If no optimizer is provided, optimization is performed
    with a modified version of adagrad, where only the last (n_window) gradient
    vectors are used to control the learning rate and older gradient vectors
    are ignored. n_window denotes the size of the time window and is fixed to 10.
    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If true, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when optimizer is given.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.
    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    'means' is the mean. 'stds' is the standard deviation.
    'elbo_vals' is the trace of ELBO values during optimization.
    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational Inference.
        arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    check_discrete_rvs(vars)

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars,
                              model,
                              n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(likeloss=-1 * elbo[0],
                        entroloss=-1 * elbo[1],
                        param=uw_shared,
                        dp_par=dp_par,
                        n_par=len(vars))
    f = theano.function(
        [], [uw_shared, tt.add(elbo[1], tt.sum(elbo[0], axis=0))],
        updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    try:
        for i in range(n):
            uw_i, e = f()
            elbos[i] = e
            if verbose and not i % (n // 10):
                if not i:
                    print('Iteration {0} [{1}%]: ELBO = {2}'.format(
                        i, 100 * i // n, e.round(2)))
                else:
                    avg_elbo = elbos[i - n // 10:i].mean()
                    print('Iteration {0} [{1}%]: Average ELBO = {2}'.format(
                        i, 100 * i // n, avg_elbo.round(2)))
    except KeyboardInterrupt:
        if verbose:
            elbos = elbos[:i]
            avg_elbo = elbos[i - n // 10:].mean()
            print('Interrupted at {0} [{1}%]: Average ELBO = {2}'.format(
                i, 100 * i // n, avg_elbo.round(2)))
    else:
        if verbose:
            avg_elbo = elbos[-n // 10:].mean()
            print('Finished [100%]: Average ELBO = {}'.format(
                avg_elbo.round(2)))

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
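This advi appears to be a modified variant (note the extra dp_par argument and the custom optimizer signature), so treat the following strictly as a hedged sketch: per the docstring, the returned ADVIFit is passed to pymc3.variational.sample_vp to draw a trace in the original parameter space. The toy model is hypothetical and fork-specific arguments are left at their defaults.

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0.0, sd=1.0)
    pm.Normal('obs', mu=mu, sd=1.0, observed=np.random.randn(100))

    v_params = advi(n=10000)                 # ADVIFit(means, stds, elbo_vals)
    trace = pm.variational.sample_vp(v_params, draws=1000)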
Example #25
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
                   minibatch_RVs=None, minibatch_tensors=None,
                   minibatches=None, local_RVs=None, observed_RVs=None,
                   encoder_params=None, total_size=None, optimizer=None,
                   learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch ADVI with the meanfield
    approximation. Autoencoding variational inference is also supported.

    The log probability terms for mini-batches, corresponding to RVs in
    minibatch_RVs, are scaled to (total_size) / (the number of samples in each
    mini-batch), where total_size is an argument for the total data size.

    minibatch_tensors is a list of tensors (can be shared variables) to which
    mini-batch samples are set during the optimization. In most cases, these
    tensors are observations for RVs in the model.

    local_RVs and observed_RVs are used for autoencoding variational Bayes.
    Both of these RVs are associated with each of the given samples.
    The difference is that local_RVs are unknown and their posterior
    distributions are approximated.

    local_RVs are Ordered dict, whose keys and values are RVs and a tuple of
    two objects. The first is the theano expression of variational parameters
    (mean and log of std) of the approximate posterior, which are encoded from
    given samples by an arbitrary deterministic function, e.g., MLP. The other
    one is a scaling constant to be multiplied to the log probability term
    corresponding to the RV.

    observed_RVs are also Ordered dict with RVs as the keys, but whose values
    are only the scaling constant as in local_RVs. In this case, total_size is
    ignored.

    If local_RVs is None (thus not using autoencoder), the following two
    settings are equivalent:

    - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)])
    - minibatch_RVs=[rv], total_size=total_size

    where minibatch_size is minibatch_tensors[0].shape[0].

    The variational parameters and the parameters of the autoencoder are
    simultaneously optimized with given optimizer, which is a function that
    returns a dictionary of parameter updates as provided to Theano function.
    See the docstring of pymc3.variational.advi().

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same as the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    local_RVs : Ordered dict
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        an optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    """
    theano.config.compute_test_value = 'ignore'

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point
    check_discrete_rvs(vars)
    _check_minibatches(minibatch_tensors, minibatches)
    
    if encoder_params is None:
        encoder_params = []

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    ds = model.deterministics

    def get_transformed(v):
        if v in ds:
            return v.transformed
        return v
    local_RVs = OrderedDict(
        [(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()]
    )

    # Get global variables
    global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))

    # Ordering for concatenation of random variables
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])

    # ELBO wrt variational parameters
    inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order)
    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model)
    replace = replace_g
    replace.update(replace_l)
    logp = theano.clone(logpt, replace, strict=False)
    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l,
                   n_mcsamples, random_seed)
    del logpt

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs,
                                                       global_order)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)
    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)}
    )
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        elbos[i] = e
        if n < 10:
            progress.set_description('ELBO = {:,.2f}'.format(elbos[i]))
        elif i % (n // 10) == 0 and i > 0:
            avg_elbo = elbos[i - n // 10:i].mean()
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
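
A minimal, hedged usage sketch for the mini-batch ADVI helper above. The toy
model, data, batch size, and generator name below are illustrative
assumptions, not part of the original example.

# Hedged sketch: mini-batch ADVI on a toy model (names such as `data`,
# `x_shared` and `create_minibatches` are illustrative assumptions).
import numpy as np
import theano
import pymc3 as pm

data = np.random.randn(1000)
batch_size = 100
x_shared = theano.shared(data[:batch_size])  # mini-batch tensor

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    y = pm.Normal('y', mu=mu, sd=1., observed=x_shared)

def create_minibatches():
    while True:
        idx = np.random.choice(len(data), batch_size, replace=False)
        yield [data[idx]]

result = advi_minibatch(
    model=model, n=2000,
    minibatch_RVs=[y], minibatch_tensors=[x_shared],
    minibatches=create_minibatches(), total_size=len(data))
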
def vaneylen19(
    name,
    fixed=False,
    multi=False,
    lower=None,
    upper=None,
    model=None,
    **kwargs,
):
    """The eccentricity distribution for small planets

    The mixture distribution fit by `Van Eylen et al. (2019)
    <https://arxiv.org/abs/1807.00549>`_ to a population of well-characterized
    small transiting planets observed by Kepler.

    Args:
        name (str): The name of the eccentricity variable.
        fixed (bool, optional): If ``True``, use the posterior median
            hyperparameters. Otherwise, marginalize over the parameters.
        multi (bool, optional): If ``True``, use the distribution for systems
            with multiple transiting planets. If ``False`` (default), use the
            distribution for systems with only one detected transiting planet.
        lower (float, optional): Restrict the eccentricity to be larger than
            this value.
        upper (float, optional): Restrict the eccentricity to be smaller than
            this value.

    Returns:
        The eccentricity distribution.

    """

    model = pm.modelcontext(model)
    add_citations_to_model(["vaneylen19"], model=model)

    sigma_gauss_mu = 0.049
    sigma_gauss_sd = 0.02
    sigma_rayleigh_mu = 0.26
    sigma_rayleigh_sd = 0.05
    if multi:
        frac_mu = 0.08
        frac_sd = 0.08
    else:
        frac_mu = 0.76
        frac_sd = 0.2

    with model:
        if lower is None and upper is None:
            ecc = UnitUniform(name, **kwargs)
        else:
            ecc = pm.Uniform(
                name,
                lower=0.0 if lower is None else lower,
                upper=1.0 if upper is None else upper,
                **kwargs,
            )

        with pm.Model(name=name):

            if fixed:
                sigma_gauss = sigma_gauss_mu
                sigma_rayleigh = sigma_rayleigh_mu
                frac = frac_mu
            else:

                bounded_normal = pm.Bound(pm.Normal, lower=0)
                sigma_gauss = bounded_normal(
                    "sigma_gauss",
                    mu=sigma_gauss_mu,
                    sd=sigma_gauss_sd,
                    testval=sigma_gauss_mu,
                )
                sigma_rayleigh = bounded_normal(
                    "sigma_rayleigh",
                    mu=sigma_rayleigh_mu,
                    sd=sigma_rayleigh_sd,
                    testval=sigma_rayleigh_mu,
                )
                frac = pm.Bound(pm.Normal, lower=0, upper=1)(
                    "frac", mu=frac_mu, sd=frac_sd, testval=frac_mu
                )

            gauss = pm.HalfNormal.dist(sigma=sigma_gauss)
            rayleigh = pm.Weibull.dist(
                alpha=2, beta=np.sqrt(2) * sigma_rayleigh
            )

            pm.Potential(
                "prior",
                pm.math.logaddexp(
                    tt.log(1 - frac) + gauss.logp(ecc),
                    tt.log(frac) + rayleigh.logp(ecc),
                ),
            )

        return ecc
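
A short, hedged usage sketch for the distribution defined above; the
surrounding model and the variable name are illustrative assumptions.

# Hedged sketch: eccentricity prior for a single transiting planet,
# marginalizing over the Van Eylen et al. (2019) mixture hyperparameters.
import pymc3 as pm

with pm.Model() as model:
    ecc = vaneylen19("ecc", multi=False, fixed=False)
    # ... the rest of the transit model would use `ecc` here
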
    def __init__(self, var, model=None, values=[0, 1]):
        model = pm.modelcontext(model)
        self.values = values
        self.i = 0
        super(SequentialScanDiscreteMetropolis, self).__init__([var], [model.fastlogp])
def kipping13(
    name, fixed=False, long=None, lower=None, upper=None, model=None, **kwargs
):
    """The beta eccentricity distribution fit by Kipping (2013)

    The beta distribution parameters fit by `Kipping (2013b)
    <https://arxiv.org/abs/1306.4982>`_.

    Args:
        name (str): The name of the eccentricity variable.
        fixed (bool, optional): If ``True``, use the posterior median
            hyperparameters. Otherwise, marginalize over the parameters.
        long (bool, optional): If ``True``, use the parameters for the long
            period fit. If ``False``, use the parameters for the short period
            fit. If not given, the parameters fit using the full dataset are
            used.
        lower (float, optional): Restrict the eccentricity to be larger than
            this value.
        upper (float, optional): Restrict the eccentricity to be smaller than
            this value.

    Returns:
        The eccentricity distribution.

    """
    model = pm.modelcontext(model)
    add_citations_to_model(["kipping13b"], model=model)

    if long is None:
        # If 'long' is not provided, use the fit for the parameters from the
        # full dataset
        alpha_mu = 1.12
        alpha_sd = 0.1
        beta_mu = 3.09
        beta_sd = 0.3
    else:
        # If 'long' is set, select either the long or short period model
        # parameters
        if long:
            alpha_mu = 1.12
            alpha_sd = 0.1
            beta_mu = 3.09
            beta_sd = 0.3
        else:
            alpha_mu = 0.697
            alpha_sd = 0.4
            beta_mu = 3.27
            beta_sd = 0.3

    with model:
        if fixed:
            # Use the posterior median parameters
            alpha = alpha_mu
            beta = beta_mu
        else:
            # Marginalize over the uncertainty on the parameters of the beta
            with pm.Model(name=name):
                bounded_normal = pm.Bound(pm.Normal, lower=0)
                alpha = bounded_normal(
                    "alpha", mu=alpha_mu, sd=alpha_sd, testval=alpha_mu
                )
                beta = bounded_normal(
                    "beta", mu=beta_mu, sd=beta_sd, testval=beta_mu
                )

        # Allow for upper and lower bounds
        if lower is not None or upper is not None:
            dist = pm.Bound(
                pm.Beta,
                lower=0.0 if lower is None else lower,
                upper=1.0 if upper is None else upper,
            )
            return dist(name, alpha=alpha, beta=beta, **kwargs)

        return pm.Beta(name, alpha=alpha, beta=beta, **kwargs)
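
A short, hedged usage sketch for the prior above; the surrounding model and
the chosen bound are illustrative assumptions.

# Hedged sketch: Kipping (2013) beta prior on eccentricity, using the
# long-period fit and an upper bound of 0.9.
import pymc3 as pm

with pm.Model() as model:
    ecc = kipping13("ecc", long=True, upper=0.9)
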
Exemple #29
0
    def __init__(
        self,
        coarse_models: List[Model],
        vars: Optional[list] = None,
        base_sampler="DEMetropolisZ",
        base_S: Optional = None,
        base_proposal_dist: Optional[Type[Proposal]] = None,
        base_scaling: Optional = None,
        tune: bool = True,
        base_tune_target: str = "lambda",
        base_tune_interval: int = 100,
        base_lamb: Optional = None,
        base_tune_drop_fraction: float = 0.9,
        model: Optional[Model] = None,
        mode: Optional = None,
        subsampling_rates: List[int] = 5,
        base_blocked: bool = False,
        variance_reduction: bool = False,
        store_Q_fine: bool = False,
        adaptive_error_model: bool = False,
        **kwargs,
    ) -> None:

        # this variable is used to identify MLDA objects which are
        # not in the finest level (i.e. child MLDA objects)
        self.is_child = kwargs.get("is_child", False)
        if not self.is_child:
            warnings.warn(
                "The MLDA implementation in PyMC3 is still immature. You should be particularly critical of its results."
            )

        if not isinstance(coarse_models, list):
            raise ValueError(
                "MLDA step method cannot use coarse_models if it is not a list"
            )
        if len(coarse_models) == 0:
            raise ValueError("MLDA step method was given an empty "
                             "list of coarse models. Give at least "
                             "one coarse model.")

        # assign internal state
        model = pm.modelcontext(model)
        self.model = model
        self.coarse_models = coarse_models
        self.model_below = self.coarse_models[-1]
        self.num_levels = len(self.coarse_models) + 1

        # set up variance reduction.
        self.variance_reduction = variance_reduction
        self.store_Q_fine = store_Q_fine

        # check that certain requirements hold
        # for the variance reduction feature to work
        if self.variance_reduction or self.store_Q_fine:
            if not hasattr(self.model, "Q"):
                raise AttributeError(
                    "Model given to MLDA does not contain "
                    "variable 'Q'. You need to include "
                    "the variable in the model definition "
                    "for variance reduction to work or "
                    "for storing the fine Q. "
                    "Use pm.Data() to define it.")
            if not isinstance(self.model.Q, tt.sharedvar.TensorSharedVariable):
                raise TypeError(
                    "The variable 'Q' in the model definition is not of type "
                    "'TensorSharedVariable'. Use pm.Data() to define the "
                    "variable.")

        if self.is_child and self.variance_reduction:
            # this is the subsampling rate applied to the current level
            # it is stored in the level above and transferred here
            self.subsampling_rate_above = kwargs.pop("subsampling_rate_above",
                                                     None)

        # set up adaptive error model
        self.adaptive_error_model = adaptive_error_model

        # check that certain requirements hold
        # for the adaptive error model feature to work
        if self.adaptive_error_model:
            if not hasattr(self.model_below, "mu_B"):
                raise AttributeError(
                    "Model below in hierarchy does not contain "
                    "variable 'mu_B'. You need to include "
                    "the variable in the model definition "
                    "for the adaptive error model to work. "
                    "Use pm.Data() to define it.")
            if not hasattr(self.model_below, "Sigma_B"):
                raise AttributeError(
                    "Model below in hierarchy does not contain "
                    "variable 'Sigma_B'. You need to include "
                    "the variable in the model definition "
                    "for the adaptive error model to work. "
                    "Use pm.Data() to define it.")
            if not (isinstance(self.model_below.mu_B,
                               tt.sharedvar.TensorSharedVariable)
                    and isinstance(self.model_below.Sigma_B,
                                   tt.sharedvar.TensorSharedVariable)):
                raise TypeError(
                    "At least one of the variables 'mu_B' and 'Sigma_B' "
                    "in the definition of the below model is not of type "
                    "'TensorSharedVariable'. Use pm.Data() to define those "
                    "variables.")

            # this object is used to recursively update the mean and
            # variance of the bias correction given new differences
            # between levels
            self.bias = RecursiveSampleMoments(
                self.model_below.mu_B.get_value(),
                self.model_below.Sigma_B.get_value())

            # this list holds the bias objects from all levels
            # it is gradually constructed when MLDA objects are
            # created and then shared between all levels
            self.bias_all = kwargs.pop("bias_all", None)
            if self.bias_all is None:
                self.bias_all = [self.bias]
            else:
                self.bias_all.append(self.bias)

            # variables used for adaptive error model
            self.last_synced_output_diff = None
            self.adaptation_started = False

        # set up subsampling rates.
        if isinstance(subsampling_rates, int):
            self.subsampling_rates = [subsampling_rates] * len(
                self.coarse_models)
        else:
            if len(subsampling_rates) != len(self.coarse_models):
                raise ValueError(
                    f"List of subsampling rates needs to have the same "
                    f"length as list of coarse models but the lengths "
                    f"were {len(subsampling_rates)}, {len(self.coarse_models)}"
                )
            self.subsampling_rates = subsampling_rates

        self.subsampling_rate = self.subsampling_rates[-1]
        self.subchain_selection = None

        # set up base sampling
        self.base_sampler = base_sampler

        # VR is not compatible with compound base samplers so an automatic
        # conversion to a block sampler happens here if needed
        if self.variance_reduction and self.base_sampler == "Metropolis" and not base_blocked:
            warnings.warn(
                "Variance reduction is not compatible with non-blocked (compound) samplers. "
                "Automatically switching to a blocked Metropolis sampler.")
            self.base_blocked = True
        else:
            self.base_blocked = base_blocked

        self.base_S = base_S
        self.base_proposal_dist = base_proposal_dist

        if base_scaling is None:
            if self.base_sampler == "Metropolis":
                self.base_scaling = 1.0
            else:
                self.base_scaling = 0.001
        else:
            self.base_scaling = float(base_scaling)

        self.tune = tune
        if not self.tune and self.base_sampler == "DEMetropolisZ":
            raise ValueError(
                "The argument tune was set to False while using "
                "a 'DEMetropolisZ' base sampler. 'DEMetropolisZ' "
                "tune needs to be True.")

        self.base_tune_target = base_tune_target
        self.base_tune_interval = base_tune_interval
        self.base_lamb = base_lamb
        self.base_tune_drop_fraction = float(base_tune_drop_fraction)
        self.base_tuning_stats = None

        self.mode = mode

        # Process model variables
        if vars is None:
            vars = model.vars
        vars = pm.inputvars(vars)
        self.vars = vars
        self.var_names = [var.name for var in self.vars]

        self.accepted = 0

        # Construct theano function for current-level model likelihood
        # (for use in acceptance)
        shared = pm.make_shared_replacements(vars, model)
        self.delta_logp = delta_logp_inverse(model.logpt, vars, shared)

        # Construct theano function for below-level model likelihood
        # (for use in acceptance)
        model_below = pm.modelcontext(self.model_below)
        vars_below = [
            var for var in model_below.vars if var.name in self.var_names
        ]
        vars_below = pm.inputvars(vars_below)
        shared_below = pm.make_shared_replacements(vars_below, model_below)
        self.delta_logp_below = delta_logp(model_below.logpt, vars_below,
                                           shared_below)

        super().__init__(vars, shared)

        # initialise complete step method hierarchy
        if self.num_levels == 2:
            with self.model_below:
                # make sure the correct variables are selected from model_below
                vars_below = [
                    var for var in self.model_below.vars
                    if var.name in self.var_names
                ]

                # create kwargs
                if self.variance_reduction:
                    base_kwargs = {
                        "mlda_subsampling_rate_above": self.subsampling_rate,
                        "mlda_variance_reduction": True,
                    }
                else:
                    base_kwargs = {}

                if self.base_sampler == "Metropolis":
                    # MetropolisMLDA sampler in base level (level=0), targeting self.model_below
                    self.step_method_below = pm.MetropolisMLDA(
                        vars=vars_below,
                        proposal_dist=self.base_proposal_dist,
                        S=self.base_S,
                        scaling=self.base_scaling,
                        tune=self.tune,
                        tune_interval=self.base_tune_interval,
                        model=None,
                        mode=self.mode,
                        blocked=self.base_blocked,
                        **base_kwargs,
                    )
                else:
                    # DEMetropolisZMLDA sampler in base level (level=0), targeting self.model_below
                    self.step_method_below = pm.DEMetropolisZMLDA(
                        vars=vars_below,
                        S=self.base_S,
                        proposal_dist=self.base_proposal_dist,
                        lamb=self.base_lamb,
                        scaling=self.base_scaling,
                        tune=self.base_tune_target,
                        tune_interval=self.base_tune_interval,
                        tune_drop_fraction=self.base_tune_drop_fraction,
                        model=None,
                        mode=self.mode,
                        **base_kwargs,
                    )
        else:
            # drop the last coarse model
            coarse_models_below = self.coarse_models[:-1]
            subsampling_rates_below = self.subsampling_rates[:-1]

            with self.model_below:
                # make sure the correct variables are selected from model_below
                vars_below = [
                    var for var in self.model_below.vars
                    if var.name in self.var_names
                ]

                # create kwargs
                if self.variance_reduction:
                    mlda_kwargs = {
                        "is_child": True,
                        "subsampling_rate_above": self.subsampling_rate,
                    }
                else:
                    mlda_kwargs = {"is_child": True}
                if self.adaptive_error_model:
                    mlda_kwargs = {
                        **mlda_kwargs,
                        **{
                            "bias_all": self.bias_all
                        }
                    }

                # MLDA sampler in some intermediate level, targeting self.model_below
                self.step_method_below = pm.MLDA(
                    vars=vars_below,
                    base_S=self.base_S,
                    base_sampler=self.base_sampler,
                    base_proposal_dist=self.base_proposal_dist,
                    base_scaling=self.base_scaling,
                    tune=self.tune,
                    base_tune_target=self.base_tune_target,
                    base_tune_interval=self.base_tune_interval,
                    base_lamb=self.base_lamb,
                    base_tune_drop_fraction=self.base_tune_drop_fraction,
                    model=None,
                    mode=self.mode,
                    subsampling_rates=subsampling_rates_below,
                    coarse_models=coarse_models_below,
                    base_blocked=self.base_blocked,
                    variance_reduction=self.variance_reduction,
                    store_Q_fine=False,
                    adaptive_error_model=self.adaptive_error_model,
                    **mlda_kwargs,
                )

        # instantiate the recursive DA proposal.
        # this is the main proposal used for
        # all levels (Recursive Delayed Acceptance)
        # (except for level 0 where the step method is MetropolisMLDA
        # or DEMetropolisZMLDA - not MLDA)
        self.proposal_dist = RecursiveDAProposal(self.step_method_below,
                                                 self.model_below, self.tune,
                                                 self.subsampling_rate)

        # set up data types of stats.
        if isinstance(self.step_method_below, MLDA):
            # get the stat types from the level below if that level is MLDA
            self.stats_dtypes = self.step_method_below.stats_dtypes

        else:
            # otherwise, set it up from scratch.
            self.stats_dtypes = [{
                "accept": np.float64,
                "accepted": np.bool,
                "tune": np.bool
            }]

            if isinstance(self.step_method_below, MetropolisMLDA):
                self.stats_dtypes.append({"base_scaling": np.float64})
            elif isinstance(self.step_method_below, DEMetropolisZMLDA):
                self.stats_dtypes.append({
                    "base_scaling": np.float64,
                    "base_lambda": np.float64
                })
            elif isinstance(self.step_method_below, CompoundStep):
                for method in self.step_method_below.methods:
                    if isinstance(method, MetropolisMLDA):
                        self.stats_dtypes.append({"base_scaling": np.float64})
                    elif isinstance(method, DEMetropolisZMLDA):
                        self.stats_dtypes.append({
                            "base_scaling": np.float64,
                            "base_lambda": np.float64
                        })

        # initialise necessary variables for doing variance reduction
        if self.variance_reduction:
            self.sub_counter = 0
            self.Q_diff = []
            if self.is_child:
                self.Q_reg = [np.nan] * self.subsampling_rate_above
            if self.num_levels == 2:
                self.Q_base_full = []
            if not self.is_child:
                for level in range(self.num_levels - 1, 0, -1):
                    self.stats_dtypes[0][f"Q_{level}_{level - 1}"] = object
                self.stats_dtypes[0]["Q_0"] = object

        # initialise necessary variables for doing variance reduction or storing fine Q
        if self.variance_reduction or self.store_Q_fine:
            self.Q_last = np.nan
            self.Q_diff_last = np.nan
        if self.store_Q_fine and not self.is_child:
            self.stats_dtypes[0][f"Q_{self.num_levels - 1}"] = object
Exemple #30
0
def get_args_for_theano_function(point=None, model=None):
    model = pm.modelcontext(model)
    if point is None:
        point = model.test_point
    return [point[k.name] for k in model.vars]
Exemple #31
0
def get_theano_function_for_var(var, model=None, **kwargs):
    model = pm.modelcontext(model)
    kwargs["on_unused_input"] = kwargs.get("on_unused_input", "ignore")
    return theano.function(model.vars, var, **kwargs)
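
A hedged sketch combining the two helpers above: compile the model's log
probability and evaluate it at the test point. The toy model is an
illustrative assumption.

# Hedged sketch: evaluate model.logpt at the test point using the helpers.
import pymc3 as pm

with pm.Model() as model:
    x = pm.Normal("x", mu=0.0, sd=1.0)

logp_fn = get_theano_function_for_var(model.logpt, model=model)
args = get_args_for_theano_function(model=model)  # defaults to model.test_point
print(logp_fn(*args))
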
Exemple #32
0
def sample_vp(vparams,
              draws=1000,
              model=None,
              local_RVs=None,
              random_seed=None,
              hide_transformed=True,
              progressbar=True):
    """Draw samples from variational posterior.
    Parameters
    ----------
    vparams : dict or pymc3.variational.ADVIFit
        Estimated variational parameters of the model.
    draws : int
        Number of random samples.
    model : pymc3.Model
        Probabilistic model.
    random_seed : int or None
        Seed of random number generator.  None to use current seed.
    hide_transformed : bool
        If False, transformed variables are also sampled. Default is True.
    Returns
    -------
    trace : pymc3.backends.base.MultiTrace
        Samples drawn from the variational posterior.
    """
    model = pm.modelcontext(model)

    if isinstance(vparams, ADVIFit):
        vparams = {'means': vparams.means, 'stds': vparams.stds}

    ds = model.deterministics
    get_transformed = lambda v: v if v not in ds else v.transformed
    rvs = lambda x: [get_transformed(v) for v in x] if x is not None else []

    global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs)))

    # Make dict for replacements of random variables
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)
    updates = {}
    for v in global_RVs:
        u = theano.shared(vparams['means'][str(v)]).ravel()
        w = theano.shared(vparams['stds'][str(v)]).ravel()
        n = r.normal(size=u.tag.test_value.shape)
        updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)})

    if local_RVs is not None:
        ds = model.deterministics
        get_transformed = lambda v: v if v not in ds else v.transformed
        for v_, (uw, _) in local_RVs.items():
            v = get_transformed(v_)
            u = uw[0].ravel()
            w = uw[1].ravel()
            n = r.normal(size=u.tag.test_value.shape)
            updates.update(
                {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)})

    # Replace some nodes of the graph with variational distributions
    vars = model.free_RVs
    samples = theano.clone(vars, updates)
    f = theano.function([], samples)

    # Random variables which will be sampled
    if hide_transformed:
        vars_sampled = [v for v in model.unobserved_RVs
                        if not str(v).endswith('_')]
    else:
        vars_sampled = [v for v in model.unobserved_RVs]

    varnames = [str(var) for var in model.unobserved_RVs]
    trace = pm.sampling.NDArray(model=model, vars=vars_sampled)
    trace.setup(draws=draws, chain=0)

    range_ = trange(draws) if progressbar else range(draws)

    for i in range_:
        # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...}
        point = {varname: value for varname, value in zip(varnames, f())}
        trace.record(point)

    return MultiTrace([trace])
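
A hedged sketch of how the sampler above is typically driven: fit ADVI with
the old functional interface (which returns an ADVIFit named tuple, as used
elsewhere in these examples) and then draw from the variational posterior.
The toy model and data are illustrative assumptions.

# Hedged sketch: ADVI fit followed by sampling from the variational posterior.
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=[0.1, -0.3, 0.2])
    v_params = pm.variational.advi(n=5000)
    trace = sample_vp(v_params, draws=500, model=model)
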
Exemple #33
0
def fit(
    n=10000,
    local_rv=None,
    method="advi",
    model=None,
    random_seed=None,
    start=None,
    inf_kwargs=None,
    **kwargs,
):
    r"""Handy shortcut for using inference methods in functional way

    Parameters
    ----------
    n: `int`
        number of iterations
    local_rv: dict[var->tuple]
        mapping {model_variable -> approx params}
        Local Vars are used for Autoencoding Variational Bayes
        See (AEVB; Kingma and Welling, 2014) for details
    method: str or :class:`Inference`
        string name is case insensitive in:

        -   'advi'  for ADVI
        -   'fullrank_advi'  for FullRankADVI
        -   'svgd'  for Stein Variational Gradient Descent
        -   'asvgd'  for Amortized Stein Variational Gradient Descent
        -   'nfvi'  for Normalizing Flow with default `scale-loc` flow
        -   'nfvi=<formula>'  for Normalizing Flow using formula

    model: :class:`Model`
        PyMC3 model for inference
    random_seed: None or int
        leave None to use package global RandomStream or other
        valid value to create instance specific one
    inf_kwargs: dict
        additional kwargs passed to :class:`Inference`
    start: `Point`
        starting point for inference

    Other Parameters
    ----------------
    score: bool
            evaluate loss on each iteration or not
    callbacks: list[function: (Approximation, losses, i) -> None]
        calls provided functions after each iteration step
    progressbar: bool
        whether to show progressbar or not
    obj_n_mc: `int`
        Number of monte carlo samples used for approximation of objective gradients
    tf_n_mc: `int`
        Number of monte carlo samples used for approximation of test function gradients
    obj_optimizer: function (grads, params) -> updates
        Optimizer that is used for objective params
    test_optimizer: function (grads, params) -> updates
        Optimizer that is used for test function params
    more_obj_params: `list`
        Add custom params for objective optimizer
    more_tf_params: `list`
        Add custom params for test function optimizer
    more_updates: `dict`
        Add custom updates to resulting updates
    total_grad_norm_constraint: `float`
        Bounds gradient norm, prevents exploding gradient problem
    fn_kwargs: `dict`
        Add kwargs to aesara.function (e.g. `{'profile': True}`)
    more_replacements: `dict`
        Apply custom replacements before calculating gradients

    Returns
    -------
    :class:`Approximation`
    """
    if inf_kwargs is None:
        inf_kwargs = dict()
    else:
        inf_kwargs = inf_kwargs.copy()
    if local_rv is not None:
        inf_kwargs["local_rv"] = local_rv
    if random_seed is not None:
        inf_kwargs["random_seed"] = random_seed
    if start is not None:
        inf_kwargs["start"] = start
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(advi=ADVI,
                   fullrank_advi=FullRankADVI,
                   svgd=SVGD,
                   asvgd=ASVGD,
                   nfvi=NFVI)
    if isinstance(method, str):
        method = method.lower()
        if method.startswith("nfvi="):
            formula = method[5:]
            inference = NFVI(formula, **inf_kwargs)
        elif method in _select:

            inference = _select[method](model=model, **inf_kwargs)
        else:
            raise KeyError(
                f"method should be one of {set(_select.keys())} or Inference instance"
            )
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError(
            f"method should be one of {set(_select.keys())} or Inference instance"
        )
    return inference.fit(n, **kwargs)
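
A minimal, hedged sketch of the functional interface above: fit an
approximation and then sample from it. The toy model and data are
illustrative assumptions.

# Hedged sketch: pm.fit() followed by sampling from the approximation.
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal("mu", mu=0.0, sigma=10.0)
    pm.Normal("y", mu=mu, sigma=1.0, observed=[0.2, -0.1, 0.4])
    approx = pm.fit(n=10000, method="advi")
    trace = approx.sample(1000)
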
Exemple #34
0
def advi_minibatch(vars=None,
                   start=None,
                   model=None,
                   n=5000,
                   n_mcsamples=1,
                   minibatch_RVs=None,
                   minibatch_tensors=None,
                   minibatches=None,
                   global_RVs=None,
                   local_RVs=None,
                   observed_RVs=None,
                   encoder_params=None,
                   total_size=None,
                   optimizer=None,
                   learning_rate=.001,
                   epsilon=.1,
                   random_seed=None,
                   mode=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch automatic differentiation variational
    inference (ADVI; Kucukelbir et al., 2015) with the meanfield
    approximation. Autoencoding variational Bayes (AEVB; Kingma and Welling,
    2014) is also supported.

    For explanation, we classify random variables in probabilistic models into
    three types. Observed random variables
    :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations.
    Each :math:`\mathbf{y}_{i}` can be a set of observed random variables,
    i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where
    :math:`V_{o}` is the number of types of observed random variables
    in the model.

    The next ones are global random variables
    :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
    the probabilities for all observed samples.

    The last ones are local random variables
    :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where
    :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`.
    These RVs are used only in AEVB.

    The goal of ADVI is to approximate the posterior distribution
    :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior
    :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms
    are normal distributions (mean-field approximation).

    :math:`q(\Theta)` is parametrized with its means and standard deviations.
    These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
    a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
    each observation. Therefore these parameters are denoted as
    :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters
    of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a
    multilayer perceptron or convolutional neural network.

    In addition to :math:`\\xi(\cdot)`, we can also include deterministic
    mappings for the likelihood of observations. We denote the parameters of
    the deterministic mappings as :math:`\eta`. An example of such mappings is
    the deconvolutional neural network used in the convolutional VAE example
    in the PyMC3 notebook directory.

    This function maximizes the evidence lower bound (ELBO)
    :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows:

    .. math::

        {\cal L}(\gamma,\\nu,\eta) & =
        \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
        \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
        \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
        \\right]\\right] \\\\ &
        - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right]
        - \mathbf{c}_{l}\sum_{i=1}^{N}
            KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right],

    where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence

    .. math::

        KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv,

    :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
    More precisely, we can write each of the terms in ELBO as follows:

    .. math::

        \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
        \sum_{k=1}^{V_{o}}c_{o}^{k}
            \log p(\mathbf{y}_{i}^{k}|
                   {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\
        \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = &
        \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
            q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\
        \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right] & = &
        \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
            q(\mathbf{z}_{i}^{k})||
            p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right],

    where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v`
    in the directed acyclic graph of the model.

    When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
    set to :math:`N/M`, where :math:`M` is the number of observations in each
    mini-batch. Another weighting scheme was proposed in
    (Blundell et al., 2015) for accelerating model fitting.

    For working with ADVI, we need to give the probabilistic model
    (:code:`model`), the three types of RVs (:code:`observed_RVs`,
    :code:`global_RVs` and :code:`local_RVs`), the tensors to which
    mini-batched samples are supplied (:code:`minibatches`) and
    parameters of deterministic mappings :math:`\\xi` and :math:`\eta`
    (:code:`encoder_params`) as input arguments.

    :code:`observed_RVs` is an :code:`OrderedDict` of the form
    :code:`{y_k: c_k}`, where :code:`y_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{o}^{k}`) and it can be a
    shared variable.

    :code:`global_RVs` is an :code:`OrderedDict` of the form
    :code:`{t_k: c_k}`, where :code:`t_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{g}^{k}`) and it can be a
    shared variable.

    :code:`local_RVs` is an :code:`OrderedDict` of the form
    :code:`{z_k: ((m_k, s_k), c_k)}`, where :code:`z_k` is a random variable
    defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{l}^{k}`)
    and it can be a shared variable. :code:`(m_k, s_k)` is a pair of tensors
    of means and log standard deviations of the variational distribution;
    samples drawn from the variational distribution replace :code:`z_k`.
    It should be noted that if :code:`z_k` has a transformation that changes
    the dimension (e.g., StickBreakingTransform), the variational distribution
    must have the same dimension. For example, if :code:`z_k` follows a
    Dirichlet distribution with :code:`p` choices, :code:`m_k` and
    :code:`s_k` have the shape :code:`(n_samples_in_minibatch, p - 1)`.

    :code:`minibatch_tensors` is a list of tensors (can be shared variables)
    to which mini-batch samples are set during the optimization.
    These tensors are observations (:code:`observed=`) in :code:`observed_RVs`.

    :code:`minibatches` is a generator of a list of :code:`numpy.ndarray`.
    Each item of the list will be set to tensors in :code:`minibatch_tensors`.

    :code:`encoder_params` is a list of shared variables of the parameters
    :math:`\\nu` and :math:`\eta`. We do not need to include the variational
    parameters of the global variables, :math:`\gamma`, because these are
    automatically created and updated in this function.

    The following is a list of example notebooks using advi_minibatch:

    - docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb
    - docs/source/notebooks/bayesian_neural_network_advi.ipynb
    - docs/source/notebooks/convolutional_vae_keras_advi.ipynb
    - docs/source/notebooks/gaussian-mixture-model-advi.ipynb
    - docs/source/notebooks/lda-advi-aevb.ipynb

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both local_RVs and observed_RVs must be
        None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same as the number of
        tensors in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description.
    global_RVs : Ordered dict or None
        Include a scaling constant for the corresponding RV. See the above
        description. If :code:`None`, it is set to
        :code:`{v: 1 for v in grvs}`, where :code:`grvs` is
        :code:`list(set(vars) - set(list(local_RVs) + list(observed_RVs)))`.
    local_RVs : Ordered dict or None
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    References
    ----------
    - Kingma, D. P., & Welling, M. (2014).
      Auto-Encoding Variational Bayes. stat, 1050, 1.
    - Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015).
      Automatic variational inference in Stan. In Advances in neural
      information processing systems (pp. 568-576).
    - Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015).
      Weight Uncertainty in Neural Network. In Proceedings of the 32nd
      International Conference on Machine Learning (ICML-15) (pp. 1613-1622).
    """
    if encoder_params is None:
        encoder_params = []

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    _check_minibatches(minibatch_tensors, minibatches)

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    def get_transformed(v):
        if hasattr(v, 'transformed'):
            return v.transformed
        return v

    local_RVs = OrderedDict([(get_transformed(v), (uw, s))
                             for v, (uw, s) in local_RVs.items()])

    # Get global variables
    grvs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))
    if global_RVs is None:
        global_RVs = OrderedDict({v: 1 for v in grvs})
    elif len(grvs) != len(global_RVs):
        _value_error('global_RVs ({}) must have all global RVs: {}'.format(
            [v for v in global_RVs], grvs))

    # ELBO wrt variational parameters
    elbo, uw_l, uw_g = _make_elbo_t(observed_RVs, global_RVs, local_RVs,
                                    model.potentials, n_mcsamples, random_seed)

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)

    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_
         for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)})
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates, mode=mode)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        elbos[i] = e
        if n < 10:
            progress.set_description('ELBO = {:,.2f}'.format(elbos[i]))
        elif i % (n // 10) == 0 and i > 0:
            avg_elbo = elbos[i - n // 10:i].mean()
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
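
A hedged sketch of the explicit-weighting convention described in the
docstring above: pass the scaling :math:`N/M` through observed_RVs instead of
minibatch_RVs and total_size. The toy model, data, and names are illustrative
assumptions.

# Hedged sketch: advi_minibatch with explicit observed_RVs scaling (N/M).
from collections import OrderedDict
import numpy as np
import theano
import pymc3 as pm

data = np.random.randn(500)
M = 50
batch = theano.shared(data[:M])

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    y = pm.Normal('y', mu=mu, sd=1., observed=batch)

def minibatches():
    while True:
        yield [np.random.choice(data, M, replace=False)]

result = advi_minibatch(
    model=model, n=1000,
    minibatch_tensors=[batch], minibatches=minibatches(),
    observed_RVs=OrderedDict([(y, len(data) / M)]))
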
Exemple #35
0
def fit(n=10000, local_rv=None, method='advi', model=None,
        random_seed=None, start=None, inf_kwargs=None, **kwargs):
    R"""Handy shortcut for using inference methods in functional way

    Parameters
    ----------
    n : `int`
        number of iterations
    local_rv : dict[var->tuple]
        mapping {model_variable -> approx params}
        Local Vars are used for Autoencoding Variational Bayes
        See (AEVB; Kingma and Welling, 2014) for details
    method : str or :class:`Inference`
        string name is case insensitive in:

        -   'advi'  for ADVI
        -   'fullrank_advi'  for FullRankADVI
        -   'svgd'  for Stein Variational Gradient Descent
        -   'asvgd'  for Amortized Stein Variational Gradient Descent
        -   'nfvi'  for Normalizing Flow with default `scale-loc` flow
        -   'nfvi=<formula>'  for Normalizing Flow using formula

    model : :class:`Model`
        PyMC3 model for inference
    random_seed : None or int
        leave None to use package global RandomStream or other
        valid value to create instance specific one
    inf_kwargs : dict
        additional kwargs passed to :class:`Inference`
    start : `Point`
        starting point for inference

    Other Parameters
    ----------------
    score : bool
            evaluate loss on each iteration or not
    callbacks : list[function : (Approximation, losses, i) -> None]
        calls provided functions after each iteration step
    progressbar : bool
        whether to show progressbar or not
    obj_n_mc : `int`
        Number of monte carlo samples used for approximation of objective gradients
    tf_n_mc : `int`
        Number of monte carlo samples used for approximation of test function gradients
    obj_optimizer : function (grads, params) -> updates
        Optimizer that is used for objective params
    test_optimizer : function (grads, params) -> updates
        Optimizer that is used for test function params
    more_obj_params : `list`
        Add custom params for objective optimizer
    more_tf_params : `list`
        Add custom params for test function optimizer
    more_updates : `dict`
        Add custom updates to resulting updates
    total_grad_norm_constraint : `float`
        Bounds gradient norm, prevents exploding gradient problem
    fn_kwargs : `dict`
        Add kwargs to theano.function (e.g. `{'profile': True}`)
    more_replacements : `dict`
        Apply custom replacements before calculating gradients

    Returns
    -------
    :class:`Approximation`
    """
    if inf_kwargs is None:
        inf_kwargs = dict()
    else:
        inf_kwargs = inf_kwargs.copy()
    if local_rv is not None:
        inf_kwargs['local_rv'] = local_rv
    if random_seed is not None:
        inf_kwargs['random_seed'] = random_seed
    if start is not None:
        inf_kwargs['start'] = start
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(
        advi=ADVI,
        fullrank_advi=FullRankADVI,
        svgd=SVGD,
        asvgd=ASVGD,
        nfvi=NFVI
    )
    if isinstance(method, str):
        method = method.lower()
        if method.startswith('nfvi='):
            formula = method[5:]
            inference = NFVI(
                formula,
                **inf_kwargs
                )
        elif method in _select:

            inference = _select[method](
                model=model,
                **inf_kwargs
            )
        else:
            raise KeyError('method should be one of %s '
                           'or Inference instance' %
                           set(_select.keys()))
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError('method should be one of %s '
                        'or Inference instance' %
                        set(_select.keys()))
    return inference.fit(n, **kwargs)
Exemple #36
0
def sample_vp(
        vparams, draws=1000, model=None, local_RVs=None, random_seed=None,
        hide_transformed=True, progressbar=True):
    """Draw samples from variational posterior.

    Parameters
    ----------
    vparams : dict or pymc3.variational.ADVIFit
        Estimated variational parameters of the model.
    draws : int
        Number of random samples.
    model : pymc3.Model
        Probabilistic model.
    random_seed : int or None
        Seed of random number generator.  None to use current seed.
    hide_transformed : bool
        If False, transformed variables are also sampled. Default is True.

    Returns
    -------
    trace : pymc3.backends.base.MultiTrace
        Samples drawn from the variational posterior.
    """
    model = pm.modelcontext(model)

    if isinstance(vparams, ADVIFit):
        vparams = {
            'means': vparams.means,
            'stds': vparams.stds
        }

    ds = model.deterministics

    def get_transformed(v):
        return v if v not in ds else v.transformed

    def rvs(x):
        return [get_transformed(v) for v in x] if x is not None else []

    global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs)))

    # Make dict for replacements of random variables
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)
    updates = {}
    for v in global_RVs:
        u = theano.shared(vparams['means'][str(v)]).ravel()
        w = theano.shared(vparams['stds'][str(v)]).ravel()
        n = r.normal(size=u.tag.test_value.shape)
        updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)})

    if local_RVs is not None:
        for v_, (uw, _) in local_RVs.items():
            v = get_transformed(v_)
            u = uw[0].ravel()
            w = uw[1].ravel()
            n = r.normal(size=u.tag.test_value.shape)
            updates.update(
                {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)})

    # Replace some nodes of the graph with variational distributions
    vars = model.free_RVs
    samples = theano.clone(vars, updates)
    f = theano.function([], samples)

    # Random variables which will be sampled
    if hide_transformed:
        vars_sampled = [v_ for v_ in model.unobserved_RVs
                        if not str(v_).endswith('_')]
    else:
        vars_sampled = [v_ for v_ in model.unobserved_RVs]

    varnames = [str(var) for var in model.unobserved_RVs]
    trace = pm.sampling.NDArray(model=model, vars=vars_sampled)
    trace.setup(draws=draws, chain=0)

    range_ = trange(draws) if progressbar else range(draws)

    for _ in range_:
        # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...}
        point = {varname: value for varname, value in zip(varnames, f())}
        trace.record(point)

    return MultiTrace([trace])
Exemple #37
0
def optimize(
    start=None,
    vars=None,
    model=None,
    return_info=False,
    verbose=True,
    **kwargs
):
    """Maximize the log prob of a PyMC3 model using scipy

    All extra arguments are passed directly to the ``scipy.optimize.minimize``
    function.

    Args:
        start: The PyMC3 coordinate dictionary of the starting position
        vars: The variables to optimize
        model: The PyMC3 model
        return_info: Return both the coordinate dictionary and the result of
            ``scipy.optimize.minimize``
        verbose: Print the success flag and log probability to the screen

    """
    from scipy.optimize import minimize

    model = pm.modelcontext(model)

    # Work out the full starting coordinates
    if start is None:
        start = model.test_point
    else:
        update_start_vals(start, model.test_point, model)

    # Fit all the parameters by default
    if vars is None:
        vars = model.cont_vars
    vars = inputvars(vars)
    allinmodel(vars, model)

    # Work out the relevant bijection map
    start = Point(start, model=model)
    bij = DictToArrayBijection(ArrayOrdering(vars), start)

    # Pre-compile the theano model and gradient
    nlp = -model.logpt
    grad = theano.grad(nlp, vars, disconnected_inputs="ignore")
    func = get_theano_function_for_var([nlp] + grad, model=model)

    if verbose:
        names = [
            get_untransformed_name(v.name)
            if is_transformed_name(v.name)
            else v.name
            for v in vars
        ]
        sys.stderr.write(
            "optimizing logp for variables: [{0}]\n".format(", ".join(names))
        )
        bar = tqdm.tqdm()

    # This returns the objective function and its derivatives
    def objective(vec):
        res = func(*get_args_for_theano_function(bij.rmap(vec), model=model))
        d = dict(zip((v.name for v in vars), res[1:]))
        g = bij.map(d)
        if verbose:
            bar.set_postfix(logp="{0:e}".format(-res[0]))
            bar.update()
        return res[0], g

    # Optimize using scipy.optimize
    x0 = bij.map(start)
    initial = objective(x0)[0]
    kwargs["jac"] = True
    info = minimize(objective, x0, **kwargs)

    # Only accept the output if it is better than it was
    x = info.x if (np.isfinite(info.fun) and info.fun < initial) else x0

    # Coerce the output into the right format
    vars = get_default_varnames(model.unobserved_RVs, True)
    point = {
        var.name: value
        for var, value in zip(vars, model.fastfn(vars)(bij.rmap(x)))
    }

    if verbose:
        bar.close()
        sys.stderr.write("message: {0}\n".format(info.message))
        sys.stderr.write("logp: {0} -> {1}\n".format(-initial, -info.fun))
        if not np.isfinite(info.fun):
            logger.warning("final logp not finite, returning initial point")
            logger.warning(
                "this suggests that something is wrong with the model"
            )
            logger.debug("{0}".format(info))

    if return_info:
        return point, info
    return point
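
A minimal usage sketch for the optimizer above, hedged: it assumes `optimize` is importable next to a standard PyMC3 model, and the toy data and variable names are invented for illustration.

import numpy as np
import pymc3 as pm

np.random.seed(42)
y_obs = np.random.randn(100) + 1.0  # fake data with true mean 1.0

with pm.Model() as toy_model:
    mu = pm.Normal("mu", mu=0.0, sd=10.0)
    pm.Normal("y", mu=mu, sd=1.0, observed=y_obs)

    # Maximize the model log probability starting from the test point.
    map_point = optimize(vars=[mu], verbose=False)

print(map_point["mu"])  # should land close to the sample mean of y_obs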
Exemple #38
0
def compute_source_mag_and_blend_fraction(data,
                                          Delta_F,
                                          F_base,
                                          u_0,
                                          model=None):
    """
    Converts the flux parameters :math:`(\Delta F, F_\mathrm{base})` to more
    physically meaningful quantities: the source star brightness in magnitudes
    and the blend ratio :math:`g=F_B/F_S`.
    
    Parameters
    ----------
    data : :func:`~caustic.data.Data`
        Microlensing event data. 
    Delta_F : theano.tensor
        Tensor of shape ``(n_bands)``.
    F_base : theano.tensor
        Tensor of shape ``(n_bands)``.
    u_0 : theano.tensor
        Lens--source separation at time :math:`t_0`.
    model : pymc3.Model
        PyMC3 model object which was used to obtain posterior samples in the
        trace. Whether or not the fluxes are standardized to zero median and
        unit standard deviation is read from ``model.is_standardized``.
    
    Returns
    -------
    tuple
        ``(m_source, g)``.
    """
    model = pm.modelcontext(model)

    if model.is_standardized is True:
        # Revert F_base and Delta_F to non-standardized units
        data.units = "fluxes"
        fluxes_median = np.zeros(len(data.light_curves))
        fluxes_std = np.zeros(len(data.light_curves))

        for i, table in enumerate(data.light_curves):
            mask = table["mask"]
            fluxes_median[i] = np.median(table["flux"][mask])
            fluxes_std[i] = np.std(table["flux"][mask])

        # Flux parameters to standard flux units
        Delta_F_ = T.as_tensor_variable(fluxes_std) * Delta_F
        F_base_ = T.as_tensor_variable(
            fluxes_std) * F_base + T.as_tensor_variable(fluxes_median)
    else:
        Delta_F_ = Delta_F
        F_base_ = F_base

    # Calculate source flux and blend flux
    A_u0 = (u_0**2 + 2) / (T.abs_(u_0) * T.sqrt(u_0**2 + 4))

    F_S = Delta_F_ / (A_u0 - 1)
    F_B = F_base_ - F_S

    g = F_B / F_S

    # Convert fluxes to magnitudes
    zero_point = 22.0
    m_source = zero_point - 2.5 * T.log10(F_S)

    return m_source, g
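
The flux algebra in the function above can be checked with plain NumPy. This is a hedged numerical sketch: the input values are arbitrary, and the 22-magnitude zero point is the one hard-coded above.

import numpy as np

u_0 = 0.1                      # lens--source separation at t_0
Delta_F, F_base = 50.0, 12.0   # hypothetical flux parameters in standard units

A_u0 = (u_0**2 + 2) / (np.abs(u_0) * np.sqrt(u_0**2 + 4))  # peak magnification
F_S = Delta_F / (A_u0 - 1)     # source flux
F_B = F_base - F_S             # blend flux
g = F_B / F_S                  # blend ratio
m_source = 22.0 - 2.5 * np.log10(F_S)

print(A_u0, g, m_source)       # roughly 10.04, 1.17, 20.14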
Exemple #39
0
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
                   minibatch_RVs=None, minibatch_tensors=None,
                   minibatches=None, global_RVs=None, local_RVs=None,
                   observed_RVs=None, encoder_params=None, total_size=None,
                   optimizer=None, learning_rate=.001, epsilon=.1,
                   random_seed=None, mode=None):
    """Perform mini-batch ADVI.

    This function implements a mini-batch automatic differentiation variational
    inference (ADVI; Kucukelbir et al., 2015) with the meanfield
    approximation. Autoencoding variational Bayes (AEVB; Kingma and Welling,
    2014) is also supported.

    For explanation, we classify random variables in probabilistic models into
    three types. Observed random variables
    :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations.
    Each :math:`\mathbf{y}_{i}` can be a set of observed random variables,
    i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where
    :math:`V_{o}` is the number of types of observed random variables
    in the model.

    The next ones are global random variables
    :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
    the probabilities for all observed samples.

    The last ones are local random variables
    :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where
    :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`.
    These RVs are used only in AEVB.

    The goal of ADVI is to approximate the posterior distribution
    :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior
    :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms
    are normal distributions (mean-field approximation).

    :math:`q(\Theta)` is parametrized with its means and standard deviations.
    These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
    a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
    each observation. Therefore these parameters are denoted as
    :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters
    of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a
    multilayer perceptron or convolutional neural network.

    In addition to :math:`\\xi(\cdot)`, we can also include deterministic
    mappings for the likelihood of observations. We denote the parameters of
    the deterministic mappings as :math:`\eta`. An example of such mappings is
    the deconvolutional neural network used in the convolutional VAE example
    in the PyMC3 notebook directory.

    This function maximizes the evidence lower bound (ELBO)
    :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows:

    .. math::

        {\cal L}(\gamma,\\nu,\eta) & =
        \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
        \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
        \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
        \\right]\\right] \\\\ &
        - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right]
        - \mathbf{c}_{l}\sum_{i=1}^{N}
            KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right],

    where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence

    .. math::

        KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv,

    :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
    More precisely, we can write each of the terms in ELBO as follows:

    .. math::

        \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
        \sum_{k=1}^{V_{o}}c_{o}^{k}
            \log p(\mathbf{y}_{i}^{k}|
                   {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\
        \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = &
        \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
            q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\
        \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right] & = &
        \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
            q(\mathbf{z}_{i}^{k})||
            p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right],

    where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v`
    in the directed acyclic graph of the model.

    When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
    set to :math:`N/M`, where :math:`M` is the number of observations in each
    mini-batch. Another weighting scheme was proposed in
    (Blundell et al., 2015) for accelerating model fitting.

    For working with ADVI, we need to give the probabilistic model
    (:code:`model`), the three types of RVs (:code:`observed_RVs`,
    :code:`global_RVs` and :code:`local_RVs`), the tensors to which
    mini-batched samples are supplied (:code:`minibatch_tensors`), a
    generator of mini-batches (:code:`minibatches`), and the parameters of
    the deterministic mappings :math:`\\xi` and :math:`\eta`
    (:code:`encoder_params`) as input arguments.

    :code:`observed_RVs` is an :code:`OrderedDict` of the form
    :code:`{y_k: c_k}`, where :code:`y_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{o}^{k}`) and it can be a
    shared variable.

    :code:`global_RVs` is an :code:`OrderedDict` of the form
    :code:`{t_k: c_k}`, where :code:`t_k` is a random variable defined in the
    PyMC3 model. :code:`c_k` is a scalar (:math:`c_{g}^{k}`) and it can be a
    shared variable.

    :code:`local_RVs` is an :code:`OrderedDict` of the form
    :code:`{z_k: ((m_k, s_k), c_k)}`, where :code:`z_k` is a random variable
    defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{l}^{k}`)
    and it can be a shared variable. :code:`(m_k, s_k)` is a pair of tensors
    of means and log standard deviations of the variational distribution;
    samples drawn from the variational distribution replaces :code:`z_k`.
    It should be noted that if :code:`z_k` has a transformation that changes
    the dimension (e.g., StickBreakingTransform), the variational distribution
    must have the same dimension. For example, if :code:`z_k` follows a
    Dirichlet distribution with :code:`p` choices, :code:`m_k` and
    :code:`s_k` have the shape :code:`(n_samples_in_minibatch, p - 1)`.

    :code:`minibatch_tensors` is a list of tensors (can be shared variables)
    to which mini-batch samples are set during the optimization.
    These tensors are observations (:code:`obs=`) in :code:`observed_RVs`.

    :code:`minibatches` is a generator of a list of :code:`numpy.ndarray`.
    Each item of the list will be set to tensors in :code:`minibatch_tensors`.

    :code:`encoder_params` is a list of shared variables of the parameters
    :math:`\\nu` and :math:`\eta`. We do not need to include the variational
    parameters of the global variables, :math:`\gamma`, because these are
    automatically created and updated in this function.

    The following is a list of example notebooks using advi_minibatch:

    - docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb
    - docs/source/notebooks/bayesian_neural_network_advi.ipynb
    - docs/source/notebooks/convolutional_vae_keras_advi.ipynb
    - docs/source/notebooks/gaussian-mixture-model-advi.ipynb
    - docs/source/notebooks/lda-advi-aevb.ipynb

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both of arguments local_RVs and
        observed_RVs must be None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must be the same with the number of
        random variables in `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    observed_RVs : Ordered dict
        Include a scaling constant for the corresponding RV. See the above
        description.
    global_RVs : Ordered dict or None
        Include a scaling constant for the corresponding RV. See the above
        description. If :code:`None`, it is set to
        :code:`{v: 1 for v in grvs}`, where :code:`grvs` is
        :code:`list(set(vars) - set(list(local_RVs) + list(observed_RVs)))`.
    local_RVs : Ordered dict or None
        Include encoded variational parameters and a scaling constant for
        the corresponding RV. See the above description.
    encoder_params : list of theano shared variables
        Parameters of encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when :code:`optimizer` is set.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    References
    ----------
    - Kingma, D. P., & Welling, M. (2014).
      Auto-Encoding Variational Bayes. stat, 1050, 1.
    - Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015).
      Automatic variational inference in Stan. In Advances in neural
      information processing systems (pp. 568-576).
    - Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015).
      Weight Uncertainty in Neural Network. In Proceedings of the 32nd
      International Conference on Machine Learning (ICML-15) (pp. 1613-1622).
    """
    import warnings
    warnings.warn('Old ADVI interface is deprecated and will be removed in the future, use pm.ADVI instead',
                  DeprecationWarning, stacklevel=2)
    if encoder_params is None:
        encoder_params = []

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    _check_minibatches(minibatch_tensors, minibatches)

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    def get_transformed(v):
        if hasattr(v, 'transformed'):
            return v.transformed
        return v
    local_RVs = OrderedDict(
        [(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()]
    )

    # Get global variables
    grvs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))
    if global_RVs is None:
        global_RVs = OrderedDict({v: 1 for v in grvs})
    _value_error(len(grvs) == len(global_RVs),
                 'global_RVs ({}) must have all global RVs: {}'.format(
                     [v for v in global_RVs], grvs)
    )

    # ELBO wrt variational parameters
    elbo, uw_l, uw_g = _make_elbo_t(observed_RVs, global_RVs, local_RVs,
                                    model.potentials, n_mcsamples, random_seed)

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)
    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)}
    )
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates, mode=mode)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        if np.isnan(e):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        elbos[i] = e
        if n < 10:
            progress.set_description('ELBO = {:,.2f}'.format(elbos[i]))
        elif i % (n // 10) == 0 and i > 0:
            avg_elbo = infmean(elbos[i - n // 10:i])
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
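
A hedged usage sketch of this deprecated mini-batch interface: the toy model, batch size, and generator below are invented for illustration, and only the argument structure follows the docstring above.

import numpy as np
import theano
import pymc3 as pm

data = np.random.randn(10000) + 2.0
batch_size, total = 100, len(data)
x_shared = theano.shared(data[:batch_size])  # tensor fed with mini-batches

def minibatch_gen():
    while True:
        idx = np.random.choice(total, batch_size, replace=False)
        yield [data[idx]]

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    x = pm.Normal('x', mu=mu, sd=1., observed=x_shared)

    fit_result = advi_minibatch(
        n=1000,
        minibatch_tensors=[x_shared],
        minibatch_RVs=[x],
        minibatches=minibatch_gen(),
        total_size=total,
    )

print(fit_result.means)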
Exemple #40
0
def sample_vp(vparams,
              draws=1000,
              model=None,
              local_RVs=None,
              random_seed=None,
              include_transformed=False,
              progressbar=True):
    """Draw samples from variational posterior.

    Parameters
    ----------
    vparams : dict or pymc3.variational.ADVIFit
        Estimated variational parameters of the model.
    draws : int
        Number of random samples.
    model : pymc3.Model
        Probabilistic model.
    local_RVs : OrderedDict or None
        Mapping ``{rv: ((mu, log_sd), scaling)}`` of local (AEVB) random
        variables, as described for ``advi_minibatch``; samples for these RVs
        are drawn from their encoded variational distributions.
    random_seed : int or None
        Seed of random number generator. None to use current seed.
    include_transformed : bool
        If True, transformed variables are also sampled. Default is False.
    progressbar : bool
        Whether or not to display a progress bar.

    Returns
    -------
    trace : pymc3.backends.base.MultiTrace
        Samples drawn from the variational posterior.
    """
    import warnings
    warnings.warn(
        'Old ADVI interface and sample_vp are deprecated and will '
        'be removed in the future, use pm.fit and pm.sample_approx instead',
        DeprecationWarning,
        stacklevel=2)
    model = pm.modelcontext(model)

    if isinstance(vparams, ADVIFit):
        vparams = {'means': vparams.means, 'stds': vparams.stds}

    ds = model.deterministics

    def get_transformed(v):
        return v if v not in ds else v.transformed

    def rvs(x):
        return [get_transformed(v) for v in x] if x is not None else []

    global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs)))

    # Make dict for replacements of random variables
    if random_seed is None:
        r = MRG_RandomStreams(gen_random_state())
    else:
        r = MRG_RandomStreams(seed=random_seed)
    updates = {}
    for v in global_RVs:
        u = theano.shared(vparams['means'][str(v)]).ravel()
        w = theano.shared(vparams['stds'][str(v)]).ravel()
        n = r.normal(size=u.tag.test_value.shape)
        updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)})

    if local_RVs is not None:
        for v_, (uw, _) in local_RVs.items():
            v = get_transformed(v_)
            u = uw[0].ravel()
            w = uw[1].ravel()
            n = r.normal(size=u.tag.test_value.shape)
            updates.update(
                {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)})

    # Replace some nodes of the graph with variational distributions
    vars = model.free_RVs
    samples = theano.clone(vars, updates)
    f = theano.function([], samples)

    # Random variables which will be sampled
    vars_sampled = pm.util.get_default_varnames(
        model.unobserved_RVs, include_transformed=include_transformed)

    varnames = [str(var) for var in model.unobserved_RVs]
    trace = pm.sampling.NDArray(model=model, vars=vars_sampled)
    trace.setup(draws=draws, chain=0)

    range_ = trange(draws) if progressbar else range(draws)

    for _ in range_:
        # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...}
        point = {varname: value for varname, value in zip(varnames, f())}
        trace.record(point)

    return MultiTrace([trace])
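
A hedged sketch of how this deprecated pair was typically combined: fit variational parameters with the old mean-field `advi` (shown further below), then draw from the variational posterior with `sample_vp`. The toy model is invented for illustration.

import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=[0.5, 1.2, -0.3])

    v_params = pm.variational.advi(n=5000)   # deprecated mean-field ADVI
    trace = sample_vp(v_params, draws=500, include_transformed=False)

print(trace['mu'].mean())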
Exemple #41
0
def fit(n=10000,
        local_rv=None,
        method='advi',
        model=None,
        random_seed=None,
        start=None,
        inf_kwargs=None,
        **kwargs):
    R"""
    Handy shortcut for using inference methods in a functional way

    Parameters
    ----------
    n : `int`
        number of iterations
    local_rv : dict[var->tuple]
        mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)}
        Local Vars are used for Autoencoding Variational Bayes
        See (AEVB; Kingma and Welling, 2014) for details
    method : str or :class:`Inference`
        string name is case insensitive in:

        -   'advi'  for ADVI
        -   'fullrank_advi'  for FullRankADVI
        -   'advi->fullrank_advi'  for fitting ADVI first and then FullRankADVI
        -   'svgd'  for Stein Variational Gradient Descent
        -   'asvgd'  for Amortized Stein Variational Gradient Descent
        -   'nfvi'  for Normalizing Flow
        -   'nfvi=formula'  for Normalizing Flow using formula

    model : :class:`Model`
        PyMC3 model for inference
    random_seed : None or int
        leave None to use package global RandomStream or other
        valid value to create instance specific one
    inf_kwargs : dict
        additional kwargs passed to :class:`Inference`
    start : `Point`
        starting point for inference

    Other Parameters
    ----------------
    frac : `float`
        if method is 'advi->fullrank_advi' represents advi fraction when training
    kwargs : kwargs
        additional kwargs for :func:`Inference.fit`

    Returns
    -------
    :class:`Approximation`
    """
    if inf_kwargs is None:
        inf_kwargs = dict()
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(advi=ADVI,
                   fullrank_advi=FullRankADVI,
                   svgd=SVGD,
                   asvgd=ASVGD,
                   nfvi=NFVI)
    if isinstance(method, str):
        method = method.lower()
        if method == 'advi->fullrank_advi':
            frac = kwargs.pop('frac', .5)
            if not 0. < frac < 1.:
                raise ValueError('frac should be in (0, 1)')
            n1 = int(n * frac)
            n2 = n - n1
            inference = ADVI(local_rv=local_rv,
                             model=model,
                             random_seed=random_seed,
                             start=start)
            logger.info('fitting advi ...')
            inference.fit(n1, **kwargs)
            inference = FullRankADVI.from_advi(inference)
            logger.info('fitting fullrank advi ...')
            return inference.fit(n2, **kwargs)
        elif method.startswith('nfvi='):
            formula = method[5:]
            inference = NFVI(
                formula,
                local_rv=local_rv,
                model=model,
                random_seed=random_seed,
                start=start,  # ignored by now, hope I'll find a good application for this argument
                **inf_kwargs)
        elif method in _select:
            inference = _select[method](local_rv=local_rv,
                                        model=model,
                                        random_seed=random_seed,
                                        start=start,
                                        **inf_kwargs)
        else:
            raise KeyError('method should be one of %s '
                           'or Inference instance' % set(_select.keys()))
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError('method should be one of %s '
                        'or Inference instance' % set(_select.keys()))
    return inference.fit(n, **kwargs)
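
A minimal usage sketch for `fit` (toy model invented for illustration); any of the method strings listed above can be passed in place of 'advi'.

import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=[0.1, -0.4, 0.9])

    approx = pm.fit(n=10000, method='advi')   # returns an Approximation
    trace = approx.sample(1000)               # draws from the fitted posterior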
Exemple #42
0
def sample_numpyro_nuts(
    draws=1000,
    tune=1000,
    chains=4,
    target_accept=0.8,
    random_seed=10,
    model=None,
    progress_bar=True,
    keep_untransformed=False,
):
    from numpyro.infer import MCMC, NUTS

    from pymc3 import modelcontext

    model = modelcontext(model)

    seed = jax.random.PRNGKey(random_seed)

    fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, [model.logpt])
    fns = jax_funcify(fgraph)
    logp_fn_jax = fns[0]

    rv_names = [rv.name for rv in model.free_RVs]
    init_state = [model.test_point[rv_name] for rv_name in rv_names]
    init_state_batched = jax.tree_map(lambda x: np.repeat(x[None, ...], chains, axis=0), init_state)

    @jax.jit
    def _sample(current_state, seed):
        step_size = jax.tree_map(jax.numpy.ones_like, init_state)
        nuts_kernel = NUTS(
            potential_fn=lambda x: -logp_fn_jax(*x),
            # model=model,
            target_accept_prob=target_accept,
            adapt_step_size=True,
            adapt_mass_matrix=True,
            dense_mass=False,
        )

        pmap_numpyro = MCMC(
            nuts_kernel,
            num_warmup=tune,
            num_samples=draws,
            num_chains=chains,
            postprocess_fn=None,
            chain_method="parallel",
            progress_bar=progress_bar,
        )

        pmap_numpyro.run(seed, init_params=current_state, extra_fields=("num_steps",))
        samples = pmap_numpyro.get_samples(group_by_chain=True)
        leapfrogs_taken = pmap_numpyro.get_extra_fields(group_by_chain=True)["num_steps"]
        return samples, leapfrogs_taken

    print("Compiling...")
    tic2 = pd.Timestamp.now()
    map_seed = jax.random.split(seed, chains)
    mcmc_samples, leapfrogs_taken = _sample(init_state_batched, map_seed)
    # map_seed = jax.random.split(seed, chains)
    # mcmc_samples = _sample(init_state_batched, map_seed)
    # tic4 = pd.Timestamp.now()
    # print("Sampling time = ", tic4 - tic3)

    posterior = {k: v for k, v in zip(rv_names, mcmc_samples)}
    tic3 = pd.Timestamp.now()
    posterior = _transform_samples(posterior, model, keep_untransformed=keep_untransformed)
    tic4 = pd.Timestamp.now()

    az_trace = az.from_dict(posterior=posterior)
    print("Compilation + sampling time = ", tic3 - tic2)
    print("Transformation time = ", tic4 - tic3)

    return az_trace  # , leapfrogs_taken, tic3 - tic2
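
A hedged calling sketch, assuming the helper above is importable from a JAX-enabled PyMC3 branch; the toy model is invented and the returned object is the ArviZ trace built above.

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal("mu", mu=0.0, sd=10.0)
    pm.Normal("y", mu=mu, sd=1.0, observed=np.random.randn(50))

az_trace = sample_numpyro_nuts(draws=500, tune=500, chains=2, model=model)
print(az_trace.posterior["mu"].shape)  # (chains, draws)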
Exemple #43
0
 def __init__(self, vars, model=None, **kwargs):
     model = pm.modelcontext(model)
     self.model = model
     vars = pm.inputvars(vars)
     super(ConstantStep, self).__init__(vars, [model.fastlogp], **kwargs)
Exemple #44
0
def sample_tfp_nuts(
    draws=1000,
    tune=1000,
    chains=4,
    target_accept=0.8,
    random_seed=10,
    model=None,
    num_tuning_epoch=2,
    num_compute_step_size=500,
):
    import jax

    from tensorflow_probability.substrates import jax as tfp

    model = modelcontext(model)

    seed = jax.random.PRNGKey(random_seed)

    fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, [model.logpt])
    fns = jax_funcify(fgraph)
    logp_fn_jax = fns[0]

    rv_names = [rv.name for rv in model.free_RVs]
    init_state = [model.test_point[rv_name] for rv_name in rv_names]
    init_state_batched = jax.tree_map(lambda x: np.repeat(x[None, ...], chains, axis=0), init_state)

    @jax.pmap
    def _sample(init_state, seed):
        def gen_kernel(step_size):
            hmc = tfp.mcmc.NoUTurnSampler(target_log_prob_fn=logp_fn_jax, step_size=step_size)
            return tfp.mcmc.DualAveragingStepSizeAdaptation(
                hmc, tune // num_tuning_epoch, target_accept_prob=target_accept
            )

        def trace_fn(_, pkr):
            return pkr.new_step_size

        def get_tuned_stepsize(samples, step_size):
            return step_size[-1] * jax.numpy.std(samples[-num_compute_step_size:])

        step_size = jax.tree_map(jax.numpy.ones_like, init_state)
        for i in range(num_tuning_epoch - 1):
            tuning_hmc = gen_kernel(step_size)
            init_samples, tuning_result, kernel_results = tfp.mcmc.sample_chain(
                num_results=tune // num_tuning_epoch,
                current_state=init_state,
                kernel=tuning_hmc,
                trace_fn=trace_fn,
                return_final_kernel_results=True,
                seed=seed,
            )

            step_size = jax.tree_multimap(get_tuned_stepsize, list(init_samples), tuning_result)
            init_state = [x[-1] for x in init_samples]

        # Run inference
        sample_kernel = gen_kernel(step_size)
        mcmc_samples, leapfrog_num = tfp.mcmc.sample_chain(
            num_results=draws,
            num_burnin_steps=tune // num_tuning_epoch,
            current_state=init_state,
            kernel=sample_kernel,
            trace_fn=lambda _, pkr: pkr.inner_results.leapfrogs_taken,
            seed=seed,
        )

        return mcmc_samples, leapfrog_num

    print("Compiling...")
    tic2 = pd.Timestamp.now()
    map_seed = jax.random.split(seed, chains)
    mcmc_samples, leapfrog_num = _sample(init_state_batched, map_seed)

    # map_seed = jax.random.split(seed, chains)
    # mcmc_samples = _sample(init_state_batched, map_seed)
    # tic4 = pd.Timestamp.now()
    # print("Sampling time = ", tic4 - tic3)

    posterior = {k: v for k, v in zip(rv_names, mcmc_samples)}

    az_trace = az.from_dict(posterior=posterior)
    tic3 = pd.Timestamp.now()
    print("Compilation + sampling time = ", tic3 - tic2)
    return az_trace  # , leapfrog_num, tic3 - tic2
Exemple #45
0
 def __init__(self, var, model=None, values=[0,1]):
     model = pm.modelcontext(model)
     self.values = values
     super(RandomScanDiscreteMetropolis, self).__init__([var], [model.fastlogp])
Exemple #46
0
def init_nuts(init='ADVI',
              njobs=1,
              n_init=500000,
              model=None,
              random_seed=-1,
              progressbar=True,
              **kwargs):
    """Initialize and sample from posterior of a continuous model.

    This is a convenience function. NUTS convergence and sampling speed is extremely
    dependent on the choice of mass/scaling matrix. In our experience, using ADVI
    to estimate a diagonal covariance matrix and using this as the scaling matrix
    produces robust results over a wide class of continuous models.

    Parameters
    ----------
    init : str {'ADVI', 'ADVI_MAP', 'MAP', 'NUTS'}
        Initialization method to use.
        * ADVI : Run ADVI to estimate posterior mean and diagonal covariance matrix.
        * ADVI_MAP: Initialize ADVI with MAP and use MAP as starting point.
        * MAP : Use the MAP as starting point.
        * NUTS : Run NUTS and estimate posterior mean and covariance matrix.
    njobs : int
        Number of parallel jobs to start.
    n_init : int
        Number of iterations of initializer
        If 'ADVI', number of iterations, if 'metropolis', number of draws.
    model : Model (optional if in `with` context)
    progressbar : bool
        Whether or not to display a progressbar for advi sampling.
    **kwargs : keyword arguments
        Extra keyword arguments are forwarded to pymc3.NUTS.

    Returns
    -------
    start : pymc3.model.Point
        Starting point for sampler
    nuts_sampler : pymc3.step_methods.NUTS
        Instantiated and initialized NUTS sampler object
    """

    model = pm.modelcontext(model)

    pm._log.info('Initializing NUTS using {}...'.format(init))

    random_seed = int(np.atleast_1d(random_seed)[0])

    if init is not None:
        init = init.lower()

    if init == 'advi':
        v_params = pm.variational.advi(n=n_init,
                                       random_seed=random_seed,
                                       progressbar=progressbar)
        start = pm.variational.sample_vp(v_params,
                                         njobs,
                                         progressbar=False,
                                         hide_transformed=False,
                                         random_seed=random_seed)
        if njobs == 1:
            start = start[0]
        cov = np.power(model.dict_to_array(v_params.stds), 2)
    elif init == 'advi_map':
        start = pm.find_MAP()
        v_params = pm.variational.advi(n=n_init,
                                       start=start,
                                       random_seed=random_seed)
        cov = np.power(model.dict_to_array(v_params.stds), 2)
    elif init == 'map':
        start = pm.find_MAP()
        cov = pm.find_hessian(point=start)
    elif init == 'nuts':
        init_trace = pm.sample(step=pm.NUTS(),
                               draws=n_init,
                               random_seed=random_seed)[n_init // 2:]
        cov = np.atleast_1d(pm.trace_cov(init_trace))
        start = np.random.choice(init_trace, njobs)
        if njobs == 1:
            start = start[0]
    else:
        raise NotImplementedError(
            'Initializer {} is not supported.'.format(init))

    step = pm.NUTS(scaling=cov, is_cov=True, **kwargs)

    return start, step
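
A hedged usage sketch for this older initializer (toy model invented); the returned start point and step object are passed straight to `pm.sample`.

import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=[0.2, 0.7, -1.1])

    start, step = init_nuts(init='advi', n_init=20000, model=model)
    trace = pm.sample(1000, step=step, start=start)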
Exemple #47
0
def get_step_for_trace(trace=None,
                       model=None,
                       diag=False,
                       regularize=True,
                       regular_window=5,
                       regular_variance=1e-3,
                       **kwargs):
    """ Define a tuning procedure that adapts off-diagonal mass matrix terms
        adapted from a blog post by Dan Foreman-Mackey here:
        https://dfm.io/posts/pymc3-mass-matrix/

       Args:
           trace (trace): pymc3 trace object
           model (model): pymc3 model object
           
           diag (bool): flag to tune only the diagonal elements
           regularize (bool): flag to turn on covariance matrix regularization
           regular_window (int): size of parameter space at which regularization becomes important
           regular_variance (float): magnitude of covariance floor
           
       Returns:
           pymc3 step_methods object

    """

    model = pm.modelcontext(model)

    # If not given, use the trivial metric
    if trace is None:
        potential = pm.step_methods.hmc.quadpotential.QuadPotentialFull(
            np.eye(model.ndim))
        return pm.NUTS(potential=potential, **kwargs)

    # Loop over samples and convert to the relevant parameter space
    # while removing divergent samples
    div_mask = np.invert(np.copy(trace.diverging))
    samples = np.empty((div_mask.sum() * trace.nchains, model.ndim))
    i = 0
    imask = 0
    for chain in trace._straces.values():
        for p in chain:
            if div_mask[imask]:
                samples[i] = model.bijection.map(p)
                i += 1
            imask += 1

    # Compute the sample covariance
    cov = np.cov(samples, rowvar=0)
    if diag:
        cov = np.diag(np.diag(cov))

    # Stan uses a regularized estimator for the covariance matrix to
    # be less sensitive to numerical issues for large parameter spaces.
    if regularize:
        N = len(samples)
        cov = cov * N / (N + regular_window)
        cov[np.diag_indices_from(
            cov)] += regular_variance * regular_window / (N + regular_window)

    # Use the sample covariance as the inverse metric
    potential = pm.step_methods.hmc.quadpotential.QuadPotentialFull(cov)

    return pm.NUTS(potential=potential, **kwargs)
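
A hedged sketch of the tuning schedule this helper was written for, following the referenced blog post: run a few short warm-up chains, re-estimating the mass matrix from each, then sample with the final step method. The toy model and run lengths are illustrative only.

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    x = pm.Normal('x', mu=0., sd=1., shape=5)

    burnin_trace = None
    for steps in 100 * 2 ** np.arange(3):   # 100, 200, 400 tuning steps
        step = get_step_for_trace(burnin_trace, model=model)
        burnin_trace = pm.sample(draws=2, tune=int(steps), step=step,
                                 compute_convergence_checks=False)
    step = get_step_for_trace(burnin_trace, model=model)
    trace = pm.sample(draws=1000, tune=1000, step=step)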
Exemple #48
0
def modelcontext(model=None):
    return pm.modelcontext(model)
Exemple #49
0
def init_nuts(init='auto',
              chains=1,
              n_init=500000,
              model=None,
              random_seed=None,
              progressbar=True,
              **kwargs):
    """Set up the mass matrix initialization for NUTS.

    NUTS convergence and sampling speed is extremely dependent on the
    choice of mass/scaling matrix. This function implements different
    methods for choosing or adapting the mass matrix.

    Parameters
    ----------
    init : str
        Initialization method to use.

        * auto : Choose a default initialization method automatically.
          Currently, this is `'jitter+adapt_diag'`, but this can change in
          the future. If you depend on the exact behaviour, choose an
          initialization method explicitly.
        * adapt_diag : Start with an identity mass matrix and then adapt
          a diagonal based on the variance of the tuning samples. All
          chains use the test value (usually the prior mean) as starting
          point.
        * jitter+adapt_diag : Same as `adapt_diag`, but add uniform jitter
          in [-1, 1] to the starting point in each chain.
        * advi+adapt_diag : Run ADVI and then adapt the resulting diagonal
          mass matrix based on the sample variance of the tuning samples.
        * advi+adapt_diag_grad : Run ADVI and then adapt the resulting
          diagonal mass matrix based on the variance of the gradients
          during tuning. This is **experimental** and might be removed
          in a future release.
        * advi : Run ADVI to estimate posterior mean and diagonal mass
          matrix.
        * advi_map: Initialize ADVI with MAP and use MAP as starting point.
        * map : Use the MAP as starting point. This is discouraged.
        * nuts : Run NUTS and estimate posterior mean and mass matrix from
          the trace.
    chains : int
        Number of jobs to start.
    n_init : int
        Number of iterations of initializer
        If 'ADVI', number of iterations, if 'nuts', number of draws.
    model : Model (optional if in `with` context)
    progressbar : bool
        Whether or not to display a progressbar for advi sampling.
    **kwargs : keyword arguments
        Extra keyword arguments are forwarded to pymc3.NUTS.

    Returns
    -------
    start : pymc3.model.Point
        Starting point for sampler
    nuts_sampler : pymc3.step_methods.NUTS
        Instantiated and initialized NUTS sampler object
    """
    model = pm.modelcontext(model)

    vars = kwargs.get('vars', model.vars)
    if set(vars) != set(model.vars):
        raise ValueError('Must use init_nuts on all variables of a model.')
    if not pm.model.all_continuous(vars):
        raise ValueError('init_nuts can only be used for models with only '
                         'continuous variables.')

    if not isinstance(init, str):
        raise TypeError('init must be a string.')

    if init is not None:
        init = init.lower()

    if init == 'auto':
        init = 'jitter+adapt_diag'

    pm._log.info('Initializing NUTS using {}...'.format(init))

    if random_seed is not None:
        random_seed = int(np.atleast_1d(random_seed)[0])
        np.random.seed(random_seed)

    cb = [
        pm.callbacks.CheckParametersConvergence(tolerance=1e-2,
                                                diff='absolute'),
        pm.callbacks.CheckParametersConvergence(tolerance=1e-2,
                                                diff='relative'),
    ]

    if init == 'adapt_diag':
        start = [model.test_point] * chains
        mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
        var = np.ones_like(mean)
        potential = quadpotential.QuadPotentialDiagAdapt(
            model.ndim, mean, var, 10)
    elif init == 'jitter+adapt_diag':
        start = []
        for _ in range(chains):
            mean = {var: val.copy() for var, val in model.test_point.items()}
            for val in mean.values():
                val[...] += 2 * np.random.rand(*val.shape) - 1
            start.append(mean)
        mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
        var = np.ones_like(mean)
        potential = quadpotential.QuadPotentialDiagAdapt(
            model.ndim, mean, var, 10)
    elif init == 'advi+adapt_diag_grad':
        approx = pm.fit(
            random_seed=random_seed,
            n=n_init,
            method='advi',
            model=model,
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window,
        )  # type: pm.MeanField
        start = approx.sample(draws=chains)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds)**2
        mean = approx.bij.rmap(approx.mean.get_value())
        mean = model.dict_to_array(mean)
        weight = 50
        potential = quadpotential.QuadPotentialDiagAdaptGrad(
            model.ndim, mean, cov, weight)
    elif init == 'advi+adapt_diag':
        approx = pm.fit(
            random_seed=random_seed,
            n=n_init,
            method='advi',
            model=model,
            callbacks=cb,
            progressbar=progressbar,
            obj_optimizer=pm.adagrad_window,
        )  # type: pm.MeanField
        start = approx.sample(draws=chains)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds)**2
        mean = approx.bij.rmap(approx.mean.get_value())
        mean = model.dict_to_array(mean)
        weight = 50
        potential = quadpotential.QuadPotentialDiagAdapt(
            model.ndim, mean, cov, weight)
    elif init == 'advi':
        approx = pm.fit(random_seed=random_seed,
                        n=n_init,
                        method='advi',
                        model=model,
                        callbacks=cb,
                        progressbar=progressbar,
                        obj_optimizer=pm.adagrad_window)  # type: pm.MeanField
        start = approx.sample(draws=chains)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds)**2
        potential = quadpotential.QuadPotentialDiag(cov)
    elif init == 'advi_map':
        start = pm.find_MAP(include_transformed=True)
        approx = pm.MeanField(model=model, start=start)
        pm.fit(random_seed=random_seed,
               n=n_init,
               method=pm.KLqp(approx),
               callbacks=cb,
               progressbar=progressbar,
               obj_optimizer=pm.adagrad_window)
        start = approx.sample(draws=chains)
        start = list(start)
        stds = approx.bij.rmap(approx.std.eval())
        cov = model.dict_to_array(stds)**2
        potential = quadpotential.QuadPotentialDiag(cov)
    elif init == 'map':
        start = pm.find_MAP(include_transformed=True)
        cov = pm.find_hessian(point=start)
        start = [start] * chains
        potential = quadpotential.QuadPotentialFull(cov)
    elif init == 'nuts':
        init_trace = pm.sample(draws=n_init,
                               step=pm.NUTS(),
                               tune=n_init // 2,
                               random_seed=random_seed)
        cov = np.atleast_1d(pm.trace_cov(init_trace))
        start = list(np.random.choice(init_trace, chains))
        potential = quadpotential.QuadPotentialFull(cov)
    else:
        raise NotImplementedError(
            'Initializer {} is not supported.'.format(init))

    step = pm.NUTS(potential=potential, model=model, **kwargs)

    return start, step
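
A hedged usage sketch for this newer initializer (toy model invented); 'auto' currently resolves to 'jitter+adapt_diag' as documented above, so each chain gets its own jittered starting point.

import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=[1.0, 0.3, -0.2])

    start, step = init_nuts(init='auto', chains=4, model=model)
    trace = pm.sample(1000, step=step, start=start, chains=4)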
Exemple #50
0
def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, start=None, **kwargs):
    """
    Handy shortcut for using inference methods in a functional way

    Parameters
    ----------
    n : int
        number of iterations
    local_rv : dict[var->tuple]
        mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
        Local Vars are used for Autoencoding Variational Bayes
        See (AEVB; Kingma and Welling, 2014) for details
    method : str or Inference
        string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi'}
    model : Model
    kwargs : kwargs for Inference.fit
    frac : float
        if method is 'advi->fullrank_advi' represents advi fraction when training
    seed : None or int
        leave None to use package global RandomStream or other
        valid value to create instance specific one
    start : Point
        starting point for inference
    Returns
    -------
    Approximation
    """
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(
        advi=ADVI,
        fullrank_advi=FullRankADVI,
        svgd=SVGD
    )
    if isinstance(method, str) and method.lower() == 'advi->fullrank_advi':
        frac = kwargs.pop('frac', .5)
        if not 0. < frac < 1.:
            raise ValueError('frac should be in (0, 1)')
        n1 = int(n * frac)
        n2 = n-n1
        inference = ADVI(local_rv=local_rv, model=model, seed=seed, start=start)
        logger.info('fitting advi ...')
        inference.fit(n1, **kwargs)
        inference = FullRankADVI.from_advi(inference)
        logger.info('fitting fullrank advi ...')
        return inference.fit(n2, **kwargs)

    elif isinstance(method, str):
        try:
            inference = _select[method.lower()](
                local_rv=local_rv, model=model, seed=seed,
                start=start
            )
        except KeyError:
            raise KeyError('method should be one of %s '
                           'or Inference instance' %
                           set(_select.keys()))
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError('method should be one of %s '
                        'or Inference instance' %
                        set(_select.keys()))
    return inference.fit(n, **kwargs)
Exemple #51
0
def fit(n=10000, local_rv=None, method='advi', model=None,
        random_seed=None, start=None, inf_kwargs=None, **kwargs):
    R"""
    Handy shortcut for using inference methods in a functional way

    Parameters
    ----------
    n : `int`
        number of iterations
    local_rv : dict[var->tuple]
        mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)}
        Local Vars are used for Autoencoding Variational Bayes
        See (AEVB; Kingma and Welling, 2014) for details
    method : str or :class:`Inference`
        string name is case insensitive in:

        -   'advi'  for ADVI
        -   'fullrank_advi'  for FullRankADVI
        -   'advi->fullrank_advi'  for fitting ADVI first and then FullRankADVI
        -   'svgd'  for Stein Variational Gradient Descent
        -   'asvgd'  for Amortized Stein Variational Gradient Descent
        -   'nfvi'  for Normalizing Flow
        -   'nfvi=formula'  for Normalizing Flow using formula

    model : :class:`Model`
        PyMC3 model for inference
    random_seed : None or int
        leave None to use package global RandomStream or other
        valid value to create instance specific one
    inf_kwargs : dict
        additional kwargs passed to :class:`Inference`
    start : `Point`
        starting point for inference

    Other Parameters
    ----------------
    frac : `float`
        if method is 'advi->fullrank_advi' represents advi fraction when training
    kwargs : kwargs
        additional kwargs for :func:`Inference.fit`

    Returns
    -------
    :class:`Approximation`
    """
    if inf_kwargs is None:
        inf_kwargs = dict()
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(
        advi=ADVI,
        fullrank_advi=FullRankADVI,
        svgd=SVGD,
        asvgd=ASVGD,
        nfvi=NFVI
    )
    if isinstance(method, str):
        method = method.lower()
        if method == 'advi->fullrank_advi':
            frac = kwargs.pop('frac', .5)
            if not 0. < frac < 1.:
                raise ValueError('frac should be in (0, 1)')
            n1 = int(n * frac)
            n2 = n - n1
            inference = ADVI(
                local_rv=local_rv,
                model=model,
                random_seed=random_seed,
                start=start)
            logger.info('fitting advi ...')
            inference.fit(n1, **kwargs)
            inference = FullRankADVI.from_advi(inference)
            logger.info('fitting fullrank advi ...')
            return inference.fit(n2, **kwargs)
        elif method.startswith('nfvi='):
            formula = method[5:]
            inference = NFVI(
                formula,
                local_rv=local_rv,
                model=model,
                random_seed=random_seed,
                start=start,  # ignored by now, hope I'll find a good application for this argument
                **inf_kwargs
                )
        elif method in _select:
            inference = _select[method](
                local_rv=local_rv,
                model=model,
                random_seed=random_seed,
                start=start,
                **inf_kwargs
            )
        else:
            raise KeyError('method should be one of %s '
                           'or Inference instance' %
                           set(_select.keys()))
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError('method should be one of %s '
                        'or Inference instance' %
                        set(_select.keys()))
    return inference.fit(n, **kwargs)
Exemple #52
0
    def __init__(
        self,
        coarse_models: List[Model],
        vars: Optional[list] = None,
        base_S: Optional = None,
        base_proposal_dist: Optional[Type[Proposal]] = None,
        base_scaling: Union[float, int] = 1.0,
        tune: bool = True,
        base_tune_interval: int = 100,
        model: Optional[Model] = None,
        mode: Optional = None,
        subsampling_rates: List[int] = 5,
        base_blocked: bool = False,
        **kwargs,
    ) -> None:

        warnings.warn("The MLDA implementation in PyMC3 is very young. "
                      "You should be extra critical about its results.")

        model = pm.modelcontext(model)

        # assign internal state
        self.coarse_models = coarse_models
        if not isinstance(coarse_models, list):
            raise ValueError(
                "MLDA step method cannot use coarse_models if it is not a list"
            )
        if len(self.coarse_models) == 0:
            raise ValueError("MLDA step method was given an empty "
                             "list of coarse models. Give at least "
                             "one coarse model.")
        if isinstance(subsampling_rates, int):
            self.subsampling_rates = [subsampling_rates] * len(
                self.coarse_models)
        else:
            if len(subsampling_rates) != len(self.coarse_models):
                raise ValueError(
                    f"List of subsampling rates needs to have the same "
                    f"length as list of coarse models but the lengths "
                    f"were {len(subsampling_rates)}, {len(self.coarse_models)}"
                )
            self.subsampling_rates = subsampling_rates
        self.num_levels = len(self.coarse_models) + 1
        self.base_S = base_S
        self.base_proposal_dist = base_proposal_dist
        self.base_scaling = base_scaling
        self.tune = tune
        self.base_tune_interval = base_tune_interval
        self.model = model
        self.next_model = self.coarse_models[-1]
        self.mode = mode
        self.base_blocked = base_blocked
        self.base_scaling_stats = None

        # Process model variables
        if vars is None:
            vars = model.vars
        vars = pm.inputvars(vars)
        self.vars = vars
        self.var_names = [var.name for var in self.vars]

        self.accepted = 0

        # Construct theano function for current-level model likelihood
        # (for use in acceptance)
        shared = pm.make_shared_replacements(vars, model)
        self.delta_logp = delta_logp(model.logpt, vars, shared)

        # Construct theano function for next-level model likelihood
        # (for use in acceptance)
        next_model = pm.modelcontext(self.next_model)
        vars_next = [
            var for var in next_model.vars if var.name in self.var_names
        ]
        vars_next = pm.inputvars(vars_next)
        shared_next = pm.make_shared_replacements(vars_next, next_model)
        self.delta_logp_next = delta_logp(next_model.logpt, vars_next,
                                          shared_next)

        super().__init__(vars, shared)

        # initialise complete step method hierarchy
        if self.num_levels == 2:
            with self.next_model:
                # make sure the correct variables are selected from next_model
                vars_next = [
                    var for var in self.next_model.vars
                    if var.name in self.var_names
                ]
                # MetropolisMLDA sampler in base level (level=0), targeting self.next_model
                self.next_step_method = pm.MetropolisMLDA(
                    vars=vars_next,
                    proposal_dist=self.base_proposal_dist,
                    S=self.base_S,
                    scaling=self.base_scaling,
                    tune=self.tune,
                    tune_interval=self.base_tune_interval,
                    model=None,
                    blocked=self.base_blocked,
                )
        else:
            # drop the last coarse model
            next_coarse_models = self.coarse_models[:-1]
            next_subsampling_rates = self.subsampling_rates[:-1]
            with self.next_model:
                # make sure the correct variables are selected from next_model
                vars_next = [
                    var for var in self.next_model.vars
                    if var.name in self.var_names
                ]
                # MLDA sampler in some intermediate level, targeting self.next_model
                self.next_step_method = pm.MLDA(
                    vars=vars_next,
                    base_S=self.base_S,
                    base_proposal_dist=self.base_proposal_dist,
                    base_scaling=self.base_scaling,
                    tune=self.tune,
                    base_tune_interval=self.base_tune_interval,
                    model=None,
                    mode=self.mode,
                    subsampling_rates=next_subsampling_rates,
                    coarse_models=next_coarse_models,
                    base_blocked=self.base_blocked,
                    **kwargs,
                )

        # instantiate the recursive DA proposal.
        # this is the main proposal used for
        # all levels (Recursive Delayed Acceptance)
        # (except for level 0 where the step method is MetropolisMLDA and not MLDA)
        self.proposal_dist = RecursiveDAProposal(
            self.next_step_method,
            self.next_model,
            self.tune,
            self.subsampling_rates[-1],
        )
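
A hedged sketch of how the step method above is driven, following the usual PyMC3 MLDA pattern: define one or more coarse models whose free variables share names with the fine model, then hand them to `pm.MLDA`. The toy models here are invented.

import numpy as np
import pymc3 as pm

y = np.random.randn(200) + 0.5

with pm.Model() as coarse_model:          # e.g. built on thinned data
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=y[::10])

with pm.Model() as model:                 # fine model that MLDA targets
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=y)

    step = pm.MLDA(coarse_models=[coarse_model], subsampling_rates=5)
    trace = pm.sample(500, step=step, chains=2)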
Exemple #53
0
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, mode=None,
         tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements the meanfield ADVI, where the variational
    posterior distribution is assumed to be spherical Gaussian without
    correlation of parameters and fit to the true posterior distribution.
    The means and standard deviations of the variational posterior are referred
    to as variational parameters.

    The return value of this function is an :code:`ADVIfit` object, which has
    variational parameters. If you want to draw samples from the variational
    posterior, you need to pass the :code:`ADVIfit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which
    is a function that returns a dictionary of parameter updates suitable for
    a Theano function. If no optimizer is provided, optimization is performed
    with a modified version of Adagrad, where only the last n_window gradient
    vectors are used to control the learning rate and older gradient vectors
    are ignored. n_window denotes the size of the time window and is fixed
    to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If true, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when optimizer is given.
    tol_obj : float
        Relative tolerance for testing convergence of ELBO.
    eval_elbo : int
        Interval for checking convergence of the ELBO. Convergence is checked
        at every multiple of eval_elbo iterations.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.
    mode : string or `Mode` instance.
        Compilation mode passed to Theano functions
    progressbar : bool
        Whether or not to display a progress bar in the command line. The
        bar shows the percentage of completion, the sampling speed in
        samples per second (SPS), the estimated remaining time until
        completion ("expected time of arrival"; ETA), and the current ELBO.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    'means' contains the variational means, 'stds' the variational standard
    deviations, and 'elbo_vals' the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
    """
    import warnings
    warnings.warn('Old ADVI interface and sample_vp is deprecated and will '
                  'be removed in future, use pm.fit and pm.sample_approx instead',
                  DeprecationWarning, stacklevel=2)
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if len(vars) == 0:
        raise ValueError('No free random variables to fit.')

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
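    # Pack the variational parameters into a single flat vector: the first
    # half holds the variational means (u), the second half the log standard
    # deviations (w), which are exponentiated again when unpacked at the end.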
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode)

    # For tracking convergence of ELBO
    window_size = int(max(0.1 * n // eval_elbo, 2.0))
    circ_buff = deque([], maxlen=window_size)

    # Optimization loop
    elbos = np.empty(n)
    divergence_flag = False
    progress = trange(n) if progressbar else range(n)
    try:
        uw_i, elbo_current = f()
        if np.isnan(elbo_current):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        for i in progress:
            uw_i, e = f()
            if np.isnan(e):
                raise FloatingPointError('NaN occurred in ADVI optimization.')
            elbos[i] = e

            if progressbar:
                if n < 10:
                    progress.set_description('ELBO = {:,.5g}'.format(elbos[i]))
                elif i % (n // 10) == 0 and i > 0:
                    avg_elbo = infmean(elbos[i - n // 10:i])
                    progress.set_description(
                        'Average ELBO = {:,.5g}'.format(avg_elbo))

            if i % eval_elbo == 0:
                elbo_prev = elbo_current
                elbo_current = elbos[i]
                delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev)
                circ_buff.append(delta_elbo)
                avg_delta = np.mean(circ_buff)
                med_delta = np.median(circ_buff)

                if i > 0 and avg_delta < tol_obj:
                    pm._log.info('Mean ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                elif i > 0 and med_delta < tol_obj:
                    pm._log.info('Median ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                if i > 10 * eval_elbo:
                    if med_delta > 0.5 or avg_delta > 0.5:
                        divergence_flag = True
                    else:
                        divergence_flag = False

    except KeyboardInterrupt:
        elbos = elbos[:i]
        if n < 10:
            pm._log.info('Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format(
                i, 100 * i // n, elbos[i - 1]))
        else:
            avg_elbo = infmean(elbos[i - n // 10:i])
            pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format(
                i, 100 * i // n, avg_elbo))
    else:
        if n < 10:
            pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1]))
        else:
            avg_elbo = infmean(elbos[-n // 10:])
            pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))
    finally:
        if progressbar:
            progress.close()

    if divergence_flag:
        pm._log.info('Evidence of divergence detected, inspect ELBO.')

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
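A minimal usage sketch of this deprecated interface (hedged: it assumes the old PyMC3 API in which the fitted ADVIFit object is passed to pymc3.variational.sample_vp(), as the docstring describes; the model and data are illustrative):

import numpy as np
import pymc3 as pm

data = np.random.randn(100)
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    sd = pm.HalfNormal('sd', sd=10.)
    pm.Normal('obs', mu=mu, sd=sd, observed=data)

    # fit the mean-field approximation, then draw from the variational posterior
    v_params = advi(n=10000)
    trace = pm.variational.sample_vp(v_params, draws=1000)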
Exemple #55
0
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements mean-field ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian without
    correlation between parameters and is fitted to the true posterior
    distribution.
    The means and standard deviations of the variational posterior are referred
    to as variational parameters.

    The return value of this function is an :code:`ADVIfit` object, which has
    variational parameters. If you want to draw samples from the variational
    posterior, you need to pass the :code:`ADVIfit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which
    is a function that returns a dictionary of parameter updates suitable for
    a Theano function. If no optimizer is provided, optimization is performed
    with a modified version of Adagrad, where only the last n_window gradient
    vectors are used to control the learning rate and older gradient vectors
    are ignored. n_window denotes the size of the time window and is fixed
    to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If true, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate: float
        Base learning rate for adagrad. This parameter is ignored when
        optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
        This parameter is ignored when optimizer is given.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    'means' contains the variational means, 'stds' the variational standard
    deviations, and 'elbo_vals' the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if not pm.model.all_continuous(vars):
        raise ValueError('Model should not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    try:
        progress = trange(n)
        for i in progress:
            uw_i, e = f()
            elbos[i] = e
            if i % (n // 10) == 0 and i > 0:
                avg_elbo = elbos[i - n // 10:i].mean()
                progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo))
    except KeyboardInterrupt:
        elbos = elbos[:i]
        avg_elbo = elbos[i - n // 10:].mean()
        pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format(
            i, 100 * i // n, avg_elbo))
    else:
        avg_elbo = elbos[-n // 10:].mean()
        pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
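For reference, a sketch of a custom optimizer matching the callable interface used above, i.e. optimizer(loss=..., param=[...]) returning a dictionary of Theano updates (a plain gradient-descent rule shown purely as an illustration; it is not part of the original code):

from collections import OrderedDict

import theano


def sgd_optimizer(learning_rate=0.01):
    """Return a callable with the (loss, param) -> updates interface
    expected by advi(); a plain SGD rule for illustration only."""
    def optimizer(loss, param):
        updates = OrderedDict()
        for p in param:
            grad = theano.grad(loss, wrt=p)        # gradient of -ELBO w.r.t. p
            updates[p] = p - learning_rate * grad  # SGD update
        return updates
    return optimizer


# e.g. advi(n=10000, optimizer=sgd_optimizer(learning_rate=0.005))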
Exemple #56
0
def ATMIP_sample(n_steps, step=None, start=None, trace=None, chain=0,
                  stage=None, njobs=1, tune=None, progressbar=False,
                  model=None, random_seed=None):
    """
    (C)ATMIP sampling algorithm from Minson et al. 2013:
    Bayesian inversion for finite fault earthquake source models I-
        Theory and algorithm
    (without cascading- C)
    https://gji.oxfordjournals.org/content/194/3/1701.full
    Samples the solution space with n_chains of Metropolis chains, where each
    chain has n_steps iterations. Once finished, the sampled traces are
    evaluated:
    (1) Based on the likelihoods of the final samples, chains are weighted
    (2) the weighted covariance of the ensemble is calculated and set as new
        proposal distribution
    (3) the variation in the ensemble is calculated and the next tempering
        parameter (beta) calculated
    (4) New n_chains Metropolis chains are seeded on the traces with high
        weight for n_steps iterations
    (5) Repeat until beta > 1.

    Parameters
    ----------

    n_steps : int
        The number of samples to draw for each Markov-chain per stage
    step : function from TMCMC initialisation
    start : List of dicts with length(n_chains)
        Starting points in parameter space (or partial point)
        Defaults to random draws from variables (defaults to empty dict)
    trace : str
        Path to the result folder in which the stage traces are stored as
        text backends. The folder is created if it does not exist.
    chain : int
        Chain number used to store sample in backend. If `njobs` is
        greater than one, chain numbers will start here.
    stage : int
        Stage where to start or continue the calculation. If None the start
        will be at stage = 0.
    njobs : int
        The number of cores to be used in parallel. Be aware that Theano has
        internal parallelisation, which is sometimes more efficient,
        especially for simple models.
        step.n_chains / njobs has to be an integer!
    tune : int
        Number of iterations to tune, if applicable (defaults to None)
    progressbar : bool
        Flag for progress bar
    model : Model (optional if in `with` context)
        Has to contain a deterministic variable named as defined by
        `step.likelihood_name` that holds the model likelihood.
    random_seed : int or list of ints
        A list is accepted if `njobs` is greater than one.

    Returns
    -------
    MultiTrace object with access to sampling values
    """

    model = pm.modelcontext(model)
    seed(random_seed)

    if n_steps < 1:
        raise ValueError('Argument `n_steps` should be above 0.')

    if step is None:
        raise Exception('Argument `step` has to be a TMCMC step object.')

    # only touch the step object after checking that it was actually given
    step.n_steps = int(n_steps)

    if trace is None:
        raise Exception('Argument `trace` should be a path to a result '
                        'folder for storing the stages.')

    if start is not None:
        if len(start) != step.n_chains:
            raise Exception('Argument `start` should contain a number of '
                            'dicts equal to the number of chains '
                            '(step.n_chains).')
        else:
            step.population = start

    if stage is not None:
        step.stage = stage

    if not any(
            step.likelihood_name in var.name for var in model.deterministics):
        raise Exception('Model (deterministic) variables need to contain '
                        'a variable `' + step.likelihood_name + '` as '
                        'defined in `step`.')

    if progressbar:
        verbosity = 5
    else:
        verbosity = 0

    homepath = trace

    if not os.path.exists(homepath):
        os.mkdir(homepath)

    with model:
        with Parallel(n_jobs=njobs, verbose=verbosity) as parallel:
            while step.beta < 1.:
                print('Beta: ' + str(step.beta), ' Stage: ' + str(step.stage))
                if step.stage == 0:
                    # Initial stage
                    print('Sample initial stage: ...')
                    stage_path = homepath + '/stage_' + str(step.stage)
                    trace = pm.backends.Text(stage_path, model=model)
                    initial = _iter_initial(step, chain=chain, trace=trace)
                    progress = pm.progressbar.progress_bar(step.n_chains)
                    try:
                        for i, strace in enumerate(initial):
                            if progressbar:
                                progress.update(i)
                    except KeyboardInterrupt:
                        strace.close()
                    mtrace = pm.backends.base.MultiTrace([strace])
                    step.population, step.array_population, step.likelihoods = \
                                            step.select_end_points(mtrace)
                    step.beta, step.old_beta, step.weights = step.calc_beta()
                    step.covariance = step.calc_covariance()
                    step.res_indx = step.resample()
                    step.stage += 1
                    del strace, mtrace, trace
                else:
                    if progressbar and njobs > 1:
                        progressbar = False
                    # Metropolis sampling intermediate stages
                    stage_path = homepath + '/stage_' + str(step.stage)
                    step.proposal_dist = MvNPd(step.covariance)

                    sample_args = {
                            'draws': n_steps,
                            'step': step,
                            'stage_path': stage_path,
                            'progressbar': progressbar,
                            'model': model}
                    mtrace = _iter_parallel_chains(parallel, **sample_args)

                    step.population, step.array_population, step.likelihoods = \
                                            step.select_end_points(mtrace)
                    step.beta, step.old_beta, step.weights = step.calc_beta()
                    step.stage += 1

                    if step.beta > 1.:
                        print('Beta > 1.: ' + str(step.beta))
                        step.beta = 1.
                        break

                    step.covariance = step.calc_covariance()
                    step.res_indx = step.resample()

            # Metropolis sampling final stage
            print('Sample final stage')
            stage_path = homepath + '/stage_final'
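            # importance weights for the final tempering step (old_beta -> 1):
            # proportional to exp((1 - old_beta) * log-likelihood), evaluated
            # in max-shifted form to avoid overflow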
            temp = np.exp((1 - step.old_beta) *
                          (step.likelihoods - step.likelihoods.max()))
            step.weights = temp / np.sum(temp)
            step.covariance = step.calc_covariance()
            step.proposal_dist = MvNPd(step.covariance)
            step.res_indx = step.resample()

            # rebuild the arguments so the final stage also runs when the
            # intermediate-stage branch above was never entered
            sample_args = {'draws': n_steps, 'step': step,
                           'stage_path': stage_path,
                           'progressbar': progressbar, 'model': model}
            mtrace = _iter_parallel_chains(parallel, **sample_args)
            return mtrace
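The final-stage reweighting generalises to any tempering increment: for chain log-likelihoods and a step from beta_old to beta_new, the importance weights are proportional to exp((beta_new - beta_old) * log-likelihood). A small numpy sketch of that computation (illustrative only; it mirrors the final-stage code above, not the internals of step.calc_beta):

import numpy as np


def tempering_weights(log_likelihoods, beta_old, beta_new):
    """Normalised importance weights for a tempering step beta_old -> beta_new.

    Mirrors the final-stage computation above (where beta_new = 1); the
    max-shift only guards against overflow and cancels in the normalisation.
    """
    llk = np.asarray(log_likelihoods, dtype=float)
    temp = np.exp((beta_new - beta_old) * (llk - llk.max()))
    return temp / temp.sum()


# e.g. weights used to resample chain end points for the next stage
w = tempering_weights([-120.3, -118.7, -119.5], beta_old=0.4, beta_new=0.6)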