def test_nest_context_works(self):
    with pm.Model() as m:
        new = NewModel()
        with new:
            assert pm.modelcontext(None) is new
        assert pm.modelcontext(None) is m
    assert 'v1' in m.named_vars
    assert 'v2' in m.named_vars
def __init__(self, vars=None, S=None, proposal_dist=NormalProposal, scaling=1.,
             tune=True, tune_interval=100, model=None, **kwargs):

    model = pm.modelcontext(model)

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if S is None:
        S = np.ones(sum(v.dsize for v in vars))
    self.proposal_dist = proposal_dist(S)
    self.scaling = np.atleast_1d(scaling)
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    # Determine type of variables
    self.discrete = np.concatenate(
        [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
    self.any_discrete = self.discrete.any()
    self.all_discrete = self.discrete.all()

    shared = pm.make_shared_replacements(vars, model)
    self.delta_logp = delta_logp(model.logpt, vars, shared)
    super(Metropolis, self).__init__(vars, shared)
def get_citations_for_model(model=None, width=79):
    """Get the citations for the components used in an exoplanet PyMC3 model.

    Returns:
        The acknowledgement text for exoplanet and its dependencies and a
        string containing the BibTeX entries for the citations in the
        acknowledgement.

    """
    model = pm.modelcontext(model)
    if not hasattr(model, "__citations__"):
        logging.warning("no citations registered with model")
        return "", ""

    cite = list(CITATIONS["exoplanet"][0]) + \
        list(CITATIONS["pymc3"][0]) + \
        list(CITATIONS["theano"][0])
    bib = [CITATIONS["exoplanet"][1], CITATIONS["pymc3"][1],
           CITATIONS["theano"][1]]
    for k, v in model.__citations__.items():
        cite += list(v[0])
        bib.append(v[1])

    txt = (r"This research made use of \textsf{{exoplanet}} "
           r"\citep{{exoplanet}} and its dependencies \citep{{{0}}}.")
    txt = txt.format(", ".join(sorted(cite)))
    txt = textwrap.wrap(txt, width=width)

    return "\n".join(txt), "\n".join(bib)
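# Hedged usage sketch for get_citations_for_model: assumes an exoplanet-style
# model built inside a model context. With no citations registered (as in this
# bare illustrative model), the function logs a warning and returns empty
# strings.
import pymc3 as pm

with pm.Model() as model:
    txt, bib = get_citations_for_model(model=model)
print(txt)
print(bib)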
def init_nuts(init='advi', n_init=500000, model=None, **kwargs):
    """Initialize and sample from posterior of a continuous model.

    This is a convenience function. NUTS convergence and sampling speed is
    extremely dependent on the choice of mass/scaling matrix. In our
    experience, using ADVI to estimate a diagonal covariance matrix and using
    this as the scaling matrix produces robust results over a wide class of
    continuous models.

    Parameters
    ----------
    init : str {'advi', 'advi_map', 'map', 'nuts'}
        Initialization method to use.
        * advi : Run ADVI to estimate posterior mean and diagonal covariance
          matrix.
        * advi_map: Initialize ADVI with MAP and use MAP as starting point.
        * map : Use the MAP as starting point.
        * nuts : Run NUTS and estimate posterior mean and covariance matrix.
    n_init : int
        Number of iterations of initializer.
        If 'advi', number of iterations; if 'nuts', number of draws.
    model : Model (optional if in `with` context)
    **kwargs : keyword arguments
        Extra keyword arguments are forwarded to pymc3.NUTS.

    Returns
    -------
    start : pymc3.model.Point
        Starting point for sampler
    nuts_sampler : pymc3.step_methods.NUTS
        Instantiated and initialized NUTS sampler object
    """
    model = pm.modelcontext(model)
    pm._log.info('Initializing NUTS using {}...'.format(init))

    if init == 'advi':
        v_params = pm.variational.advi(n=n_init)
        start = pm.variational.sample_vp(v_params, 1, progressbar=False)[0]
        cov = np.power(model.dict_to_array(v_params.stds), 2)
    elif init == 'advi_map':
        start = pm.find_MAP()
        v_params = pm.variational.advi(n=n_init, start=start)
        cov = np.power(model.dict_to_array(v_params.stds), 2)
    elif init == 'map':
        start = pm.find_MAP()
        cov = pm.find_hessian(point=start)
    elif init == 'nuts':
        init_trace = pm.sample(step=pm.NUTS(), draws=n_init)
        cov = pm.trace_cov(init_trace[n_init // 2:])
        start = {varname: np.mean(init_trace[varname])
                 for varname in init_trace.varnames}
    else:
        raise NotImplementedError('Initializer {} is not supported.'.format(init))

    step = pm.NUTS(scaling=cov, is_cov=True, **kwargs)
    return start, step
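# Hedged usage sketch for init_nuts: initialize with ADVI, then hand the tuned
# step and start point to pm.sample. The model and variable names are
# illustrative, and the older PyMC3 API shown above is assumed.
import pymc3 as pm

with pm.Model() as model:
    x = pm.Normal('x', mu=0., sd=1.)
    start, step = init_nuts(init='advi', n_init=20000)
    trace = pm.sample(1000, step=step, start=start)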
def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None,
             scaling=0.001, tune=True, tune_interval=100, model=None,
             mode=None, **kwargs):
    warnings.warn('Population based sampling methods such as DEMetropolis are experimental.'
                  ' Use carefully and be extra critical about their results!')

    model = pm.modelcontext(model)

    if vars is None:
        vars = model.cont_vars
    vars = pm.inputvars(vars)

    if S is None:
        S = np.ones(sum(v.dsize for v in vars))

    if proposal_dist is not None:
        self.proposal_dist = proposal_dist(S)
    else:
        self.proposal_dist = UniformProposal(S)

    self.scaling = np.atleast_1d(scaling).astype('d')
    if lamb is None:
        lamb = 2.38 / np.sqrt(2 * S.size)
    self.lamb = float(lamb)
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    self.mode = mode

    shared = pm.make_shared_replacements(vars, model)
    self.delta_logp = delta_logp(model.logpt, vars, shared)
    super().__init__(vars, shared)
def fit(n=10000, local_rv=None, method='advi', model=None, **kwargs):
    """Handy shortcut for using inference methods in a functional way.

    Parameters
    ----------
    n : int
        number of iterations
    local_rv : dict
        mapping {model_variable -> local_variable}
        Local variables are used for Autoencoding Variational Bayes
        (AEVB; Kingma and Welling, 2014); see that paper for details.
    method : str or Inference
        string name is case insensitive in
        {'advi', 'fullrank_advi', 'advi->fullrank_advi'}
    model : Model
    kwargs : kwargs for Inference.fit
        frac : float
            if method is 'advi->fullrank_advi', the fraction of `n` spent on
            the ADVI stage of training

    Returns
    -------
    Approximation
    """
    if model is None:
        model = pm.modelcontext(model)
    _select = dict(
        advi=ADVI,
        fullrank_advi=FullRankADVI,
        svgd=SVGD
    )
    if isinstance(method, str) and method.lower() == 'advi->fullrank_advi':
        frac = kwargs.pop('frac', .5)
        if not 0. < frac < 1.:
            raise ValueError('frac should be in (0, 1)')
        n1 = int(n * frac)
        n2 = n - n1
        inference = ADVI(local_rv=local_rv, model=model)
        logger.info('fitting advi ...')
        inference.fit(n1, **kwargs)
        inference = FullRankADVI.from_advi(inference)
        logger.info('fitting fullrank advi ...')
        return inference.fit(n2, **kwargs)
    elif isinstance(method, str):
        try:
            inference = _select[method.lower()](
                local_rv=local_rv, model=model
            )
        except KeyError:
            raise KeyError('method should be one of %s '
                           'or Inference instance' % set(_select.keys()))
    elif isinstance(method, Inference):
        inference = method
    else:
        raise TypeError('method should be one of %s '
                        'or Inference instance' % set(_select.keys()))
    return inference.fit(n, **kwargs)
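# Hedged usage sketch for fit: a minimal model, mean-field ADVI, then samples
# drawn from the fitted approximation. The model and data are illustrative.
import numpy as np
import pymc3 as pm

data = np.random.randn(100)
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=1.)
    pm.Normal('obs', mu=mu, sd=1., observed=data)
    approx = fit(n=10000, method='advi')
trace = approx.sample(1000)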
def add_citations_to_model(citations, model=None):
    try:
        model = pm.modelcontext(model)
        if not hasattr(model, "__citations__"):
            model.__citations__ = dict()
        for k in citations:
            model.__citations__[k] = CITATIONS[k]

    except TypeError:
        pass
def __init__(self, name='', model=None):
    super().__init__(name, model)
    assert pm.modelcontext(None) is self
    # 1) init variables with the Var method
    self.Var('v1', pm.Normal.dist())
    self.v2 = pm.Normal('v2', mu=0, sigma=1)
    # 2) Potentials and Deterministic variables work with this method too;
    #    make sure their names will not overlap with those of other models
    pm.Deterministic('d', tt.constant(1))
    pm.Potential('p', tt.constant(1))
def run_ppc(trace, samples=100, model=None):
    """Generate Posterior Predictive samples from a model given a trace."""
    if model is None:
        model = pm.modelcontext(model)

    ppc = defaultdict(list)
    for idx in np.random.randint(0, len(trace), samples):
        param = trace[idx]
        for obs in model.observed_RVs:
            ppc[obs.name].append(round(obs.distribution.random(point=param)))

    return ppc
def model_to_graphviz(model=None):
    """Produce a graphviz Digraph from a PyMC3 model.

    Requires graphviz, which may be installed most easily with
        conda install -c conda-forge python-graphviz

    Alternatively, you may install the `graphviz` binaries yourself, and then
    `pip install graphviz` to get the python bindings.  See
    http://graphviz.readthedocs.io/en/stable/manual.html for more information.
    """
    model = pm.modelcontext(model)
    return ModelGraph(model).make_graph()
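# Hedged usage sketch for model_to_graphviz (assumes a PyMC3 version that
# exposes pm.model_to_graphviz and the standard graphviz.Digraph API; the model
# is illustrative only).
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=1.)
    pm.Normal('obs', mu=mu, sd=1., observed=[0.1, -0.3, 0.7])

graph = pm.model_to_graphviz(model)
graph.render('model_graph', format='png', cleanup=True)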
def __init__(self, vars=None, covariance=None, scaling=1., n_chains=100,
             tune=True, tune_interval=100, model=None, check_bound=True,
             likelihood_name='like', proposal_dist=MvNPd,
             coef_variation=1., **kwargs):

    model = pm.modelcontext(model)

    if vars is None:
        vars = model.vars

    vars = pm.inputvars(vars)

    if covariance is None:
        self.covariance = np.eye(sum(v.dsize for v in vars))

    self.scaling = np.atleast_1d(scaling)
    self.tune = tune
    self.check_bnd = check_bound
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval

    self.proposal_dist = proposal_dist(self.covariance)
    self.proposal_samples_array = self.proposal_dist(n_chains)

    self.stage_sample = 0
    self.accepted = 0

    self.beta = 0
    self.stage = 0
    self.coef_variation = coef_variation
    self.n_chains = n_chains
    self.likelihoods = []
    self.likelihood_name = likelihood_name

    self.discrete = np.concatenate(
        [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
    self.any_discrete = self.discrete.any()
    self.all_discrete = self.discrete.all()

    # create initial population
    self.population = []
    self.array_population = np.zeros(n_chains)
    for i in range(self.n_chains):
        dummy = pm.Point({v.name: v.random() for v in vars}, model=model)
        self.population.append(dummy)

    shared = make_shared_replacements(vars, model)
    self.logp_forw = logp_forw(model.logpt, vars, shared)
    self.check_bnd = logp_forw(model.varlogpt, vars, shared)
    self.delta_logp = pm.metropolis.delta_logp(model.logpt, vars, shared)

    super(ATMCMC, self).__init__(vars, shared)
def get_step_for_trace(self, trace=None, model=None,
                       regular_window=0, regular_variance=1e-3,
                       **kwargs):
    """Get a PyMC3 NUTS step tuned for a given burn-in trace

    Args:
        trace: The ``MultiTrace`` output from a previous run of
            ``pymc3.sample``.
        regular_window: The weight (in units of number of steps) to use
            when regularizing the mass matrix estimate.
        regular_variance: The amplitude of the regularization for the mass
            matrix. This will be added to the diagonal of the covariance
            matrix with weight given by ``regular_window``.

    """
    model = pm.modelcontext(model)

    # If not given, use the trivial metric
    if trace is None or model.ndim == 1:
        potential = quad.QuadPotentialDiag(np.ones(model.ndim))

    else:
        # Loop over samples and convert to the relevant parameter space;
        # I'm sure that there's an easier way to do this, but I don't know
        # how to make something work in general...
        N = len(trace) * trace.nchains
        samples = np.empty((N, model.ndim))
        i = 0
        for chain in trace._straces.values():
            for p in chain:
                samples[i] = model.bijection.map(p)
                i += 1

        if self.dense:
            # Compute the regularized sample covariance
            cov = np.cov(samples, rowvar=0)
            if regular_window > 0:
                cov = cov * N / (N + regular_window)
                cov[np.diag_indices_from(cov)] += \
                    regular_variance * regular_window / (N + regular_window)
            potential = quad.QuadPotentialFull(cov)
        else:
            var = np.var(samples, axis=0)
            if regular_window > 0:
                var = var * N / (N + regular_window)
                var += \
                    regular_variance * regular_window / (N + regular_window)
            potential = quad.QuadPotentialDiag(var)

    return pm.NUTS(potential=potential, **kwargs)
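# Hedged usage sketch for get_step_for_trace: it is a method of a tuning
# helper class (not shown here), so `schedule` below is a hypothetical
# instance of that class providing the `dense` attribute. The pattern is a
# short burn-in, a mass matrix estimated from it, then sampling with the
# tuned step. The model is illustrative only.
import pymc3 as pm

with pm.Model() as model:
    x = pm.Normal('x', mu=0., sd=1., shape=3)
    burnin = pm.sample(tune=1000, draws=500)
    step = schedule.get_step_for_trace(burnin, regular_window=5)
    trace = pm.sample(draws=2000, tune=0, step=step)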
def tune(self, tune=1000, start=None, step_kwargs=None, **kwargs):
    """Run the full tuning run for the mass matrix

    This will run ``start`` steps of warmup followed by tuning chains of
    exponentially increasing length to tune the mass matrix.

    Args:
        tune (int): The total number of steps to run.

    """
    model = pm.modelcontext(kwargs.get("model", None))

    ntot = self.start + self.window + self.finish
    if tune < ntot:
        raise ValueError("'tune' must be at least {0} "
                         "(start + window + finish)".format(ntot))

    self.count = 0
    self.warmup(start=start, step_kwargs=step_kwargs, **kwargs)

    steps = self.window
    trace = None
    while self.count < tune:
        trace = self.extend_tune(start=start, step_kwargs=step_kwargs,
                                 steps=steps, trace=trace, **kwargs)
        steps *= 2
        if self.count + steps + steps * 2 > tune:
            steps = tune - self.count

    # Final tuning stage for step size
    self.extend_tune(start=start, step_kwargs=step_kwargs,
                     steps=self.finish, trace=trace, **kwargs)

    # Copy across the step size from the parallel runs
    self._current_step.stop_tuning()
    expected = []
    for chain in self._current_trace._straces.values():
        expected.append(chain.get_sampler_stats("step_size")[-1])

    step = self._current_step
    if step_kwargs is None:
        step_kwargs = dict()
    else:
        step_kwargs = dict(step_kwargs)
    step_kwargs["model"] = model
    step_kwargs["step_scale"] = np.mean(expected) * model.ndim ** 0.25
    step_kwargs["adapt_step_size"] = False
    step_kwargs["potential"] = step.potential
    self._current_step = pm.NUTS(**step_kwargs)

    return self._current_trace
def __init__(self, vars, scaling=1., tune=True, tune_interval=100, model=None):

    model = pm.modelcontext(model)

    self.scaling = scaling
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    if not all([v.dtype in pm.discrete_types for v in vars]):
        raise ValueError(
            'All variables must be Bernoulli for BinaryMetropolis')

    super(BinaryMetropolis, self).__init__(vars, [model.fastlogp])
def _get_priors(self, model=None):
    """Return prior distributions of the likelihood.

    Returns
    -------
    dict : mapping name -> pymc3 distribution
    """
    model = pymc3.modelcontext(model)
    priors = {}
    for key, val in self.priors.items():
        if isinstance(val, numbers.Number):
            priors[key] = val
        else:
            priors[key] = model.Var(val[0], val[1])

    return priors
def __init__(self, vars, order='random', model=None):

    model = pm.modelcontext(model)

    self.dim = sum(v.dsize for v in vars)

    if order == 'random':
        self.shuffle_dims = True
        self.order = list(range(self.dim))
    else:
        if sorted(order) != list(range(self.dim)):
            raise ValueError('Argument \'order\' has to be a permutation')
        self.shuffle_dims = False
        self.order = order

    if not all([v.dtype in pm.discrete_types for v in vars]):
        raise ValueError(
            'All variables must be binary for BinaryGibbsMetropolis')

    super(BinaryGibbsMetropolis, self).__init__(vars, [model.fastlogp])
def __init__(self, vars, proposal='uniform', order='random', model=None):

    model = pm.modelcontext(model)
    vars = pm.inputvars(vars)

    dimcats = []
    # The above variable is a list of pairs (aggregate dimension, number
    # of categories). For example, if vars = [x, y] with x being a 2-D
    # variable with M categories and y being a 3-D variable with N
    # categories, we will have dimcats = [(0, M), (1, M), (2, N), (3, N), (4, N)].
    for v in vars:
        distr = getattr(v.distribution, 'parent_dist', v.distribution)
        if isinstance(distr, pm.Categorical):
            k = draw_values([distr.k])[0]
        elif isinstance(distr, pm.Bernoulli) or (v.dtype in pm.bool_types):
            k = 2
        else:
            raise ValueError('All variables must be categorical or binary '
                             'for CategoricalGibbsMetropolis')
        start = len(dimcats)
        dimcats += [(dim, k) for dim in range(start, start + v.dsize)]

    if order == 'random':
        self.shuffle_dims = True
        self.dimcats = dimcats
    else:
        if sorted(order) != list(range(len(dimcats))):
            raise ValueError('Argument \'order\' has to be a permutation')
        self.shuffle_dims = False
        self.dimcats = [dimcats[j] for j in order]

    if proposal == 'uniform':
        self.astep = self.astep_unif
    elif proposal == 'proportional':
        # Use the optimized "Metropolized Gibbs Sampler" described in Liu96.
        self.astep = self.astep_prop
    else:
        raise ValueError('Argument \'proposal\' should either be '
                         '\'uniform\' or \'proportional\'')

    super(CategoricalGibbsMetropolis, self).__init__(vars, [model.fastlogp])
def __init__(self, vars=None, S=None, proposal_dist=None, scaling=1.,
             tune=True, tune_interval=100, model=None, mode=None, **kwargs):

    model = pm.modelcontext(model)

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if S is None:
        S = np.ones(sum(v.dsize for v in vars))

    if proposal_dist is not None:
        self.proposal_dist = proposal_dist(S)
    elif S.ndim == 1:
        self.proposal_dist = NormalProposal(S)
    elif S.ndim == 2:
        self.proposal_dist = MultivariateNormalProposal(S)
    else:
        raise ValueError("Invalid rank for variance: %s" % S.ndim)

    self.scaling = np.atleast_1d(scaling).astype('d')
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    # Determine type of variables
    self.discrete = np.concatenate(
        [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
    self.any_discrete = self.discrete.any()
    self.all_discrete = self.discrete.all()

    self.mode = mode

    shared = pm.make_shared_replacements(vars, model)
    self.delta_logp = delta_logp(model.logpt, vars, shared)
    super().__init__(vars, shared)
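# Hedged usage sketch for the Metropolis step method: constructing the step
# inside the model context lets pm.modelcontext(None) resolve the model, just
# as in the __init__ above. The model is illustrative only.
import pymc3 as pm

with pm.Model() as model:
    x = pm.Normal('x', mu=0., sd=1.)
    step = pm.Metropolis()
    trace = pm.sample(5000, step=step)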
def sample_numpyro_nuts(
    draws=1000,
    tune=1000,
    chains=4,
    target_accept=0.8,
    random_seed=10,
    model=None,
    progress_bar=True,
    keep_untransformed=False,
):
    model = modelcontext(model)

    seed = jax.random.PRNGKey(random_seed)

    rv_names = [rv.name for rv in model.value_vars]
    init_state = [model.initial_point[rv_name] for rv_name in rv_names]
    init_state_batched = jax.tree_map(
        lambda x: np.repeat(x[None, ...], chains, axis=0), init_state)
    init_state_batched_at = [at.as_tensor(v) for v in init_state_batched]

    nuts_inputs = sorted(
        [v for v in graph_inputs([model.logpt]) if not isinstance(v, Constant)],
        key=lambda x: isinstance(x, SharedVariable),
    )
    map_seed = jax.random.split(seed, chains)

    numpyro_samples = NumPyroNUTS(
        nuts_inputs,
        [model.logpt],
        target_accept=target_accept,
        draws=draws,
        tune=tune,
        chains=chains,
        seed=map_seed,
        progress_bar=progress_bar,
    )(*init_state_batched_at)

    # Un-transform the transformed variables in JAX
    sample_outputs = []
    for i, (value_var, rv_samples) in enumerate(
            zip(model.value_vars, numpyro_samples[:-1])):
        rv = model.values_to_rvs[value_var]
        transform = getattr(value_var.tag, "transform", None)
        if transform is not None:
            untrans_value_var = transform.backward(rv, rv_samples)
            untrans_value_var.name = rv.name
            sample_outputs.append(untrans_value_var)

            if keep_untransformed:
                rv_samples.name = value_var.name
                sample_outputs.append(rv_samples)
        else:
            rv_samples.name = rv.name
            sample_outputs.append(rv_samples)

    print("Compiling...")

    tic1 = pd.Timestamp.now()
    _sample = compile_rv_inplace(
        [],
        sample_outputs + [numpyro_samples[-1]],
        allow_input_downcast=True,
        on_unused_input="ignore",
        accept_inplace=True,
        mode="JAX",
    )
    tic2 = pd.Timestamp.now()

    print("Compilation time = ", tic2 - tic1)

    print("Sampling...")

    *mcmc_samples, leapfrogs_taken = _sample()
    tic3 = pd.Timestamp.now()

    print("Sampling time = ", tic3 - tic2)

    posterior = {k.name: v for k, v in zip(sample_outputs, mcmc_samples)}

    az_trace = az.from_dict(posterior=posterior)

    return az_trace
def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, random_seed=-1, progressbar=True, **kwargs): """Initialize and sample from posterior of a continuous model. This is a convenience function. NUTS convergence and sampling speed is extremely dependent on the choice of mass/scaling matrix. In our experience, using ADVI to estimate a diagonal covariance matrix and using this as the scaling matrix produces robust results over a wide class of continuous models. Parameters ---------- init : str {'ADVI', 'ADVI_MAP', 'MAP', 'NUTS'} Initialization method to use. * ADVI : Run ADVI to estimate posterior mean and diagonal covariance matrix. * ADVI_MAP: Initialize ADVI with MAP and use MAP as starting point. * MAP : Use the MAP as starting point. * NUTS : Run NUTS and estimate posterior mean and covariance matrix. njobs : int Number of parallel jobs to start. n_init : int Number of iterations of initializer If 'ADVI', number of iterations, if 'metropolis', number of draws. model : Model (optional if in `with` context) progressbar : bool Whether or not to display a progressbar for advi sampling. **kwargs : keyword arguments Extra keyword arguments are forwarded to pymc3.NUTS. Returns ------- start : pymc3.model.Point Starting point for sampler nuts_sampler : pymc3.step_methods.NUTS Instantiated and initialized NUTS sampler object """ model = pm.modelcontext(model) pm._log.info('Initializing NUTS using {}...'.format(init)) random_seed = int(np.atleast_1d(random_seed)[0]) if init is not None: init = init.lower() cb = [ pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff='absolute'), pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff='relative'), ] if init == 'advi': approx = pm.fit( random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window ) # type: pm.MeanField start = approx.sample(draws=njobs) stds = approx.gbij.rmap(approx.std.eval()) cov = model.dict_to_array(stds) ** 2 if njobs == 1: start = start[0] elif init == 'advi_map': start = pm.find_MAP() approx = pm.MeanField(model=model, start=start) pm.fit( random_seed=random_seed, n=n_init, method=pm.ADVI.from_mean_field(approx), callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window ) start = approx.sample(draws=njobs) stds = approx.gbij.rmap(approx.std.eval()) cov = model.dict_to_array(stds) ** 2 if njobs == 1: start = start[0] elif init == 'map': start = pm.find_MAP() cov = pm.find_hessian(point=start) elif init == 'nuts': init_trace = pm.sample(draws=n_init, step=pm.NUTS(), tune=n_init // 2, random_seed=random_seed) cov = np.atleast_1d(pm.trace_cov(init_trace)) start = np.random.choice(init_trace, njobs) if njobs == 1: start = start[0] else: raise NotImplementedError('Initializer {} is not supported.'.format(init)) step = pm.NUTS(scaling=cov, is_cov=True, **kwargs) return start, step
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1, minibatch_RVs=None, minibatch_tensors=None, minibatches=None, local_RVs=None, observed_RVs=None, encoder_params=[], total_size=None, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None): """Perform mini-batch ADVI. This function implements a mini-batch ADVI with the meanfield approximation. Autoencoding variational inference is also supported. The log probability terms for mini-batches, corresponding to RVs in minibatch_RVs, are scaled to (total_size) / (the number of samples in each mini-batch), where total_size is an argument for the total data size. minibatch_tensors is a list of tensors (can be shared variables) to which mini-batch samples are set during the optimization. In most cases, these tensors are observations for RVs in the model. local_RVs and observed_RVs are used for autoencoding variational Bayes. Both of these RVs are associated with each of given samples. The difference is that local_RVs are unkown and their posterior distributions are approximated. local_RVs are Ordered dict, whose keys and values are RVs and a tuple of two objects. The first is the theano expression of variational parameters (mean and log of std) of the approximate posterior, which are encoded from given samples by an arbitrary deterministic function, e.g., MLP. The other one is a scaling constant to be multiplied to the log probability term corresponding to the RV. observed_RVs are also Ordered dict with RVs as the keys, but whose values are only the scaling constant as in local_RVs. In this case, total_size is ignored. If local_RVs is None (thus not using autoencoder), the following two settings are equivalent: - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)]) - minibatch_RVs=[rv], total_size=total_size where minibatch_size is minibatch_tensors[0].shape[0]. The variational parameters and the parameters of the autoencoder are simultaneously optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. See the docstring of pymc3.variational.advi(). Parameters ---------- vars : object List of random variables. If None, variational posteriors (normal distribution) are fit for all RVs in the given model. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of iterations updating parameters. n_mcsamples : int Number of Monte Carlo samples to approximate ELBO. minibatch_RVs : list of ObservedRVs Random variables in the model for which mini-batch tensors are set. When this argument is given, both of arguments local_RVs and observed_RVs must be None. minibatch_tensors : list of (tensors or shared variables) Tensors used to create ObservedRVs in minibatch_RVs. minibatches : generator of list Generates a set of minibatches when calling next(). The length of the returned list must be the same with the number of random variables in `minibatch_tensors`. total_size : int Total size of training samples. This is used to appropriately scale the log likelihood terms corresponding to mini-batches in ELBO. local_RVs : Ordered dict Include encoded variational parameters and a scaling constant for the corresponding RV. See the above description. observed_RVs : Ordered dict Include a scaling constant for the corresponding RV. See the above description encoder_params : list of theano shared variables Parameters of encoder. 
optimizer : (loss, list of shared variables) -> dict or OrderedDict A function that returns parameter updates given loss and shared variables of parameters. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when an optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when an optimizer is given. random_seed : int Seed to initialize random state. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. """ theano.config.compute_test_value = 'ignore' model = pm.modelcontext(model) vars = inputvars(vars if vars is not None else model.vars) start = start if start is not None else model.test_point check_discrete_rvs(vars) _check_minibatches(minibatch_tensors, minibatches) # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # For backward compatibility in how input arguments are given local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs, minibatch_tensors, total_size) # Replace local_RVs with transformed variables ds = model.deterministics def get_transformed(v): if v in ds: return v.transformed return v local_RVs = OrderedDict([(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()]) # Get global variables global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs))) # Ordering for concatenation of random variables global_order = pm.ArrayOrdering([v for v in global_RVs]) local_order = pm.ArrayOrdering([v for v in local_RVs]) # ELBO wrt variational parameters inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order) inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order) logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model) replace = replace_g replace.update(replace_l) logp = theano.clone(logpt, replace, strict=False) elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed) del logpt # Replacements tensors of variational parameters in the graph replaces = dict() # Variational parameters for global RVs if 0 < len(global_RVs): uw_global_shared, bij = _init_uw_global_shared(start, global_RVs, global_order) replaces.update({uw_g: uw_global_shared}) # Variational parameters for local RVs, encoded from samples in # mini-batches if 0 < len(local_RVs): uws = [uw for _, (uw, _) in local_RVs.items()] uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] + [uw[1].ravel() for uw in uws]) replaces.update({uw_l: uw_local_encoded}) # Replace tensors of variational parameters in ELBO elbo = theano.clone(elbo, OrderedDict(replaces), strict=False) # Replace input shared variables with tensors def is_shared(t): return isinstance(t, theano.compile.sharedvalue.SharedVariable) tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors] updates = OrderedDict( {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)}) elbo = theano.clone(elbo, updates, strict=False) # Create parameter update function used in the training loop params = encoder_params if 0 < len(global_RVs): params += [uw_global_shared] updates = OrderedDict(optimizer(loss=-1 * elbo, param=params)) f = theano.function(tensors, elbo, updates=updates) # Optimization loop elbos = np.empty(n) progress = tqdm.trange(n) for i in progress: e = f(*next(minibatches)) elbos[i] = e if i % (n // 10) == 0 and i > 0: avg_elbo = elbos[i 
- n // 10:i].mean() progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo)) pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1])) # Variational parameters of global RVs if 0 < len(global_RVs): l = int(uw_global_shared.get_value(borrow=True).size / 2) u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l]) w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) else: u = dict() w = dict() return ADVIFit(u, w, elbos)
def plot_map_model_and_residuals(ax, data, map_point, t_grid, prediction, gp_list=None, model=None, **kwargs): """ Plots model in data space given samples from the posterior distribution. Also plots residuals with respect to the median model, where the median model is the median of multiple posterior draws of the model in data space, rather then a single draw corresponding to median values of all parameters. All extra keyword arguments are passed to the matplotlib plot function. Parameters ---------- ax : matplotlib.axes Needs to be of shape ``(2, 1)``. data : :func:`~caustic.data.Data` Microlensing event data. model : pymc3.Model PyMC3 model object which was used to obtain posterior samples in the trace. map_point : dict Point in the parameter space for which we want to evaluate the prediction tensor. t_grid : theano.tensor Times at which we want to evaluate model predictions. Shape ``(n_bands, n_pts)``. prediction : theano.tensor Model prediction evaluated at ``t_grid``. gp_list : list List of ``exoplanet.gp.GP`` objects, one per each band. If these are provided the likelihood which is computed is the GP marginal likelihood. """ model = pm.modelcontext(model) # Load data if model.is_standardized is True: tables = data.get_standardized_data() else: tables = data.get_standardized_data(rescale=False) # Evaluate model for each sample on a fine grid n_pts_dense = T.shape(t_grid)[1].eval() n_bands = len(data.light_curves) prediction_eval = np.zeros((n_bands, n_pts_dense)) if gp_list is None: with model: prediction_eval = xo.eval_in_model(prediction, map_point) else: with model: for n in range(n_bands): prediction_eval[n] = xo.eval_in_model( gp_list[n].predict(t_grid[n]), map_point) # Add mean model to GP prediction prediction_eval += xo.eval_in_model(prediction, map_point) # Plot model predictions for each different samples from posterior on dense # grid for n in range(n_bands): # iterate over bands ax[0].plot( t_grid[n].eval(), prediction_eval[n, :], color="C" + str(n), **kwargs, ) # Plot data data.plot_standardized_data(ax[0], rescale=model.is_standardized) ax[0].set_xlabel(None) ax[1].set_xlabel("HJD - 2450000") ax[1].set_ylabel("Residuals") ax[0].set_xlim(T.min(t_grid).eval(), T.max(t_grid).eval()) # Compute residuals with respect to median model for n in range(n_bands): # Interpolate median predictions onto a grid of observed times map_prediction_interp = np.interp(tables[n]["HJD"], t_grid[n].eval(), prediction_eval[n]) residuals = tables[n]["flux"] - map_prediction_interp ax[1].errorbar( tables[n]["HJD"], residuals, tables[n]["flux_err"], fmt="o", color="C" + str(n), alpha=0.2, **kwargs, ) ax[1].grid(True)
def init_nuts(init='auto', njobs=1, n_init=500000, model=None, random_seed=-1, progressbar=True, **kwargs): """Set up the mass matrix initialization for NUTS. NUTS convergence and sampling speed is extremely dependent on the choice of mass/scaling matrix. This function implements different methods for choosing or adapting the mass matrix. Parameters ---------- init : str Initialization method to use. * auto : Choose a default initialization method automatically. Currently, this is `'jitter+adapt_diag'`, but this can change in the future. If you depend on the exact behaviour, choose an initialization method explicitly. * adapt_diag : Start with a identity mass matrix and then adapt a diagonal based on the variance of the tuning samples. All chains use the test value (usually the prior mean) as starting point. * jitter+adapt_diag : Same as `adapt_diag`, but add uniform jitter in [-1, 1] to the starting point in each chain. * advi+adapt_diag : Run ADVI and then adapt the resulting diagonal mass matrix based on the sample variance of the tuning samples. * advi+adapt_diag_grad : Run ADVI and then adapt the resulting diagonal mass matrix based on the variance of the gradients during tuning. This is **experimental** and might be removed in a future release. * advi : Run ADVI to estimate posterior mean and diagonal mass matrix. * advi_map: Initialize ADVI with MAP and use MAP as starting point. * map : Use the MAP as starting point. This is discouraged. * nuts : Run NUTS and estimate posterior mean and mass matrix from the trace. njobs : int Number of parallel jobs to start. n_init : int Number of iterations of initializer If 'ADVI', number of iterations, if 'nuts', number of draws. model : Model (optional if in `with` context) progressbar : bool Whether or not to display a progressbar for advi sampling. **kwargs : keyword arguments Extra keyword arguments are forwarded to pymc3.NUTS. Returns ------- start : pymc3.model.Point Starting point for sampler nuts_sampler : pymc3.step_methods.NUTS Instantiated and initialized NUTS sampler object """ model = pm.modelcontext(model) vars = kwargs.get('vars', model.vars) if set(vars) != set(model.vars): raise ValueError('Must use init_nuts on all variables of a model.') if not pm.model.all_continuous(vars): raise ValueError('init_nuts can only be used for models with only ' 'continuous variables.') if not isinstance(init, str): raise TypeError('init must be a string.') if init is not None: init = init.lower() if init == 'auto': init = 'jitter+adapt_diag' pm._log.info('Initializing NUTS using {}...'.format(init)) random_seed = int(np.atleast_1d(random_seed)[0]) cb = [ pm.callbacks.CheckParametersConvergence( tolerance=1e-2, diff='absolute'), pm.callbacks.CheckParametersConvergence( tolerance=1e-2, diff='relative'), ] if init == 'adapt_diag': start = [model.test_point] * njobs mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0) var = np.ones_like(mean) potential = quadpotential.QuadPotentialDiagAdapt( model.ndim, mean, var, 10) if njobs == 1: start = start[0] elif init == 'jitter+adapt_diag': start = [] for _ in range(njobs): mean = {var: val.copy() for var, val in model.test_point.items()} for val in mean.values(): val[...] 
+= 2 * np.random.rand(*val.shape) - 1 start.append(mean) mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0) var = np.ones_like(mean) potential = quadpotential.QuadPotentialDiagAdapt( model.ndim, mean, var, 10) if njobs == 1: start = start[0] elif init == 'advi+adapt_diag_grad': approx = pm.fit( random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window, ) # type: pm.MeanField start = approx.sample(draws=njobs) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds) ** 2 mean = approx.bij.rmap(approx.mean.get_value()) mean = model.dict_to_array(mean) weight = 50 potential = quadpotential.QuadPotentialDiagAdaptGrad( model.ndim, mean, cov, weight) if njobs == 1: start = start[0] elif init == 'advi+adapt_diag': approx = pm.fit( random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window, ) # type: pm.MeanField start = approx.sample(draws=njobs) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds) ** 2 mean = approx.bij.rmap(approx.mean.get_value()) mean = model.dict_to_array(mean) weight = 50 potential = quadpotential.QuadPotentialDiagAdapt( model.ndim, mean, cov, weight) if njobs == 1: start = start[0] elif init == 'advi': approx = pm.fit( random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window ) # type: pm.MeanField start = approx.sample(draws=njobs) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds) ** 2 potential = quadpotential.QuadPotentialDiag(cov) if njobs == 1: start = start[0] elif init == 'advi_map': start = pm.find_MAP() approx = pm.MeanField(model=model, start=start) pm.fit( random_seed=random_seed, n=n_init, method=pm.KLqp(approx), callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window ) start = approx.sample(draws=njobs) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds) ** 2 potential = quadpotential.QuadPotentialDiag(cov) if njobs == 1: start = start[0] elif init == 'map': start = pm.find_MAP() cov = pm.find_hessian(point=start) start = [start] * njobs potential = quadpotential.QuadPotentialFull(cov) if njobs == 1: start = start[0] elif init == 'nuts': init_trace = pm.sample(draws=n_init, step=pm.NUTS(), tune=n_init // 2, random_seed=random_seed) cov = np.atleast_1d(pm.trace_cov(init_trace)) start = list(np.random.choice(init_trace, njobs)) potential = quadpotential.QuadPotentialFull(cov) if njobs == 1: start = start[0] else: raise NotImplementedError('Initializer {} is not supported.'.format(init)) step = pm.NUTS(potential=potential, **kwargs) return start, step
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None, verbose=1, dp_par=None): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. random_seed : int or None Seed to initialize random state. None uses current seed. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) check_discrete_rvs(vars) n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(likeloss=-1 * elbo[0], entroloss=-1 * elbo[1], param=uw_shared, dp_par=dp_par, n_par=len(vars)) f = theano.function( [], [uw_shared, tt.add(elbo[1], tt.sum(elbo[0], axis=0))], updates=updates) # Optimization loop elbos = np.empty(n) try: for i in range(n): uw_i, e = f() elbos[i] = e if verbose and not i % (n // 10): if not i: print('Iteration {0} [{1}%]: ELBO = {2}'.format( i, 100 * i // n, e.round(2))) else: avg_elbo = elbos[i - n // 10:i].mean() print('Iteration {0} [{1}%]: Average ELBO = {2}'.format( i, 100 * i // n, avg_elbo.round(2))) except KeyboardInterrupt: if verbose: elbos = elbos[:i] avg_elbo = elbos[i - n // 10:].mean() print('Interrupted at {0} [{1}%]: Average ELBO = {2}'.format( i, 100 * i // n, avg_elbo.round(2))) else: if verbose: avg_elbo = elbos[-n // 10:].mean() print('Finished [100%]: Average ELBO = {}'.format( avg_elbo.round(2))) # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1, minibatch_RVs=None, minibatch_tensors=None, minibatches=None, local_RVs=None, observed_RVs=None, encoder_params=None, total_size=None, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None): """Perform mini-batch ADVI. This function implements a mini-batch ADVI with the meanfield approximation. Autoencoding variational inference is also supported. The log probability terms for mini-batches, corresponding to RVs in minibatch_RVs, are scaled to (total_size) / (the number of samples in each mini-batch), where total_size is an argument for the total data size. minibatch_tensors is a list of tensors (can be shared variables) to which mini-batch samples are set during the optimization. In most cases, these tensors are observations for RVs in the model. local_RVs and observed_RVs are used for autoencoding variational Bayes. Both of these RVs are associated with each of given samples. The difference is that local_RVs are unkown and their posterior distributions are approximated. local_RVs are Ordered dict, whose keys and values are RVs and a tuple of two objects. The first is the theano expression of variational parameters (mean and log of std) of the approximate posterior, which are encoded from given samples by an arbitrary deterministic function, e.g., MLP. The other one is a scaling constant to be multiplied to the log probability term corresponding to the RV. observed_RVs are also Ordered dict with RVs as the keys, but whose values are only the scaling constant as in local_RVs. In this case, total_size is ignored. If local_RVs is None (thus not using autoencoder), the following two settings are equivalent: - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)]) - minibatch_RVs=[rv], total_size=total_size where minibatch_size is minibatch_tensors[0].shape[0]. The variational parameters and the parameters of the autoencoder are simultaneously optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. See the docstring of pymc3.variational.advi(). Parameters ---------- vars : object List of random variables. If None, variational posteriors (normal distribution) are fit for all RVs in the given model. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of iterations updating parameters. n_mcsamples : int Number of Monte Carlo samples to approximate ELBO. minibatch_RVs : list of ObservedRVs Random variables in the model for which mini-batch tensors are set. When this argument is given, both of arguments local_RVs and observed_RVs must be None. minibatch_tensors : list of (tensors or shared variables) Tensors used to create ObservedRVs in minibatch_RVs. minibatches : generator of list Generates a set of minibatches when calling next(). The length of the returned list must be the same with the number of random variables in `minibatch_tensors`. total_size : int Total size of training samples. This is used to appropriately scale the log likelihood terms corresponding to mini-batches in ELBO. local_RVs : Ordered dict Include encoded variational parameters and a scaling constant for the corresponding RV. See the above description. observed_RVs : Ordered dict Include a scaling constant for the corresponding RV. See the above description encoder_params : list of theano shared variables Parameters of encoder. 
optimizer : (loss, list of shared variables) -> dict or OrderedDict A function that returns parameter updates given loss and shared variables of parameters. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when an optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when an optimizer is given. random_seed : int Seed to initialize random state. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. """ theano.config.compute_test_value = 'ignore' model = pm.modelcontext(model) vars = inputvars(vars if vars is not None else model.vars) start = start if start is not None else model.test_point check_discrete_rvs(vars) _check_minibatches(minibatch_tensors, minibatches) if encoder_params is None: encoder_params = [] # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # For backward compatibility in how input arguments are given local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs, minibatch_tensors, total_size) # Replace local_RVs with transformed variables ds = model.deterministics def get_transformed(v): if v in ds: return v.transformed return v local_RVs = OrderedDict( [(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()] ) # Get global variables global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs))) # Ordering for concatenation of random variables global_order = pm.ArrayOrdering([v for v in global_RVs]) local_order = pm.ArrayOrdering([v for v in local_RVs]) # ELBO wrt variational parameters inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order) inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order) logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model) replace = replace_g replace.update(replace_l) logp = theano.clone(logpt, replace, strict=False) elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed) del logpt # Replacements tensors of variational parameters in the graph replaces = dict() # Variational parameters for global RVs if 0 < len(global_RVs): uw_global_shared, bij = _init_uw_global_shared(start, global_RVs, global_order) replaces.update({uw_g: uw_global_shared}) # Variational parameters for local RVs, encoded from samples in # mini-batches if 0 < len(local_RVs): uws = [uw for _, (uw, _) in local_RVs.items()] uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] + [uw[1].ravel() for uw in uws]) replaces.update({uw_l: uw_local_encoded}) # Replace tensors of variational parameters in ELBO elbo = theano.clone(elbo, OrderedDict(replaces), strict=False) # Replace input shared variables with tensors def is_shared(t): return isinstance(t, theano.compile.sharedvalue.SharedVariable) tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors] updates = OrderedDict( {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)} ) elbo = theano.clone(elbo, updates, strict=False) # Create parameter update function used in the training loop params = encoder_params if 0 < len(global_RVs): params += [uw_global_shared] updates = OrderedDict(optimizer(loss=-1 * elbo, param=params)) f = theano.function(tensors, elbo, updates=updates) # Optimization loop elbos = np.empty(n) progress = tqdm.trange(n) for i in progress: e = f(*next(minibatches)) elbos[i] = e 
if i % (n // 10) == 0 and i > 0: avg_elbo = elbos[i - n // 10:i].mean() progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo)) pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1])) # Variational parameters of global RVs if 0 < len(global_RVs): l = int(uw_global_shared.get_value(borrow=True).size / 2) u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l]) w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) else: u = dict() w = dict() return ADVIFit(u, w, elbos)
def vaneylen19(
    name, fixed=False, multi=False, lower=None, upper=None, model=None, **kwargs
):
    """The eccentricity distribution for small planets

    The mixture distribution fit by `Van Eylen et al. (2019)
    <https://arxiv.org/abs/1807.00549>`_ to a population of well-characterized
    small transiting planets observed by Kepler.

    Args:
        name (str): The name of the eccentricity variable.
        fixed (bool, optional): If ``True``, use the posterior median
            hyperparameters. Otherwise, marginalize over the parameters.
        multi (bool, optional): If ``True``, use the distribution for systems
            with multiple transiting planets. If ``False`` (default), use the
            distribution for systems with only one detected transiting planet.
        lower (float, optional): Restrict the eccentricity to be larger than
            this value.
        upper (float, optional): Restrict the eccentricity to be smaller than
            this value.

    Returns:
        The eccentricity distribution.

    """
    model = pm.modelcontext(model)
    add_citations_to_model(["vaneylen19"], model=model)

    sigma_gauss_mu = 0.049
    sigma_gauss_sd = 0.02
    sigma_rayleigh_mu = 0.26
    sigma_rayleigh_sd = 0.05
    if multi:
        frac_mu = 0.08
        frac_sd = 0.08
    else:
        frac_mu = 0.76
        frac_sd = 0.2

    with model:
        if lower is None and upper is None:
            ecc = UnitUniform(name, **kwargs)
        else:
            ecc = pm.Uniform(
                name,
                lower=0.0 if lower is None else lower,
                upper=1.0 if upper is None else upper,
                **kwargs,
            )

        with pm.Model(name=name):
            if fixed:
                sigma_gauss = sigma_gauss_mu
                sigma_rayleigh = sigma_rayleigh_mu
                frac = frac_mu
            else:
                bounded_normal = pm.Bound(pm.Normal, lower=0)
                sigma_gauss = bounded_normal(
                    "sigma_gauss",
                    mu=sigma_gauss_mu,
                    sd=sigma_gauss_sd,
                    testval=sigma_gauss_mu,
                )
                sigma_rayleigh = bounded_normal(
                    "sigma_rayleigh",
                    mu=sigma_rayleigh_mu,
                    sd=sigma_rayleigh_sd,
                    testval=sigma_rayleigh_mu,
                )
                frac = pm.Bound(pm.Normal, lower=0, upper=1)(
                    "frac", mu=frac_mu, sd=frac_sd, testval=frac_mu
                )

            gauss = pm.HalfNormal.dist(sigma=sigma_gauss)
            rayleigh = pm.Weibull.dist(
                alpha=2, beta=np.sqrt(2) * sigma_rayleigh
            )

            pm.Potential(
                "prior",
                pm.math.logaddexp(
                    tt.log(1 - frac) + gauss.logp(ecc),
                    tt.log(frac) + rayleigh.logp(ecc),
                ),
            )

    return ecc
def __init__(self, var, model=None, values=[0, 1]):
    model = pm.modelcontext(model)
    self.values = values
    self.i = 0
    super(SequentialScanDiscreteMetropolis, self).__init__([var], [model.fastlogp])
def kipping13(
    name, fixed=False, long=None, lower=None, upper=None, model=None, **kwargs
):
    """The beta eccentricity distribution fit by Kipping (2013)

    The beta distribution parameters fit by `Kipping (2013b)
    <https://arxiv.org/abs/1306.4982>`_.

    Args:
        name (str): The name of the eccentricity variable.
        fixed (bool, optional): If ``True``, use the posterior median
            hyperparameters. Otherwise, marginalize over the parameters.
        long (bool, optional): If ``True``, use the parameters for the long
            period fit. If ``False``, use the parameters for the short period
            fit. If not given, the parameters fit using the full dataset are
            used.
        lower (float, optional): Restrict the eccentricity to be larger than
            this value.
        upper (float, optional): Restrict the eccentricity to be smaller than
            this value.

    Returns:
        The eccentricity distribution.

    """
    model = pm.modelcontext(model)
    add_citations_to_model(["kipping13b"], model=model)

    if long is None:
        # If 'long' is not provided, use the fit for the parameters from the
        # full dataset
        alpha_mu = 1.12
        alpha_sd = 0.1
        beta_mu = 3.09
        beta_sd = 0.3
    else:
        # If 'long' is set, select either the long or short period model
        # parameters
        if long:
            alpha_mu = 1.12
            alpha_sd = 0.1
            beta_mu = 3.09
            beta_sd = 0.3
        else:
            alpha_mu = 0.697
            alpha_sd = 0.4
            beta_mu = 3.27
            beta_sd = 0.3

    with model:
        if fixed:
            # Use the posterior median parameters
            alpha = alpha_mu
            beta = beta_mu
        else:
            # Marginalize over the uncertainty on the parameters of the beta
            with pm.Model(name=name):
                bounded_normal = pm.Bound(pm.Normal, lower=0)
                alpha = bounded_normal(
                    "alpha", mu=alpha_mu, sd=alpha_sd, testval=alpha_mu
                )
                beta = bounded_normal(
                    "beta", mu=beta_mu, sd=beta_sd, testval=beta_mu
                )

        # Allow for upper and lower bounds
        if lower is not None or upper is not None:
            dist = pm.Bound(
                pm.Beta,
                lower=0.0 if lower is None else lower,
                upper=1.0 if upper is None else upper,
            )
            return dist(name, alpha=alpha, beta=beta, **kwargs)

        return pm.Beta(name, alpha=alpha, beta=beta, **kwargs)
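# Hedged usage sketch for the eccentricity priors above: both kipping13 and
# vaneylen19 attach themselves to the current model context via
# pm.modelcontext, so they are typically called inside a `with pm.Model():`
# block. This assumes the CITATIONS table used by add_citations_to_model
# contains the corresponding keys; the surrounding model is illustrative only.
import pymc3 as pm

with pm.Model() as model:
    ecc = kipping13("ecc")        # marginalized Beta prior on eccentricity
    # ecc = vaneylen19("ecc")     # or the Van Eylen et al. (2019) mixture
    trace = pm.sample(tune=1000, draws=1000)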
def __init__( self, coarse_models: List[Model], vars: Optional[list] = None, base_sampler="DEMetropolisZ", base_S: Optional = None, base_proposal_dist: Optional[Type[Proposal]] = None, base_scaling: Optional = None, tune: bool = True, base_tune_target: str = "lambda", base_tune_interval: int = 100, base_lamb: Optional = None, base_tune_drop_fraction: float = 0.9, model: Optional[Model] = None, mode: Optional = None, subsampling_rates: List[int] = 5, base_blocked: bool = False, variance_reduction: bool = False, store_Q_fine: bool = False, adaptive_error_model: bool = False, **kwargs, ) -> None: # this variable is used to identify MLDA objects which are # not in the finest level (i.e. child MLDA objects) self.is_child = kwargs.get("is_child", False) if not self.is_child: warnings.warn( "The MLDA implementation in PyMC3 is still immature. You should be particularly critical of its results." ) if not isinstance(coarse_models, list): raise ValueError( "MLDA step method cannot use coarse_models if it is not a list" ) if len(coarse_models) == 0: raise ValueError("MLDA step method was given an empty " "list of coarse models. Give at least " "one coarse model.") # assign internal state model = pm.modelcontext(model) self.model = model self.coarse_models = coarse_models self.model_below = self.coarse_models[-1] self.num_levels = len(self.coarse_models) + 1 # set up variance reduction. self.variance_reduction = variance_reduction self.store_Q_fine = store_Q_fine # check that certain requirements hold # for the variance reduction feature to work if self.variance_reduction or self.store_Q_fine: if not hasattr(self.model, "Q"): raise AttributeError("Model given to MLDA does not contain" "variable 'Q'. You need to include" "the variable in the model definition" "for variance reduction to work or" "for storing the fine Q." "Use pm.Data() to define it.") if not isinstance(self.model.Q, tt.sharedvar.TensorSharedVariable): raise TypeError( "The variable 'Q' in the model definition is not of type " "'TensorSharedVariable'. Use pm.Data() to define the" "variable.") if self.is_child and self.variance_reduction: # this is the subsampling rate applied to the current level # it is stored in the level above and transferred here self.subsampling_rate_above = kwargs.pop("subsampling_rate_above", None) # set up adaptive error model self.adaptive_error_model = adaptive_error_model # check that certain requirements hold # for the adaptive error model feature to work if self.adaptive_error_model: if not hasattr(self.model_below, "mu_B"): raise AttributeError( "Model below in hierarchy does not contain" "variable 'mu_B'. You need to include" "the variable in the model definition" "for adaptive error model to work." "Use pm.Data() to define it.") if not hasattr(self.model_below, "Sigma_B"): raise AttributeError( "Model below in hierarchy does not contain" "variable 'Sigma_B'. You need to include" "the variable in the model definition" "for adaptive error model to work." "Use pm.Data() to define it.") if not (isinstance(self.model_below.mu_B, tt.sharedvar.TensorSharedVariable) and isinstance(self.model_below.Sigma_B, tt.sharedvar.TensorSharedVariable)): raise TypeError( "At least one of the variables 'mu_B' and 'Sigma_B' " "in the definition of the below model is not of type " "'TensorSharedVariable'. 
Use pm.Data() to define those " "variables.") # this object is used to recursively update the mean and # variance of the bias correction given new differences # between levels self.bias = RecursiveSampleMoments( self.model_below.mu_B.get_value(), self.model_below.Sigma_B.get_value()) # this list holds the bias objects from all levels # it is gradually constructed when MLDA objects are # created and then shared between all levels self.bias_all = kwargs.pop("bias_all", None) if self.bias_all is None: self.bias_all = [self.bias] else: self.bias_all.append(self.bias) # variables used for adaptive error model self.last_synced_output_diff = None self.adaptation_started = False # set up subsampling rates. if isinstance(subsampling_rates, int): self.subsampling_rates = [subsampling_rates] * len( self.coarse_models) else: if len(subsampling_rates) != len(self.coarse_models): raise ValueError( f"List of subsampling rates needs to have the same " f"length as list of coarse models but the lengths " f"were {len(subsampling_rates)}, {len(self.coarse_models)}" ) self.subsampling_rates = subsampling_rates self.subsampling_rate = self.subsampling_rates[-1] self.subchain_selection = None # set up base sampling self.base_sampler = base_sampler # VR is not compatible with compound base samplers so an automatic conversion # to a block sampler happens here if if self.variance_reduction and self.base_sampler == "Metropolis" and not base_blocked: warnings.warn( "Variance reduction is not compatible with non-blocked (compound) samplers." "Automatically switching to a blocked Metropolis sampler.") self.base_blocked = True else: self.base_blocked = base_blocked self.base_S = base_S self.base_proposal_dist = base_proposal_dist if base_scaling is None: if self.base_sampler == "Metropolis": self.base_scaling = 1.0 else: self.base_scaling = 0.001 else: self.base_scaling = float(base_scaling) self.tune = tune if not self.tune and self.base_sampler == "DEMetropolisZ": raise ValueError( f"The argument tune was set to False while using" f" a 'DEMetropolisZ' base sampler. 
'DEMetropolisZ' " f" tune needs to be True.") self.base_tune_target = base_tune_target self.base_tune_interval = base_tune_interval self.base_lamb = base_lamb self.base_tune_drop_fraction = float(base_tune_drop_fraction) self.base_tuning_stats = None self.mode = mode # Process model variables if vars is None: vars = model.vars vars = pm.inputvars(vars) self.vars = vars self.var_names = [var.name for var in self.vars] self.accepted = 0 # Construct theano function for current-level model likelihood # (for use in acceptance) shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp_inverse(model.logpt, vars, shared) # Construct theano function for below-level model likelihood # (for use in acceptance) model_below = pm.modelcontext(self.model_below) vars_below = [ var for var in model_below.vars if var.name in self.var_names ] vars_below = pm.inputvars(vars_below) shared_below = pm.make_shared_replacements(vars_below, model_below) self.delta_logp_below = delta_logp(model_below.logpt, vars_below, shared_below) super().__init__(vars, shared) # initialise complete step method hierarchy if self.num_levels == 2: with self.model_below: # make sure the correct variables are selected from model_below vars_below = [ var for var in self.model_below.vars if var.name in self.var_names ] # create kwargs if self.variance_reduction: base_kwargs = { "mlda_subsampling_rate_above": self.subsampling_rate, "mlda_variance_reduction": True, } else: base_kwargs = {} if self.base_sampler == "Metropolis": # MetropolisMLDA sampler in base level (level=0), targeting self.model_below self.step_method_below = pm.MetropolisMLDA( vars=vars_below, proposal_dist=self.base_proposal_dist, S=self.base_S, scaling=self.base_scaling, tune=self.tune, tune_interval=self.base_tune_interval, model=None, mode=self.mode, blocked=self.base_blocked, **base_kwargs, ) else: # DEMetropolisZMLDA sampler in base level (level=0), targeting self.model_below self.step_method_below = pm.DEMetropolisZMLDA( vars=vars_below, S=self.base_S, proposal_dist=self.base_proposal_dist, lamb=self.base_lamb, scaling=self.base_scaling, tune=self.base_tune_target, tune_interval=self.base_tune_interval, tune_drop_fraction=self.base_tune_drop_fraction, model=None, mode=self.mode, **base_kwargs, ) else: # drop the last coarse model coarse_models_below = self.coarse_models[:-1] subsampling_rates_below = self.subsampling_rates[:-1] with self.model_below: # make sure the correct variables are selected from model_below vars_below = [ var for var in self.model_below.vars if var.name in self.var_names ] # create kwargs if self.variance_reduction: mlda_kwargs = { "is_child": True, "subsampling_rate_above": self.subsampling_rate, } else: mlda_kwargs = {"is_child": True} if self.adaptive_error_model: mlda_kwargs = { **mlda_kwargs, **{ "bias_all": self.bias_all } } # MLDA sampler in some intermediate level, targeting self.model_below self.step_method_below = pm.MLDA( vars=vars_below, base_S=self.base_S, base_sampler=self.base_sampler, base_proposal_dist=self.base_proposal_dist, base_scaling=self.base_scaling, tune=self.tune, base_tune_target=self.base_tune_target, base_tune_interval=self.base_tune_interval, base_lamb=self.base_lamb, base_tune_drop_fraction=self.base_tune_drop_fraction, model=None, mode=self.mode, subsampling_rates=subsampling_rates_below, coarse_models=coarse_models_below, base_blocked=self.base_blocked, variance_reduction=self.variance_reduction, store_Q_fine=False, adaptive_error_model=self.adaptive_error_model, **mlda_kwargs, ) # 
instantiate the recursive DA proposal. # this is the main proposal used for # all levels (Recursive Delayed Acceptance) # (except for level 0 where the step method is MetropolisMLDA # or DEMetropolisZMLDA - not MLDA) self.proposal_dist = RecursiveDAProposal(self.step_method_below, self.model_below, self.tune, self.subsampling_rate) # set up data types of stats. if isinstance(self.step_method_below, MLDA): # get the stat types from the level below if that level is MLDA self.stats_dtypes = self.step_method_below.stats_dtypes else: # otherwise, set it up from scratch. self.stats_dtypes = [{ "accept": np.float64, "accepted": np.bool, "tune": np.bool }] if isinstance(self.step_method_below, MetropolisMLDA): self.stats_dtypes.append({"base_scaling": np.float64}) elif isinstance(self.step_method_below, DEMetropolisZMLDA): self.stats_dtypes.append({ "base_scaling": np.float64, "base_lambda": np.float64 }) elif isinstance(self.step_method_below, CompoundStep): for method in self.step_method_below.methods: if isinstance(method, MetropolisMLDA): self.stats_dtypes.append({"base_scaling": np.float64}) elif isinstance(method, DEMetropolisZMLDA): self.stats_dtypes.append({ "base_scaling": np.float64, "base_lambda": np.float64 }) # initialise necessary variables for doing variance reduction if self.variance_reduction: self.sub_counter = 0 self.Q_diff = [] if self.is_child: self.Q_reg = [np.nan] * self.subsampling_rate_above if self.num_levels == 2: self.Q_base_full = [] if not self.is_child: for level in range(self.num_levels - 1, 0, -1): self.stats_dtypes[0][f"Q_{level}_{level - 1}"] = object self.stats_dtypes[0]["Q_0"] = object # initialise necessary variables for doing variance reduction or storing fine Q if self.variance_reduction or self.store_Q_fine: self.Q_last = np.nan self.Q_diff_last = np.nan if self.store_Q_fine and not self.is_child: self.stats_dtypes[0][f"Q_{self.num_levels - 1}"] = object
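An illustrative sketch of how the MLDA constructor above is meant to be driven: a list of cheaper coarse models plus the fine model, with the step method built inside the fine model's context. The toy models below are placeholders, not a realistic multilevel setup:

import numpy as np
import pymc3 as pm

data = np.random.randn(50)

def build_model(sigma):
    # In practice each coarse model would be a cheaper approximation of the
    # fine-level likelihood; here the levels only differ by their noise scale.
    with pm.Model() as m:
        x = pm.Normal("x", mu=0.0, sigma=1.0)
        pm.Normal("obs", mu=x, sigma=sigma, observed=data)
    return m

coarse_models = [build_model(sigma=2.0)]  # coarsest level(s) first
with build_model(sigma=1.0):              # finest level
    step = pm.MLDA(coarse_models=coarse_models, subsampling_rates=5)
    trace = pm.sample(draws=2000, tune=1000, step=step, chains=2)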
def get_args_for_theano_function(point=None, model=None): model = pm.modelcontext(model) if point is None: point = model.test_point return [point[k.name] for k in model.vars]
def get_theano_function_for_var(var, model=None, **kwargs): model = pm.modelcontext(model) kwargs["on_unused_input"] = kwargs.get("on_unused_input", "ignore") return theano.function(model.vars, var, **kwargs)
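The two helpers above are typically used together; a minimal sketch (placeholder model) that evaluates a model tensor at the test point:

import pymc3 as pm

with pm.Model() as model:
    x = pm.Normal("x", mu=0.0, sd=1.0)
    pm.Deterministic("y", 2.0 * x)

func = get_theano_function_for_var(model.logpt, model=model)
args = get_args_for_theano_function(model=model)  # defaults to model.test_point
print(func(*args))  # log-probability evaluated at the test point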
def sample_vp(vparams, draws=1000, model=None, local_RVs=None, random_seed=None, hide_transformed=True, progressbar=True): """Draw samples from variational posterior. Parameters ---------- vparams : dict or pymc3.variational.ADVIFit Estimated variational parameters of the model. draws : int Number of random samples. model : pymc3.Model Probabilistic model. random_seed : int or None Seed of random number generator. None to use current seed. hide_transformed : bool If False, transformed variables are also sampled. Default is True. Returns ------- trace : pymc3.backends.base.MultiTrace Samples drawn from the variational posterior. """ model = pm.modelcontext(model) if isinstance(vparams, ADVIFit): vparams = {'means': vparams.means, 'stds': vparams.stds} ds = model.deterministics get_transformed = lambda v: v if v not in ds else v.transformed rvs = lambda x: [get_transformed(v) for v in x] if x is not None else [] global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs))) # Make dict for replacements of random variables if random_seed is None: r = MRG_RandomStreams(seed=123) else: r = MRG_RandomStreams(seed=random_seed) updates = {} for v in global_RVs: u = theano.shared(vparams['means'][str(v)]).ravel() w = theano.shared(vparams['stds'][str(v)]).ravel() n = r.normal(size=u.tag.test_value.shape) updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)}) if local_RVs is not None: ds = model.deterministics get_transformed = lambda v: v if v not in ds else v.transformed for v_, (uw, _) in local_RVs.items(): v = get_transformed(v_) u = uw[0].ravel() w = uw[1].ravel() n = r.normal(size=u.tag.test_value.shape) updates.update( {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)}) # Replace some nodes of the graph with variational distributions vars = model.free_RVs samples = theano.clone(vars, updates) f = theano.function([], samples) # Random variables which will be sampled vars_sampled = [v for v in model.unobserved_RVs if not str(v).endswith('_')] \ if hide_transformed else \ [v for v in model.unobserved_RVs] varnames = [str(var) for var in model.unobserved_RVs] trace = pm.sampling.NDArray(model=model, vars=vars_sampled) trace.setup(draws=draws, chain=0) range_ = trange(draws) if progressbar else range(draws) for i in range_: # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...} point = {varname: value for varname, value in zip(varnames, f())} trace.record(point) return MultiTrace([trace])
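A sketch of the legacy workflow this function belongs to: fit ADVI, then draw from the variational posterior. The toy model is a placeholder and the old `pm.variational.advi` interface is assumed to be available:

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal("mu", mu=0.0, sd=10.0)
    pm.Normal("obs", mu=mu, sd=1.0, observed=np.random.randn(100))

    v_params = pm.variational.advi(n=10000)
    trace = sample_vp(v_params, draws=500, model=model)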
def fit( n=10000, local_rv=None, method="advi", model=None, random_seed=None, start=None, inf_kwargs=None, **kwargs, ): r"""Handy shortcut for using inference methods in functional way Parameters ---------- n: `int` number of iterations local_rv: dict[var->tuple] mapping {model_variable -> approx params} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method: str or :class:`Inference` string name is case insensitive in: - 'advi' for ADVI - 'fullrank_advi' for FullRankADVI - 'svgd' for Stein Variational Gradient Descent - 'asvgd' for Amortized Stein Variational Gradient Descent - 'nfvi' for Normalizing Flow with default `scale-loc` flow - 'nfvi=<formula>' for Normalizing Flow using formula model: :class:`Model` PyMC3 model for inference random_seed: None or int leave None to use package global RandomStream or other valid value to create instance specific one inf_kwargs: dict additional kwargs passed to :class:`Inference` start: `Point` starting point for inference Other Parameters ---------------- score: bool evaluate loss on each iteration or not callbacks: list[function: (Approximation, losses, i) -> None] calls provided functions after each iteration step progressbar: bool whether to show progressbar or not obj_n_mc: `int` Number of monte carlo samples used for approximation of objective gradients tf_n_mc: `int` Number of monte carlo samples used for approximation of test function gradients obj_optimizer: function (grads, params) -> updates Optimizer that is used for objective params test_optimizer: function (grads, params) -> updates Optimizer that is used for test function params more_obj_params: `list` Add custom params for objective optimizer more_tf_params: `list` Add custom params for test function optimizer more_updates: `dict` Add custom updates to resulting updates total_grad_norm_constraint: `float` Bounds gradient norm, prevents exploding gradient problem fn_kwargs: `dict` Add kwargs to aesara.function (e.g. `{'profile': True}`) more_replacements: `dict` Apply custom replacements before calculating gradients Returns ------- :class:`Approximation` """ if inf_kwargs is None: inf_kwargs = dict() else: inf_kwargs = inf_kwargs.copy() if local_rv is not None: inf_kwargs["local_rv"] = local_rv if random_seed is not None: inf_kwargs["random_seed"] = random_seed if start is not None: inf_kwargs["start"] = start if model is None: model = pm.modelcontext(model) _select = dict(advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD, nfvi=NFVI) if isinstance(method, str): method = method.lower() if method.startswith("nfvi="): formula = method[5:] inference = NFVI(formula, **inf_kwargs) elif method in _select: inference = _select[method](model=model, **inf_kwargs) else: raise KeyError( f"method should be one of {set(_select.keys())} or Inference instance" ) elif isinstance(method, Inference): inference = method else: raise TypeError( f"method should be one of {set(_select.keys())} or Inference instance" ) return inference.fit(n, **kwargs)
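A minimal sketch of the functional entry point above: fit a mean-field approximation and then sample from it (placeholder model; `approx.sample` is the usual way to draw from the returned Approximation):

import numpy as np
import pymc3 as pm

with pm.Model():
    mu = pm.Normal("mu", mu=0.0, sigma=10.0)
    pm.Normal("obs", mu=mu, sigma=1.0, observed=np.random.randn(100))

    approx = pm.fit(n=20000, method="advi")
    trace = approx.sample(1000)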
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1, minibatch_RVs=None, minibatch_tensors=None, minibatches=None, global_RVs=None, local_RVs=None, observed_RVs=None, encoder_params=None, total_size=None, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None, mode=None): """Perform mini-batch ADVI. This function implements a mini-batch automatic differentiation variational inference (ADVI; Kucukelbir et al., 2015) with the meanfield approximation. Autoencoding variational Bayes (AEVB; Kingma and Welling, 2014) is also supported. For explanation, we classify random variables in probabilistic models into three types. Observed random variables :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations. Each :math:`\mathbf{y}_{i}` can be a set of observed random variables, i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where :math:`V_{k}` is the number of the types of observed random variables in the model. The next ones are global random variables :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate the probabilities for all observed samples. The last ones are local random variables :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`. These RVs are used only in AEVB. The goal of ADVI is to approximate the posterior distribution :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms are normal distributions (mean-field approximation). :math:`q(\Theta)` is parametrized with its means and standard deviations. These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on each observation. Therefore these parameters are denoted as :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a multilayer perceptron or convolutional neural network. In addition to :math:`\\xi(\cdot)`, we can also include deterministic mappings for the likelihood of observations. We denote the parameters of the deterministic mappings as :math:`\eta`. An example of such mappings is the deconvolutional neural network used in the convolutional VAE example in the PyMC3 notebook directory. This function maximizes the evidence lower bound (ELBO) :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows: .. math:: {\cal L}(\gamma,\\nu,\eta) & = \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[ \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[ \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) \\right]\\right] \\\\ & - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] - \mathbf{c}_{l}\sum_{i=1}^{N} KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right], where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence .. math:: KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv, :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO. More precisely, we can write each of the terms in ELBO as follows: .. 
math:: \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = & \sum_{k=1}^{V_{o}}c_{o}^{k} \log p(\mathbf{y}_{i}^{k}| {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\ \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = & \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[ q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\ \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\\right] & = & \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[ q(\mathbf{z}_{i}^{k})|| p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right], where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v` in the directed acyclic graph of the model. When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be set to :math:`N/M`, where :math:`M` is the number of observations in each mini-batch. Another weighting scheme was proposed in (Blundell et al., 2015) for accelarating model fitting. For working with ADVI, we need to give the probabilistic model (:code:`model`), the three types of RVs (:code:`observed_RVs`, :code:`global_RVs` and :code:`local_RVs`), the tensors to which mini-bathced samples are supplied (:code:`minibatches`) and parameters of deterministic mappings :math:`\\xi` and :math:`\eta` (:code:`encoder_params`) as input arguments. :code:`observed_RVs` is a :code:`OrderedDict` of the form :code:`{y_k: c_k}`, where :code:`y_k` is a random variable defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{o}^{k}`) and it can be a shared variable. :code:`global_RVs` is a :code:`OrderedDict` of the form :code:`{t_k: c_k}`, where :code:`t_k` is a random variable defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{g}^{k}`) and it can be a shared variable. :code:`local_RVs` is a :code:`OrderedDict` of the form :code:`{z_k: ((m_k, s_k), c_k)}`, where :code:`z_k` is a random variable defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{l}^{k}`) and it can be a shared variable. :code:`(m_k, s_k)` is a pair of tensors of means and log standard deviations of the variational distribution; samples drawn from the variational distribution replaces :code:`z_k`. It should be noted that if :code:`z_k` has a transformation that changes the dimension (e.g., StickBreakingTransform), the variational distribution must have the same dimension. For example, if :code:`z_k` is distributed with Dirichlet distribution with :code:`p` choices, :math:`m_k` and :code:`s_k` has the shape :code:`(n_samples_in_minibatch, p - 1)`. :code:`minibatch_tensors` is a list of tensors (can be shared variables) to which mini-batch samples are set during the optimization. These tensors are observations (:code:`obs=`) in :code:`observed_RVs`. :code:`minibatches` is a generator of a list of :code:`numpy.ndarray`. Each item of the list will be set to tensors in :code:`minibatch_tensors`. :code:`encoder_params` is a list of shared variables of the parameters :math:`\\nu` and :math:`\eta`. We do not need to include the variational parameters of the global variables, :math:`\gamma`, because these are automatically created and updated in this function. The following is a list of example notebooks using advi_minibatch: - docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb - docs/source/notebooks/bayesian_neural_network_advi.ipynb - docs/source/notebooks/convolutional_vae_keras_advi.ipynb - docs/source/notebooks/gaussian-mixture-model-advi.ipynb - docs/source/notebooks/lda-advi-aevb.ipynb Parameters ---------- vars : object List of random variables. 
If None, variational posteriors (normal distribution) are fit for all RVs in the given model. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of iterations updating parameters. n_mcsamples : int Number of Monte Carlo samples to approximate ELBO. minibatch_RVs : list of ObservedRVs Random variables in the model for which mini-batch tensors are set. When this argument is given, both of arguments local_RVs and observed_RVs must be None. minibatch_tensors : list of (tensors or shared variables) Tensors used to create ObservedRVs in minibatch_RVs. minibatches : generator of list Generates a set of minibatches when calling next(). The length of the returned list must be the same with the number of random variables in `minibatch_tensors`. total_size : int Total size of training samples. This is used to appropriately scale the log likelihood terms corresponding to mini-batches in ELBO. observed_RVs : Ordered dict Include a scaling constant for the corresponding RV. See the above description. global_RVs : Ordered dict or None Include a scaling constant for the corresponding RV. See the above description. If :code:`None`, it is set to :code:`{v: 1 for v in grvs}`, where :code:`grvs` is :code:`list(set(vars) - set(list(local_RVs) + list(observed_RVs)))`. local_RVs : Ordered dict or None Include encoded variational parameters and a scaling constant for the corresponding RV. See the above description. encoder_params : list of theano shared variables Parameters of encoder. optimizer : (loss, list of shared variables) -> dict or OrderedDict A function that returns parameter updates given loss and shared variables of parameters. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when :code:`optimizer` is set. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when :code:`optimizer` is set. random_seed : int Seed to initialize random state. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. References ---------- - Kingma, D. P., & Welling, M. (2014). Auto-Encoding Variational Bayes. stat, 1050, 1. - Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015). Automatic variational inference in Stan. In Advances in neural information processing systems (pp. 568-576). - Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015). Weight Uncertainty in Neural Network. In Proceedings of the 32nd International Conference on Machine Learning (ICML-15) (pp. 1613-1622). 
""" if encoder_params is None: encoder_params = [] model = pm.modelcontext(model) vars = inputvars(vars if vars is not None else model.vars) start = start if start is not None else model.test_point if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') _check_minibatches(minibatch_tensors, minibatches) if encoder_params is None: encoder_params = [] # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # For backward compatibility in how input arguments are given local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs, minibatch_tensors, total_size) # Replace local_RVs with transformed variables def get_transformed(v): if hasattr(v, 'transformed'): return v.transformed return v local_RVs = OrderedDict([(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()]) # Get global variables grvs = list(set(vars) - set(list(local_RVs) + list(observed_RVs))) if global_RVs is None: global_RVs = OrderedDict({v: 1 for v in grvs}) elif len(grvs) != len(global_RVs): _value_error('global_RVs ({}) must have all global RVs: {}'.format( [v for v in global_RVs], grvs)) # ELBO wrt variational parameters elbo, uw_l, uw_g = _make_elbo_t(observed_RVs, global_RVs, local_RVs, model.potentials, n_mcsamples, random_seed) # Replacements tensors of variational parameters in the graph replaces = dict() # Variational parameters for global RVs if 0 < len(global_RVs): uw_global_shared, bij = _init_uw_global_shared(start, global_RVs) replaces.update({uw_g: uw_global_shared}) # Variational parameters for local RVs, encoded from samples in # mini-batches if 0 < len(local_RVs): uws = [uw for _, (uw, _) in local_RVs.items()] uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] + [uw[1].ravel() for uw in uws]) replaces.update({uw_l: uw_local_encoded}) # Replace tensors of variational parameters in ELBO elbo = theano.clone(elbo, OrderedDict(replaces), strict=False) # Replace input shared variables with tensors def is_shared(t): return isinstance(t, theano.compile.sharedvalue.SharedVariable) tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors] updates = OrderedDict( {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)}) elbo = theano.clone(elbo, updates, strict=False) # Create parameter update function used in the training loop params = encoder_params if 0 < len(global_RVs): params += [uw_global_shared] updates = OrderedDict(optimizer(loss=-1 * elbo, param=params)) f = theano.function(tensors, elbo, updates=updates, mode=mode) # Optimization loop elbos = np.empty(n) progress = tqdm.trange(n) for i in progress: e = f(*next(minibatches)) elbos[i] = e if n < 10: progress.set_description('ELBO = {:,.2f}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = elbos[i - n // 10:i].mean() progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo)) pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1])) # Variational parameters of global RVs if 0 < len(global_RVs): l = int(uw_global_shared.get_value(borrow=True).size / 2) u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l]) w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) else: u = dict() w = dict() return ADVIFit(u, w, elbos)
def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, inf_kwargs=None, **kwargs): R"""Handy shortcut for using inference methods in functional way Parameters ---------- n : `int` number of iterations local_rv : dict[var->tuple] mapping {model_variable -> approx params} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or :class:`Inference` string name is case insensitive in: - 'advi' for ADVI - 'fullrank_advi' for FullRankADVI - 'svgd' for Stein Variational Gradient Descent - 'asvgd' for Amortized Stein Variational Gradient Descent - 'nfvi' for Normalizing Flow with default `scale-loc` flow - 'nfvi=<formula>' for Normalizing Flow using formula model : :class:`Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one inf_kwargs : dict additional kwargs passed to :class:`Inference` start : `Point` starting point for inference Other Parameters ---------------- score : bool evaluate loss on each iteration or not callbacks : list[function : (Approximation, losses, i) -> None] calls provided functions after each iteration step progressbar : bool whether to show progressbar or not obj_n_mc : `int` Number of monte carlo samples used for approximation of objective gradients tf_n_mc : `int` Number of monte carlo samples used for approximation of test function gradients obj_optimizer : function (grads, params) -> updates Optimizer that is used for objective params test_optimizer : function (grads, params) -> updates Optimizer that is used for test function params more_obj_params : `list` Add custom params for objective optimizer more_tf_params : `list` Add custom params for test function optimizer more_updates : `dict` Add custom updates to resulting updates total_grad_norm_constraint : `float` Bounds gradient norm, prevents exploding gradient problem fn_kwargs : `dict` Add kwargs to theano.function (e.g. `{'profile': True}`) more_replacements : `dict` Apply custom replacements before calculating gradients Returns ------- :class:`Approximation` """ if inf_kwargs is None: inf_kwargs = dict() else: inf_kwargs = inf_kwargs.copy() if local_rv is not None: inf_kwargs['local_rv'] = local_rv if random_seed is not None: inf_kwargs['random_seed'] = random_seed if start is not None: inf_kwargs['start'] = start if model is None: model = pm.modelcontext(model) _select = dict( advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD, nfvi=NFVI ) if isinstance(method, str): method = method.lower() if method.startswith('nfvi='): formula = method[5:] inference = NFVI( formula, **inf_kwargs ) elif method in _select: inference = _select[method]( model=model, **inf_kwargs ) else: raise KeyError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) elif isinstance(method, Inference): inference = method else: raise TypeError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) return inference.fit(n, **kwargs)
def sample_vp( vparams, draws=1000, model=None, local_RVs=None, random_seed=None, hide_transformed=True, progressbar=True): """Draw samples from variational posterior. Parameters ---------- vparams : dict or pymc3.variational.ADVIFit Estimated variational parameters of the model. draws : int Number of random samples. model : pymc3.Model Probabilistic model. random_seed : int or None Seed of random number generator. None to use current seed. hide_transformed : bool If False, transformed variables are also sampled. Default is True. Returns ------- trace : pymc3.backends.base.MultiTrace Samples drawn from the variational posterior. """ model = pm.modelcontext(model) if isinstance(vparams, ADVIFit): vparams = { 'means': vparams.means, 'stds': vparams.stds } ds = model.deterministics def get_transformed(v): return v if v not in ds else v.transformed def rvs(x): return [get_transformed(v) for v in x] if x is not None else [] global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs))) # Make dict for replacements of random variables if random_seed is None: r = MRG_RandomStreams(gen_random_state()) else: r = MRG_RandomStreams(seed=random_seed) updates = {} for v in global_RVs: u = theano.shared(vparams['means'][str(v)]).ravel() w = theano.shared(vparams['stds'][str(v)]).ravel() n = r.normal(size=u.tag.test_value.shape) updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)}) if local_RVs is not None: for v_, (uw, _) in local_RVs.items(): v = get_transformed(v_) u = uw[0].ravel() w = uw[1].ravel() n = r.normal(size=u.tag.test_value.shape) updates.update( {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)}) # Replace some nodes of the graph with variational distributions vars = model.free_RVs samples = theano.clone(vars, updates) f = theano.function([], samples) # Random variables which will be sampled if hide_transformed: vars_sampled = [v_ for v_ in model.unobserved_RVs if not str(v_).endswith('_')] else: vars_sampled = [v_ for v_ in model.unobserved_RVs] varnames = [str(var) for var in model.unobserved_RVs] trace = pm.sampling.NDArray(model=model, vars=vars_sampled) trace.setup(draws=draws, chain=0) range_ = trange(draws) if progressbar else range(draws) for _ in range_: # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...} point = {varname: value for varname, value in zip(varnames, f())} trace.record(point) return MultiTrace([trace])
def optimize( start=None, vars=None, model=None, return_info=False, verbose=True, **kwargs ): """Maximize the log prob of a PyMC3 model using scipy All extra arguments are passed directly to the ``scipy.optimize.minimize`` function. Args: start: The PyMC3 coordinate dictionary of the starting position vars: The variables to optimize model: The PyMC3 model return_info: Return both the coordinate dictionary and the result of ``scipy.optimize.minimize`` verbose: Print the success flag and log probability to the screen """ from scipy.optimize import minimize model = pm.modelcontext(model) # Work out the full starting coordinates if start is None: start = model.test_point else: update_start_vals(start, model.test_point, model) # Fit all the parameters by default if vars is None: vars = model.cont_vars vars = inputvars(vars) allinmodel(vars, model) # Work out the relevant bijection map start = Point(start, model=model) bij = DictToArrayBijection(ArrayOrdering(vars), start) # Pre-compile the theano model and gradient nlp = -model.logpt grad = theano.grad(nlp, vars, disconnected_inputs="ignore") func = get_theano_function_for_var([nlp] + grad, model=model) if verbose: names = [ get_untransformed_name(v.name) if is_transformed_name(v.name) else v.name for v in vars ] sys.stderr.write( "optimizing logp for variables: [{0}]\n".format(", ".join(names)) ) bar = tqdm.tqdm() # This returns the objective function and its derivatives def objective(vec): res = func(*get_args_for_theano_function(bij.rmap(vec), model=model)) d = dict(zip((v.name for v in vars), res[1:])) g = bij.map(d) if verbose: bar.set_postfix(logp="{0:e}".format(-res[0])) bar.update() return res[0], g # Optimize using scipy.optimize x0 = bij.map(start) initial = objective(x0)[0] kwargs["jac"] = True info = minimize(objective, x0, **kwargs) # Only accept the output if it is better than it was x = info.x if (np.isfinite(info.fun) and info.fun < initial) else x0 # Coerce the output into the right format vars = get_default_varnames(model.unobserved_RVs, True) point = { var.name: value for var, value in zip(vars, model.fastfn(vars)(bij.rmap(x))) } if verbose: bar.close() sys.stderr.write("message: {0}\n".format(info.message)) sys.stderr.write("logp: {0} -> {1}\n".format(-initial, -info.fun)) if not np.isfinite(info.fun): logger.warning("final logp not finite, returning initial point") logger.warning( "this suggests that something is wrong with the model" ) logger.debug("{0}".format(info)) if return_info: return point, info return point
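A sketch of the intended call pattern for the optimizer above, following the usual workflow of optimizing a subset of variables before the full set (placeholder model):

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    logs = pm.Normal("logs", mu=0.0, sd=10.0)
    mean = pm.Normal("mean", mu=0.0, sd=10.0)
    pm.Normal("obs", mu=mean, sd=pm.math.exp(logs),
              observed=np.random.randn(200))

    # Refine one parameter first, then optimize everything from that solution
    map_soln = optimize(start=model.test_point, vars=[mean])
    map_soln, info = optimize(start=map_soln, return_info=True)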
def compute_source_mag_and_blend_fraction(data, Delta_F, F_base, u_0, model=None): """ Converts flux parameters :math:`(\Delta F, F_\mathrm{base})` to physically more interesting quantities, the source star brightness in magnitudes and the blend ratio :math:`g=F_B/F_S`. Parameters ---------- data : :func:`~caustic.data.Data` Microlensing event data. Delta_F : theano.tensor Tensor of shape ``(n_bands)``. F_base : theano.tensor Tensor of shape ``(n_bands)``. u_0 : theano.tensor Lens--source separation at time :math:`t_0`. standardized : bool Whether or not the flux is standardized to unit standard deviation and zero median. By default ``True``. model : pymc3.Model PyMC3 model object which was used to obtain posterior samples in the trace. Returns ------- tuple ``(m_source, g)``. """ model = pm.modelcontext(model) if model.is_standardized is True: # Revert F_base and Delta_F to non-standardized units data.units = "fluxes" fluxes_median = np.zeros(len(data.light_curves)) fluxes_std = np.zeros(len(data.light_curves)) for i, table in enumerate(data.light_curves): mask = table["mask"] fluxes_median[i] = np.median(table["flux"][mask]) fluxes_std[i] = np.std(table["flux"][mask]) # Flux parameters to standard flux units Delta_F_ = T.as_tensor_variable(fluxes_std) * Delta_F F_base_ = T.as_tensor_variable( fluxes_std) * F_base + T.as_tensor_variable(fluxes_median) else: Delta_F_ = Delta_F F_base_ = F_base # Calculate source flux and blend flux A_u0 = (u_0**2 + 2) / (T.abs_(u_0) * T.sqrt(u_0**2 + 4)) F_S = Delta_F_ / (A_u0 - 1) F_B = F_base_ - F_S g = F_B / F_S # Convert fluxes to magnitudes zero_point = 22.0 m_source = zero_point - 2.5 * T.log10(F_S) return m_source, g
def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1, minibatch_RVs=None, minibatch_tensors=None, minibatches=None, global_RVs=None, local_RVs=None, observed_RVs=None, encoder_params=None, total_size=None, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None, mode=None): """Perform mini-batch ADVI. This function implements a mini-batch automatic differentiation variational inference (ADVI; Kucukelbir et al., 2015) with the meanfield approximation. Autoencoding variational Bayes (AEVB; Kingma and Welling, 2014) is also supported. For explanation, we classify random variables in probabilistic models into three types. Observed random variables :math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations. Each :math:`\mathbf{y}_{i}` can be a set of observed random variables, i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where :math:`V_{k}` is the number of the types of observed random variables in the model. The next ones are global random variables :math:`\Theta=\{\\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate the probabilities for all observed samples. The last ones are local random variables :math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where :math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`. These RVs are used only in AEVB. The goal of ADVI is to approximate the posterior distribution :math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior :math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms are normal distributions (mean-field approximation). :math:`q(\Theta)` is parametrized with its means and standard deviations. These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on each observation. Therefore these parameters are denoted as :math:`\\xi(\mathbf{y}_{i}; \\nu)`, where :math:`\\nu` is the parameters of :math:`\\xi(\cdot)`. For example, :math:`\\xi(\cdot)` can be a multilayer perceptron or convolutional neural network. In addition to :math:`\\xi(\cdot)`, we can also include deterministic mappings for the likelihood of observations. We denote the parameters of the deterministic mappings as :math:`\eta`. An example of such mappings is the deconvolutional neural network used in the convolutional VAE example in the PyMC3 notebook directory. This function maximizes the evidence lower bound (ELBO) :math:`{\cal L}(\gamma, \\nu, \eta)` defined as follows: .. math:: {\cal L}(\gamma,\\nu,\eta) & = \mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[ \sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[ \log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) \\right]\\right] \\\\ & - \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] - \mathbf{c}_{l}\sum_{i=1}^{N} KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\\right], where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence .. math:: KL[q(v)||p(v)] = \int q(v)\log\\frac{q(v)}{p(v)}dv, :math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO. More precisely, we can write each of the terms in ELBO as follows: .. 
math:: \mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = & \sum_{k=1}^{V_{o}}c_{o}^{k} \log p(\mathbf{y}_{i}^{k}| {\\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\\\ \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\\right] & = & \sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[ q(\\theta^{k})||p(\\theta^{k}|{\\rm pa(\\theta^{k})})\\right] \\\\ \mathbf{c}_{l}KL\left[q(\mathbf{z}_{i}||p(\mathbf{z}_{i})\\right] & = & \sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[ q(\mathbf{z}_{i}^{k})|| p(\mathbf{z}_{i}^{k}|{\\rm pa}(\mathbf{z}_{i}^{k}))\\right], where :math:`{\\rm pa}(v)` denotes the set of parent variables of :math:`v` in the directed acyclic graph of the model. When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be set to :math:`N/M`, where :math:`M` is the number of observations in each mini-batch. Another weighting scheme was proposed in (Blundell et al., 2015) for accelarating model fitting. For working with ADVI, we need to give the probabilistic model (:code:`model`), the three types of RVs (:code:`observed_RVs`, :code:`global_RVs` and :code:`local_RVs`), the tensors to which mini-bathced samples are supplied (:code:`minibatches`) and parameters of deterministic mappings :math:`\\xi` and :math:`\eta` (:code:`encoder_params`) as input arguments. :code:`observed_RVs` is a :code:`OrderedDict` of the form :code:`{y_k: c_k}`, where :code:`y_k` is a random variable defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{o}^{k}`) and it can be a shared variable. :code:`global_RVs` is a :code:`OrderedDict` of the form :code:`{t_k: c_k}`, where :code:`t_k` is a random variable defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{g}^{k}`) and it can be a shared variable. :code:`local_RVs` is a :code:`OrderedDict` of the form :code:`{z_k: ((m_k, s_k), c_k)}`, where :code:`z_k` is a random variable defined in the PyMC3 model. :code:`c_k` is a scalar (:math:`c_{l}^{k}`) and it can be a shared variable. :code:`(m_k, s_k)` is a pair of tensors of means and log standard deviations of the variational distribution; samples drawn from the variational distribution replaces :code:`z_k`. It should be noted that if :code:`z_k` has a transformation that changes the dimension (e.g., StickBreakingTransform), the variational distribution must have the same dimension. For example, if :code:`z_k` is distributed with Dirichlet distribution with :code:`p` choices, :math:`m_k` and :code:`s_k` has the shape :code:`(n_samples_in_minibatch, p - 1)`. :code:`minibatch_tensors` is a list of tensors (can be shared variables) to which mini-batch samples are set during the optimization. These tensors are observations (:code:`obs=`) in :code:`observed_RVs`. :code:`minibatches` is a generator of a list of :code:`numpy.ndarray`. Each item of the list will be set to tensors in :code:`minibatch_tensors`. :code:`encoder_params` is a list of shared variables of the parameters :math:`\\nu` and :math:`\eta`. We do not need to include the variational parameters of the global variables, :math:`\gamma`, because these are automatically created and updated in this function. The following is a list of example notebooks using advi_minibatch: - docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb - docs/source/notebooks/bayesian_neural_network_advi.ipynb - docs/source/notebooks/convolutional_vae_keras_advi.ipynb - docs/source/notebooks/gaussian-mixture-model-advi.ipynb - docs/source/notebooks/lda-advi-aevb.ipynb Parameters ---------- vars : object List of random variables. 
If None, variational posteriors (normal distribution) are fit for all RVs in the given model. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of iterations updating parameters. n_mcsamples : int Number of Monte Carlo samples to approximate ELBO. minibatch_RVs : list of ObservedRVs Random variables in the model for which mini-batch tensors are set. When this argument is given, both of arguments local_RVs and observed_RVs must be None. minibatch_tensors : list of (tensors or shared variables) Tensors used to create ObservedRVs in minibatch_RVs. minibatches : generator of list Generates a set of minibatches when calling next(). The length of the returned list must be the same with the number of random variables in `minibatch_tensors`. total_size : int Total size of training samples. This is used to appropriately scale the log likelihood terms corresponding to mini-batches in ELBO. observed_RVs : Ordered dict Include a scaling constant for the corresponding RV. See the above description. global_RVs : Ordered dict or None Include a scaling constant for the corresponding RV. See the above description. If :code:`None`, it is set to :code:`{v: 1 for v in grvs}`, where :code:`grvs` is :code:`list(set(vars) - set(list(local_RVs) + list(observed_RVs)))`. local_RVs : Ordered dict or None Include encoded variational parameters and a scaling constant for the corresponding RV. See the above description. encoder_params : list of theano shared variables Parameters of encoder. optimizer : (loss, list of shared variables) -> dict or OrderedDict A function that returns parameter updates given loss and shared variables of parameters. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when :code:`optimizer` is set. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when :code:`optimizer` is set. random_seed : int Seed to initialize random state. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. References ---------- - Kingma, D. P., & Welling, M. (2014). Auto-Encoding Variational Bayes. stat, 1050, 1. - Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. (2015). Automatic variational inference in Stan. In Advances in neural information processing systems (pp. 568-576). - Blundell, C., Cornebise, J., Kavukcuoglu, K., & Wierstra, D. (2015). Weight Uncertainty in Neural Network. In Proceedings of the 32nd International Conference on Machine Learning (ICML-15) (pp. 1613-1622). 
""" import warnings warnings.warn('Old ADVI interface is deprecated and be removed in future, use pm.ADVI instead', DeprecationWarning, stacklevel=2) if encoder_params is None: encoder_params = [] model = pm.modelcontext(model) vars = inputvars(vars if vars is not None else model.vars) start = start if start is not None else model.test_point if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') _check_minibatches(minibatch_tensors, minibatches) if encoder_params is None: encoder_params = [] # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # For backward compatibility in how input arguments are given local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs, minibatch_tensors, total_size) # Replace local_RVs with transformed variables def get_transformed(v): if hasattr(v, 'transformed'): return v.transformed return v local_RVs = OrderedDict( [(get_transformed(v), (uw, s)) for v, (uw, s) in local_RVs.items()] ) # Get global variables grvs = list(set(vars) - set(list(local_RVs) + list(observed_RVs))) if global_RVs is None: global_RVs = OrderedDict({v: 1 for v in grvs}) _value_error(len(grvs) == len(global_RVs), 'global_RVs ({}) must have all global RVs: {}'.format( [v for v in global_RVs], grvs) ) # ELBO wrt variational parameters elbo, uw_l, uw_g = _make_elbo_t(observed_RVs, global_RVs, local_RVs, model.potentials, n_mcsamples, random_seed) # Replacements tensors of variational parameters in the graph replaces = dict() # Variational parameters for global RVs if 0 < len(global_RVs): uw_global_shared, bij = _init_uw_global_shared(start, global_RVs) replaces.update({uw_g: uw_global_shared}) # Variational parameters for local RVs, encoded from samples in # mini-batches if 0 < len(local_RVs): uws = [uw for _, (uw, _) in local_RVs.items()] uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] + [uw[1].ravel() for uw in uws]) replaces.update({uw_l: uw_local_encoded}) # Replace tensors of variational parameters in ELBO elbo = theano.clone(elbo, OrderedDict(replaces), strict=False) # Replace input shared variables with tensors def is_shared(t): return isinstance(t, theano.compile.sharedvalue.SharedVariable) tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors] updates = OrderedDict( {t: t_ for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)} ) elbo = theano.clone(elbo, updates, strict=False) # Create parameter update function used in the training loop params = encoder_params if 0 < len(global_RVs): params += [uw_global_shared] updates = OrderedDict(optimizer(loss=-1 * elbo, param=params)) f = theano.function(tensors, elbo, updates=updates, mode=mode) # Optimization loop elbos = np.empty(n) progress = tqdm.trange(n) for i in progress: e = f(*next(minibatches)) if np.isnan(e): raise FloatingPointError('NaN occurred in ADVI optimization.') elbos[i] = e if n < 10: progress.set_description('ELBO = {:,.2f}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = infmean(elbos[i - n // 10:i]) progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo)) pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1])) # Variational parameters of global RVs if 0 < len(global_RVs): l = int(uw_global_shared.get_value(borrow=True).size / 2) u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l]) w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) else: u = dict() w = 
dict() return ADVIFit(u, w, elbos)
def sample_vp(vparams, draws=1000, model=None, local_RVs=None, random_seed=None, include_transformed=False, progressbar=True): """Draw samples from variational posterior. Parameters ---------- vparams : dict or pymc3.variational.ADVIFit Estimated variational parameters of the model. draws : int Number of random samples. model : pymc3.Model Probabilistic model. random_seed : int or None Seed of random number generator. None to use current seed. include_transformed : bool If True, transformed variables are also sampled. Default is False. Returns ------- trace : pymc3.backends.base.MultiTrace Samples drawn from the variational posterior. """ import warnings warnings.warn( 'Old ADVI interface and sample_vp is deprecated and will ' 'be removed in future, use pm.fit and pm.sample_approx instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if isinstance(vparams, ADVIFit): vparams = {'means': vparams.means, 'stds': vparams.stds} ds = model.deterministics def get_transformed(v): return v if v not in ds else v.transformed def rvs(x): return [get_transformed(v) for v in x] if x is not None else [] global_RVs = list(set(model.free_RVs) - set(rvs(local_RVs))) # Make dict for replacements of random variables if random_seed is None: r = MRG_RandomStreams(gen_random_state()) else: r = MRG_RandomStreams(seed=random_seed) updates = {} for v in global_RVs: u = theano.shared(vparams['means'][str(v)]).ravel() w = theano.shared(vparams['stds'][str(v)]).ravel() n = r.normal(size=u.tag.test_value.shape) updates.update({v: (n * w + u).reshape(v.tag.test_value.shape)}) if local_RVs is not None: for v_, (uw, _) in local_RVs.items(): v = get_transformed(v_) u = uw[0].ravel() w = uw[1].ravel() n = r.normal(size=u.tag.test_value.shape) updates.update( {v: (n * tt.exp(w) + u).reshape(v.tag.test_value.shape)}) # Replace some nodes of the graph with variational distributions vars = model.free_RVs samples = theano.clone(vars, updates) f = theano.function([], samples) # Random variables which will be sampled vars_sampled = pm.util.get_default_varnames( model.unobserved_RVs, include_transformed=include_transformed) varnames = [str(var) for var in model.unobserved_RVs] trace = pm.sampling.NDArray(model=model, vars=vars_sampled) trace.setup(draws=draws, chain=0) range_ = trange(draws) if progressbar else range(draws) for _ in range_: # 'point' is like {'var1': np.array(0.1), 'var2': np.array(0.2), ...} point = {varname: value for varname, value in zip(varnames, f())} trace.record(point) return MultiTrace([trace])
def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, inf_kwargs=None, **kwargs): R""" Handy shortcut for using inference methods in functional way Parameters ---------- n : `int` number of iterations local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or :class:`Inference` string name is case insensitive in: - 'advi' for ADVI - 'fullrank_advi' for FullRankADVI - 'advi->fullrank_advi' for fitting ADVI first and then FullRankADVI - 'svgd' for Stein Variational Gradient Descent - 'asvgd' for Amortized Stein Variational Gradient Descent - 'nfvi' for Normalizing Flow - 'nfvi=formula' for Normalizing Flow using formula model : :class:`Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one inf_kwargs : dict additional kwargs passed to :class:`Inference` start : `Point` starting point for inference Other Parameters ---------------- frac : `float` if method is 'advi->fullrank_advi' represents advi fraction when training kwargs : kwargs additional kwargs for :func:`Inference.fit` Returns ------- :class:`Approximation` """ if inf_kwargs is None: inf_kwargs = dict() if model is None: model = pm.modelcontext(model) _select = dict(advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD, nfvi=NFVI) if isinstance(method, str): method = method.lower() if method == 'advi->fullrank_advi': frac = kwargs.pop('frac', .5) if not 0. < frac < 1.: raise ValueError('frac should be in (0, 1)') n1 = int(n * frac) n2 = n - n1 inference = ADVI(local_rv=local_rv, model=model, random_seed=random_seed, start=start) logger.info('fitting advi ...') inference.fit(n1, **kwargs) inference = FullRankADVI.from_advi(inference) logger.info('fitting fullrank advi ...') return inference.fit(n2, **kwargs) elif method.startswith('nfvi='): formula = method[5:] inference = NFVI( formula, local_rv=local_rv, model=model, random_seed=random_seed, start= start, # ignored by now, hope I'll find a good application for this argument **inf_kwargs) elif method in _select: inference = _select[method](local_rv=local_rv, model=model, random_seed=random_seed, start=start, **inf_kwargs) else: raise KeyError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) elif isinstance(method, Inference): inference = method else: raise TypeError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) return inference.fit(n, **kwargs)
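A sketch of the staged schedule this version adds: run mean-field ADVI for a fraction of the iterations and refine with full-rank ADVI (placeholder model; `frac` is the keyword consumed by this code path):

import numpy as np
import pymc3 as pm

with pm.Model():
    mu = pm.Normal("mu", mu=0.0, sd=10.0, shape=2)
    pm.Normal("obs", mu=mu, sd=1.0, observed=np.random.randn(100, 2))

    approx = pm.fit(n=30000, method="advi->fullrank_advi", frac=0.5)
    trace = approx.sample(1000)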
def sample_numpyro_nuts( draws=1000, tune=1000, chains=4, target_accept=0.8, random_seed=10, model=None, progress_bar=True, keep_untransformed=False, ): from numpyro.infer import MCMC, NUTS from pymc3 import modelcontext model = modelcontext(model) seed = jax.random.PRNGKey(random_seed) fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, [model.logpt]) fns = jax_funcify(fgraph) logp_fn_jax = fns[0] rv_names = [rv.name for rv in model.free_RVs] init_state = [model.test_point[rv_name] for rv_name in rv_names] init_state_batched = jax.tree_map(lambda x: np.repeat(x[None, ...], chains, axis=0), init_state) @jax.jit def _sample(current_state, seed): step_size = jax.tree_map(jax.numpy.ones_like, init_state) nuts_kernel = NUTS( potential_fn=lambda x: -logp_fn_jax(*x), # model=model, target_accept_prob=target_accept, adapt_step_size=True, adapt_mass_matrix=True, dense_mass=False, ) pmap_numpyro = MCMC( nuts_kernel, num_warmup=tune, num_samples=draws, num_chains=chains, postprocess_fn=None, chain_method="parallel", progress_bar=progress_bar, ) pmap_numpyro.run(seed, init_params=current_state, extra_fields=("num_steps",)) samples = pmap_numpyro.get_samples(group_by_chain=True) leapfrogs_taken = pmap_numpyro.get_extra_fields(group_by_chain=True)["num_steps"] return samples, leapfrogs_taken print("Compiling...") tic2 = pd.Timestamp.now() map_seed = jax.random.split(seed, chains) mcmc_samples, leapfrogs_taken = _sample(init_state_batched, map_seed) # map_seed = jax.random.split(seed, chains) # mcmc_samples = _sample(init_state_batched, map_seed) # tic4 = pd.Timestamp.now() # print("Sampling time = ", tic4 - tic3) posterior = {k: v for k, v in zip(rv_names, mcmc_samples)} tic3 = pd.Timestamp.now() posterior = _transform_samples(posterior, model, keep_untransformed=keep_untransformed) tic4 = pd.Timestamp.now() az_trace = az.from_dict(posterior=posterior) print("Compilation + sampling time = ", tic3 - tic2) print("Transformation time = ", tic4 - tic3) return az_trace # , leapfrogs_taken, tic3 - tic2
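A hedged sketch of calling the experimental NumPyro sampler defined above; it assumes jax and numpyro are installed and the model is built in the usual `with` context (placeholder model):

import numpy as np
import pymc3 as pm

with pm.Model() as model:
    mu = pm.Normal("mu", mu=0.0, sigma=10.0)
    pm.Normal("obs", mu=mu, sigma=1.0, observed=np.random.randn(100))

    az_trace = sample_numpyro_nuts(draws=1000, tune=1000, chains=2)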
def __init__(self, vars, model=None, **kwargs): model = pm.modelcontext(model) self.model = model vars = pm.inputvars(vars) super(ConstantStep, self).__init__(vars, [model.fastlogp], **kwargs)
def sample_tfp_nuts( draws=1000, tune=1000, chains=4, target_accept=0.8, random_seed=10, model=None, num_tuning_epoch=2, num_compute_step_size=500, ): import jax from tensorflow_probability.substrates import jax as tfp model = modelcontext(model) seed = jax.random.PRNGKey(random_seed) fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, [model.logpt]) fns = jax_funcify(fgraph) logp_fn_jax = fns[0] rv_names = [rv.name for rv in model.free_RVs] init_state = [model.test_point[rv_name] for rv_name in rv_names] init_state_batched = jax.tree_map(lambda x: np.repeat(x[None, ...], chains, axis=0), init_state) @jax.pmap def _sample(init_state, seed): def gen_kernel(step_size): hmc = tfp.mcmc.NoUTurnSampler(target_log_prob_fn=logp_fn_jax, step_size=step_size) return tfp.mcmc.DualAveragingStepSizeAdaptation( hmc, tune // num_tuning_epoch, target_accept_prob=target_accept ) def trace_fn(_, pkr): return pkr.new_step_size def get_tuned_stepsize(samples, step_size): return step_size[-1] * jax.numpy.std(samples[-num_compute_step_size:]) step_size = jax.tree_map(jax.numpy.ones_like, init_state) for i in range(num_tuning_epoch - 1): tuning_hmc = gen_kernel(step_size) init_samples, tuning_result, kernel_results = tfp.mcmc.sample_chain( num_results=tune // num_tuning_epoch, current_state=init_state, kernel=tuning_hmc, trace_fn=trace_fn, return_final_kernel_results=True, seed=seed, ) step_size = jax.tree_multimap(get_tuned_stepsize, list(init_samples), tuning_result) init_state = [x[-1] for x in init_samples] # Run inference sample_kernel = gen_kernel(step_size) mcmc_samples, leapfrog_num = tfp.mcmc.sample_chain( num_results=draws, num_burnin_steps=tune // num_tuning_epoch, current_state=init_state, kernel=sample_kernel, trace_fn=lambda _, pkr: pkr.inner_results.leapfrogs_taken, seed=seed, ) return mcmc_samples, leapfrog_num print("Compiling...") tic2 = pd.Timestamp.now() map_seed = jax.random.split(seed, chains) mcmc_samples, leapfrog_num = _sample(init_state_batched, map_seed) # map_seed = jax.random.split(seed, chains) # mcmc_samples = _sample(init_state_batched, map_seed) # tic4 = pd.Timestamp.now() # print("Sampling time = ", tic4 - tic3) posterior = {k: v for k, v in zip(rv_names, mcmc_samples)} az_trace = az.from_dict(posterior=posterior) tic3 = pd.Timestamp.now() print("Compilation + sampling time = ", tic3 - tic2) return az_trace # , leapfrog_num, tic3 - tic2
def __init__(self, var, model=None, values=[0, 1]):
    # Resolve the model from the context stack when it is not passed explicitly.
    model = pm.modelcontext(model)
    self.values = values
    super(RandomScanDiscreteMetropolis, self).__init__([var], [model.fastlogp])
def init_nuts(init='ADVI', njobs=1, n_init=500000, model=None, random_seed=-1, progressbar=True, **kwargs): """Initialize and sample from posterior of a continuous model. This is a convenience function. NUTS convergence and sampling speed is extremely dependent on the choice of mass/scaling matrix. In our experience, using ADVI to estimate a diagonal covariance matrix and using this as the scaling matrix produces robust results over a wide class of continuous models. Parameters ---------- init : str {'ADVI', 'ADVI_MAP', 'MAP', 'NUTS'} Initialization method to use. * ADVI : Run ADVI to estimate posterior mean and diagonal covariance matrix. * ADVI_MAP: Initialize ADVI with MAP and use MAP as starting point. * MAP : Use the MAP as starting point. * NUTS : Run NUTS and estimate posterior mean and covariance matrix. njobs : int Number of parallel jobs to start. n_init : int Number of iterations of initializer If 'ADVI', number of iterations, if 'metropolis', number of draws. model : Model (optional if in `with` context) progressbar : bool Whether or not to display a progressbar for advi sampling. **kwargs : keyword arguments Extra keyword arguments are forwarded to pymc3.NUTS. Returns ------- start : pymc3.model.Point Starting point for sampler nuts_sampler : pymc3.step_methods.NUTS Instantiated and initialized NUTS sampler object """ model = pm.modelcontext(model) pm._log.info('Initializing NUTS using {}...'.format(init)) random_seed = int(np.atleast_1d(random_seed)[0]) if init is not None: init = init.lower() if init == 'advi': v_params = pm.variational.advi(n=n_init, random_seed=random_seed, progressbar=progressbar) start = pm.variational.sample_vp(v_params, njobs, progressbar=False, hide_transformed=False, random_seed=random_seed) if njobs == 1: start = start[0] cov = np.power(model.dict_to_array(v_params.stds), 2) elif init == 'advi_map': start = pm.find_MAP() v_params = pm.variational.advi(n=n_init, start=start, random_seed=random_seed) cov = np.power(model.dict_to_array(v_params.stds), 2) elif init == 'map': start = pm.find_MAP() cov = pm.find_hessian(point=start) elif init == 'nuts': init_trace = pm.sample(step=pm.NUTS(), draws=n_init, random_seed=random_seed)[n_init // 2:] cov = np.atleast_1d(pm.trace_cov(init_trace)) start = np.random.choice(init_trace, njobs) if njobs == 1: start = start[0] else: raise NotImplementedError( 'Initializer {} is not supported.'.format(init)) step = pm.NUTS(scaling=cov, is_cov=True, **kwargs) return start, step
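# Sketch of how this (older) initializer was consumed: obtain a start point and
# a pre-scaled NUTS step, then hand both to pm.sample. The model is
# illustrative and n_init is kept small for the example.
import numpy as np
import pymc3 as pm

data = np.random.normal(size=100)

with pm.Model():
    mu = pm.Normal("mu", 0.0, 10.0)
    pm.Normal("obs", mu=mu, sd=1.0, observed=data)

    start, step = init_nuts(init='advi', n_init=20000, njobs=1)
    trace = pm.sample(1000, step=step, start=start)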
def get_step_for_trace(trace=None, model=None, diag=False, regularize=True, regular_window=5, regular_variance=1e-3, **kwargs): """ Define a tuning procedure that adapts off-diagonal mass matrix terms adapted from a blog post by Dan Foreman-Mackey here: https://dfm.io/posts/pymc3-mass-matrix/ Args: trace (trace): pymc3 trace object model (model): pymc3 model object diag (bool): flag to tune only the diagonal elements regularize (bool): flag to turn on covariance matrix regularization regular_window (int): size of parameter space at which regularization becomes important regular_variance (float): magnitude of covariance floor Returns: pymc3 step_methods object """ model = pm.modelcontext(model) # If not given, use the trivial metric if trace is None: potential = pm.step_methods.hmc.quadpotential.QuadPotentialFull( np.eye(model.ndim)) return pm.NUTS(potential=potential, **kwargs) # Loop over samples and convert to the relevant parameter space # while removing divergent samples div_mask = np.invert(np.copy(trace.diverging)) samples = np.empty((div_mask.sum() * trace.nchains, model.ndim)) i = 0 imask = 0 for chain in trace._straces.values(): for p in chain: if div_mask[imask]: samples[i] = model.bijection.map(p) i += 1 imask += 1 # Compute the sample covariance cov = np.cov(samples, rowvar=0) if diag: cov = np.diag(np.diag(cov)) # Stan uses a regularized estimator for the covariance matrix to # be less sensitive to numerical issues for large parameter spaces. if regularize: N = len(samples) cov = cov * N / (N + regular_window) cov[np.diag_indices_from( cov)] += regular_variance * regular_window / (N + regular_window) # Use the sample covariance as the inverse metric potential = pm.step_methods.hmc.quadpotential.QuadPotentialFull(cov) return pm.NUTS(potential=potential, **kwargs)
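# Sketch of the iterative schedule this helper targets (following the linked
# blog post): run short chains, re-estimate a dense mass matrix from the last
# trace, and finish with a full run. The toy model and schedule lengths are
# illustrative.
import numpy as np
import pymc3 as pm

with pm.Model():
    x = pm.Normal("x", 0.0, 1.0, shape=5)
    pm.Normal("y", mu=x.sum(), sd=1.0, observed=2.5)

    trace = None
    for steps in (25, 50, 100, 200):
        step = get_step_for_trace(trace)
        trace = pm.sample(draws=steps, tune=steps, step=step, chains=2,
                          compute_convergence_checks=False)

    # final run with the last tuned potential
    step = get_step_for_trace(trace)
    trace = pm.sample(draws=1000, tune=1000, step=step, chains=2)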
def modelcontext(model=None): return pm.modelcontext(model)
def init_nuts(init='auto', chains=1, n_init=500000, model=None, random_seed=None, progressbar=True, **kwargs): """Set up the mass matrix initialization for NUTS. NUTS convergence and sampling speed is extremely dependent on the choice of mass/scaling matrix. This function implements different methods for choosing or adapting the mass matrix. Parameters ---------- init : str Initialization method to use. * auto : Choose a default initialization method automatically. Currently, this is `'jitter+adapt_diag'`, but this can change in the future. If you depend on the exact behaviour, choose an initialization method explicitly. * adapt_diag : Start with a identity mass matrix and then adapt a diagonal based on the variance of the tuning samples. All chains use the test value (usually the prior mean) as starting point. * jitter+adapt_diag : Same as `adapt_diag`, but add uniform jitter in [-1, 1] to the starting point in each chain. * advi+adapt_diag : Run ADVI and then adapt the resulting diagonal mass matrix based on the sample variance of the tuning samples. * advi+adapt_diag_grad : Run ADVI and then adapt the resulting diagonal mass matrix based on the variance of the gradients during tuning. This is **experimental** and might be removed in a future release. * advi : Run ADVI to estimate posterior mean and diagonal mass matrix. * advi_map: Initialize ADVI with MAP and use MAP as starting point. * map : Use the MAP as starting point. This is discouraged. * nuts : Run NUTS and estimate posterior mean and mass matrix from the trace. chains : int Number of jobs to start. n_init : int Number of iterations of initializer If 'ADVI', number of iterations, if 'nuts', number of draws. model : Model (optional if in `with` context) progressbar : bool Whether or not to display a progressbar for advi sampling. **kwargs : keyword arguments Extra keyword arguments are forwarded to pymc3.NUTS. Returns ------- start : pymc3.model.Point Starting point for sampler nuts_sampler : pymc3.step_methods.NUTS Instantiated and initialized NUTS sampler object """ model = pm.modelcontext(model) vars = kwargs.get('vars', model.vars) if set(vars) != set(model.vars): raise ValueError('Must use init_nuts on all variables of a model.') if not pm.model.all_continuous(vars): raise ValueError('init_nuts can only be used for models with only ' 'continuous variables.') if not isinstance(init, str): raise TypeError('init must be a string.') if init is not None: init = init.lower() if init == 'auto': init = 'jitter+adapt_diag' pm._log.info('Initializing NUTS using {}...'.format(init)) if random_seed is not None: random_seed = int(np.atleast_1d(random_seed)[0]) np.random.seed(random_seed) cb = [ pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff='absolute'), pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff='relative'), ] if init == 'adapt_diag': start = [model.test_point] * chains mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0) var = np.ones_like(mean) potential = quadpotential.QuadPotentialDiagAdapt( model.ndim, mean, var, 10) elif init == 'jitter+adapt_diag': start = [] for _ in range(chains): mean = {var: val.copy() for var, val in model.test_point.items()} for val in mean.values(): val[...] 
+= 2 * np.random.rand(*val.shape) - 1 start.append(mean) mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0) var = np.ones_like(mean) potential = quadpotential.QuadPotentialDiagAdapt( model.ndim, mean, var, 10) elif init == 'advi+adapt_diag_grad': approx = pm.fit( random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window, ) # type: pm.MeanField start = approx.sample(draws=chains) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds)**2 mean = approx.bij.rmap(approx.mean.get_value()) mean = model.dict_to_array(mean) weight = 50 potential = quadpotential.QuadPotentialDiagAdaptGrad( model.ndim, mean, cov, weight) elif init == 'advi+adapt_diag': approx = pm.fit( random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window, ) # type: pm.MeanField start = approx.sample(draws=chains) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds)**2 mean = approx.bij.rmap(approx.mean.get_value()) mean = model.dict_to_array(mean) weight = 50 potential = quadpotential.QuadPotentialDiagAdapt( model.ndim, mean, cov, weight) elif init == 'advi': approx = pm.fit(random_seed=random_seed, n=n_init, method='advi', model=model, callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window) # type: pm.MeanField start = approx.sample(draws=chains) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds)**2 potential = quadpotential.QuadPotentialDiag(cov) elif init == 'advi_map': start = pm.find_MAP(include_transformed=True) approx = pm.MeanField(model=model, start=start) pm.fit(random_seed=random_seed, n=n_init, method=pm.KLqp(approx), callbacks=cb, progressbar=progressbar, obj_optimizer=pm.adagrad_window) start = approx.sample(draws=chains) start = list(start) stds = approx.bij.rmap(approx.std.eval()) cov = model.dict_to_array(stds)**2 potential = quadpotential.QuadPotentialDiag(cov) elif init == 'map': start = pm.find_MAP(include_transformed=True) cov = pm.find_hessian(point=start) start = [start] * chains potential = quadpotential.QuadPotentialFull(cov) elif init == 'nuts': init_trace = pm.sample(draws=n_init, step=pm.NUTS(), tune=n_init // 2, random_seed=random_seed) cov = np.atleast_1d(pm.trace_cov(init_trace)) start = list(np.random.choice(init_trace, chains)) potential = quadpotential.QuadPotentialFull(cov) else: raise NotImplementedError( 'Initializer {} is not supported.'.format(init)) step = pm.NUTS(potential=potential, model=model, **kwargs) return start, step
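# This initializer is normally reached through pm.sample(init=...), but it can
# also be called directly to get a start point plus a NUTS step carrying the
# chosen potential; a minimal sketch with an illustrative model:
import numpy as np
import pymc3 as pm

data = np.random.normal(size=100)

with pm.Model():
    mu = pm.Normal("mu", 0.0, 10.0)
    pm.Normal("obs", mu=mu, sd=1.0, observed=data)

    start, step = init_nuts(init='jitter+adapt_diag', chains=2, n_init=20000)
    trace = pm.sample(1000, tune=1000, step=step, start=start, chains=2)

    # equivalent high-level path:
    # trace = pm.sample(1000, tune=1000, init='jitter+adapt_diag', chains=2)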
def fit(n=10000, local_rv=None, method='advi', model=None, seed=None, start=None, **kwargs): """ Handy shortcut for using inference methods in functional way Parameters ---------- n : int number of iterations local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or Inference string name is case insensitive in {'advi', 'fullrank_advi', 'advi->fullrank_advi'} model : Model kwargs : kwargs for Inference.fit frac : float if method is 'advi->fullrank_advi' represents advi fraction when training seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one start : Point starting point for inference Returns ------- Approximation """ if model is None: model = pm.modelcontext(model) _select = dict( advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD ) if isinstance(method, str) and method.lower() == 'advi->fullrank_advi': frac = kwargs.pop('frac', .5) if not 0. < frac < 1.: raise ValueError('frac should be in (0, 1)') n1 = int(n * frac) n2 = n-n1 inference = ADVI(local_rv=local_rv, model=model, seed=seed, start=start) logger.info('fitting advi ...') inference.fit(n1, **kwargs) inference = FullRankADVI.from_advi(inference) logger.info('fitting fullrank advi ...') return inference.fit(n2, **kwargs) elif isinstance(method, str): try: inference = _select[method.lower()]( local_rv=local_rv, model=model, seed=seed, start=start ) except KeyError: raise KeyError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) elif isinstance(method, Inference): inference = method else: raise TypeError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) return inference.fit(n, **kwargs)
def fit(n=10000, local_rv=None, method='advi', model=None, random_seed=None, start=None, inf_kwargs=None, **kwargs): R""" Handy shortcut for using inference methods in functional way Parameters ---------- n : `int` number of iterations local_rv : dict[var->tuple] mapping {model_variable -> local_variable (:math:`\mu`, :math:`\rho`)} Local Vars are used for Autoencoding Variational Bayes See (AEVB; Kingma and Welling, 2014) for details method : str or :class:`Inference` string name is case insensitive in: - 'advi' for ADVI - 'fullrank_advi' for FullRankADVI - 'advi->fullrank_advi' for fitting ADVI first and then FullRankADVI - 'svgd' for Stein Variational Gradient Descent - 'asvgd' for Amortized Stein Variational Gradient Descent - 'nfvi' for Normalizing Flow - 'nfvi=formula' for Normalizing Flow using formula model : :class:`Model` PyMC3 model for inference random_seed : None or int leave None to use package global RandomStream or other valid value to create instance specific one inf_kwargs : dict additional kwargs passed to :class:`Inference` start : `Point` starting point for inference Other Parameters ---------------- frac : `float` if method is 'advi->fullrank_advi' represents advi fraction when training kwargs : kwargs additional kwargs for :func:`Inference.fit` Returns ------- :class:`Approximation` """ if inf_kwargs is None: inf_kwargs = dict() if model is None: model = pm.modelcontext(model) _select = dict( advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD, nfvi=NFVI ) if isinstance(method, str): method = method.lower() if method == 'advi->fullrank_advi': frac = kwargs.pop('frac', .5) if not 0. < frac < 1.: raise ValueError('frac should be in (0, 1)') n1 = int(n * frac) n2 = n - n1 inference = ADVI( local_rv=local_rv, model=model, random_seed=random_seed, start=start) logger.info('fitting advi ...') inference.fit(n1, **kwargs) inference = FullRankADVI.from_advi(inference) logger.info('fitting fullrank advi ...') return inference.fit(n2, **kwargs) elif method.startswith('nfvi='): formula = method[5:] inference = NFVI( formula, local_rv=local_rv, model=model, random_seed=random_seed, start=start, # ignored by now, hope I'll find a good application for this argument **inf_kwargs ) elif method in _select: inference = _select[method]( local_rv=local_rv, model=model, random_seed=random_seed, start=start, **inf_kwargs ) else: raise KeyError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) elif isinstance(method, Inference): inference = method else: raise TypeError('method should be one of %s ' 'or Inference instance' % set(_select.keys())) return inference.fit(n, **kwargs)
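# Minimal sketch of the functional VI shortcut defined above: fit a mean-field
# approximation, draw from it, and optionally chain into a full-rank fit.
import numpy as np
import pymc3 as pm

data = np.random.normal(size=100)

with pm.Model():
    mu = pm.Normal("mu", 0.0, 10.0)
    pm.Normal("obs", mu=mu, sd=1.0, observed=data)

    approx = fit(n=20000, method='advi')              # MeanField approximation
    trace = approx.sample(1000)                       # draws from the fitted q

    approx_fr = fit(n=20000, method='advi->fullrank_advi')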
def __init__( self, coarse_models: List[Model], vars: Optional[list] = None, base_S: Optional = None, base_proposal_dist: Optional[Type[Proposal]] = None, base_scaling: Union[float, int] = 1.0, tune: bool = True, base_tune_interval: int = 100, model: Optional[Model] = None, mode: Optional = None, subsampling_rates: List[int] = 5, base_blocked: bool = False, **kwargs, ) -> None: warnings.warn("The MLDA implementation in PyMC3 is very young. " "You should be extra critical about its results.") model = pm.modelcontext(model) # assign internal state self.coarse_models = coarse_models if not isinstance(coarse_models, list): raise ValueError( "MLDA step method cannot use coarse_models if it is not a list" ) if len(self.coarse_models) == 0: raise ValueError("MLDA step method was given an empty " "list of coarse models. Give at least " "one coarse model.") if isinstance(subsampling_rates, int): self.subsampling_rates = [subsampling_rates] * len( self.coarse_models) else: if len(subsampling_rates) != len(self.coarse_models): raise ValueError( f"List of subsampling rates needs to have the same " f"length as list of coarse models but the lengths " f"were {len(subsampling_rates)}, {len(self.coarse_models)}" ) self.subsampling_rates = subsampling_rates self.num_levels = len(self.coarse_models) + 1 self.base_S = base_S self.base_proposal_dist = base_proposal_dist self.base_scaling = base_scaling self.tune = tune self.base_tune_interval = base_tune_interval self.model = model self.next_model = self.coarse_models[-1] self.mode = mode self.base_blocked = base_blocked self.base_scaling_stats = None # Process model variables if vars is None: vars = model.vars vars = pm.inputvars(vars) self.vars = vars self.var_names = [var.name for var in self.vars] self.accepted = 0 # Construct theano function for current-level model likelihood # (for use in acceptance) shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) # Construct theano function for next-level model likelihood # (for use in acceptance) next_model = pm.modelcontext(self.next_model) vars_next = [ var for var in next_model.vars if var.name in self.var_names ] vars_next = pm.inputvars(vars_next) shared_next = pm.make_shared_replacements(vars_next, next_model) self.delta_logp_next = delta_logp(next_model.logpt, vars_next, shared_next) super().__init__(vars, shared) # initialise complete step method hierarchy if self.num_levels == 2: with self.next_model: # make sure the correct variables are selected from next_model vars_next = [ var for var in self.next_model.vars if var.name in self.var_names ] # MetropolisMLDA sampler in base level (level=0), targeting self.next_model self.next_step_method = pm.MetropolisMLDA( vars=vars_next, proposal_dist=self.base_proposal_dist, S=self.base_S, scaling=self.base_scaling, tune=self.tune, tune_interval=self.base_tune_interval, model=None, blocked=self.base_blocked, ) else: # drop the last coarse model next_coarse_models = self.coarse_models[:-1] next_subsampling_rates = self.subsampling_rates[:-1] with self.next_model: # make sure the correct variables are selected from next_model vars_next = [ var for var in self.next_model.vars if var.name in self.var_names ] # MLDA sampler in some intermediate level, targeting self.next_model self.next_step_method = pm.MLDA( vars=vars_next, base_S=self.base_S, base_proposal_dist=self.base_proposal_dist, base_scaling=self.base_scaling, tune=self.tune, base_tune_interval=self.base_tune_interval, model=None, mode=self.mode, 
subsampling_rates=next_subsampling_rates, coarse_models=next_coarse_models, base_blocked=self.base_blocked, **kwargs, ) # instantiate the recursive DA proposal. # this is the main proposal used for # all levels (Recursive Delayed Acceptance) # (except for level 0 where the step method is MetropolisMLDA and not MLDA) self.proposal_dist = RecursiveDAProposal( self.next_step_method, self.next_model, self.tune, self.subsampling_rates[-1], )
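# Minimal sketch of feeding the MLDA constructor above: one cheap coarse model
# plus the full fine model. Free-variable names must match across levels; the
# data and the coarsening choice (thinned observations) are illustrative.
import numpy as np
import pymc3 as pm

y = np.random.normal(1.0, 1.0, size=200)

with pm.Model() as coarse_model:
    theta = pm.Normal("theta", 0.0, 10.0)
    pm.Normal("y", mu=theta, sigma=1.0, observed=y[::10])   # thinned data

with pm.Model():
    theta = pm.Normal("theta", 0.0, 10.0)
    pm.Normal("y", mu=theta, sigma=1.0, observed=y)
    step = pm.MLDA(coarse_models=[coarse_model], subsampling_rates=5)
    trace = pm.sample(draws=500, tune=200, step=step, chains=2)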
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, mode=None, tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. tol_obj : float Relative tolerance for testing convergence of ELBO. eval_elbo : int Window for checking convergence of ELBO. Convergence will be checked for every multiple of eval_elbo. random_seed : int or None Seed to initialize random state. None uses current seed. mode : string or `Mode` instance. Compilation mode passed to Theano functions progressbar : bool Whether or not to display a progress bar in the command line. The bar shows the percentage of completion, the sampling speed in samples per second (SPS), the estimated remaining time until completion ("expected time of arrival"; ETA), and the current ELBO. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" import warnings warnings.warn('Old ADVI interface and sample_vp is deprecated and will ' 'be removed in future, use pm.fit and pm.sample_approx instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if len(vars) == 0: raise ValueError('No free random variables to fit.') if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode) # For tracking convergence of ELBO window_size = int(max(0.1 * n // eval_elbo, 2.0)) circ_buff = deque([], maxlen=window_size) # Optimization loop elbos = np.empty(n) divergence_flag = False progress = trange(n) if progressbar else range(n) try: uw_i, elbo_current = f() if np.isnan(elbo_current): raise FloatingPointError('NaN occurred in ADVI optimization.') for i in progress: uw_i, e = f() if np.isnan(e): raise FloatingPointError('NaN occurred in ADVI optimization.') elbos[i] = e if progressbar: if n < 10: progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = infmean(elbos[i - n // 10:i]) progress.set_description( 'Average ELBO = {:,.5g}'.format(avg_elbo)) if i % eval_elbo == 0: elbo_prev = elbo_current elbo_current = elbos[i] delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev) circ_buff.append(delta_elbo) avg_delta = np.mean(circ_buff) med_delta = np.median(circ_buff) if i > 0 and avg_delta < tol_obj: pm._log.info('Mean ELBO converged.') elbos = elbos[:(i + 1)] break elif i > 0 and med_delta < tol_obj: pm._log.info('Median ELBO converged.') elbos = elbos[:(i + 1)] break if i > 10 * eval_elbo: if med_delta > 0.5 or avg_delta > 0.5: divergence_flag = True else: divergence_flag = False except KeyboardInterrupt: elbos = elbos[:i] if n < 10: pm._log.info('Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( i, 100 * i // n, elbos[i])) else: avg_elbo = infmean(elbos[i - n // 10:i]) pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( i, 100 * i // n, avg_elbo)) else: if n < 10: pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) else: avg_elbo = infmean(elbos[-n // 10:]) pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) finally: if progressbar: progress.close() if divergence_flag: pm._log.info('Evidence of divergence detected, inspect ELBO.') # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, mode=None, tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. tol_obj : float Relative tolerance for testing convergence of ELBO. eval_elbo : int Window for checking convergence of ELBO. Convergence will be checked for every multiple of eval_elbo. random_seed : int or None Seed to initialize random state. None uses current seed. mode : string or `Mode` instance. Compilation mode passed to Theano functions progressbar : bool Whether or not to display a progress bar in the command line. The bar shows the percentage of completion, the sampling speed in samples per second (SPS), the estimated remaining time until completion ("expected time of arrival"; ETA), and the current ELBO. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" import warnings warnings.warn( 'Old ADVI interface and sample_vp is deprecated and will ' 'be removed in future, use pm.fit and pm.sample_approx instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if len(vars) == 0: raise ValueError('No free random variables to fit.') if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode) # For tracking convergence of ELBO window_size = int(max(0.1 * n // eval_elbo, 2.0)) circ_buff = deque([], maxlen=window_size) # Optimization loop elbos = np.empty(n) divergence_flag = False progress = trange(n) if progressbar else range(n) try: uw_i, elbo_current = f() if np.isnan(elbo_current): raise FloatingPointError('NaN occurred in ADVI optimization.') for i in progress: uw_i, e = f() if np.isnan(e): raise FloatingPointError('NaN occurred in ADVI optimization.') elbos[i] = e if progressbar: if n < 10: progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = infmean(elbos[i - n // 10:i]) progress.set_description( 'Average ELBO = {:,.5g}'.format(avg_elbo)) if i % eval_elbo == 0: elbo_prev = elbo_current elbo_current = elbos[i] delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev) circ_buff.append(delta_elbo) avg_delta = np.mean(circ_buff) med_delta = np.median(circ_buff) if i > 0 and avg_delta < tol_obj: pm._log.info('Mean ELBO converged.') elbos = elbos[:(i + 1)] break elif i > 0 and med_delta < tol_obj: pm._log.info('Median ELBO converged.') elbos = elbos[:(i + 1)] break if i > 10 * eval_elbo: if med_delta > 0.5 or avg_delta > 0.5: divergence_flag = True else: divergence_flag = False except KeyboardInterrupt: elbos = elbos[:i] if n < 10: pm._log.info( 'Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( i, 100 * i // n, elbos[i])) else: avg_elbo = infmean(elbos[i - n // 10:i]) pm._log.info( 'Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'. format(i, 100 * i // n, avg_elbo)) else: if n < 10: pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) else: avg_elbo = infmean(elbos[-n // 10:]) pm._log.info( 'Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) finally: if progressbar: progress.close() if divergence_flag: pm._log.info('Evidence of divergence detected, inspect ELBO.') # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. random_seed : int or None Seed to initialize random state. None uses current seed. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if not pm.model.all_continuous(vars): raise ValueError('Model should not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates) # Optimization loop elbos = np.empty(n) try: progress = trange(n) for i in progress: uw_i, e = f() elbos[i] = e if i % (n // 10) == 0 and i > 0: avg_elbo = elbos[i - n // 10:i].mean() progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo)) except KeyboardInterrupt: elbos = elbos[:i] avg_elbo = elbos[i - n // 10:].mean() pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( i, 100 * i // n, avg_elbo)) else: avg_elbo = elbos[-n // 10:].mean() pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def ATMIP_sample(n_steps, step=None, start=None, trace=None, chain=0, stage=None, njobs=1, tune=None, progressbar=False, model=None, random_seed=None): """ (C)ATMIP sampling algorithm from Minson et al. 2013: Bayesian inversion for finite fault earthquake source models I- Theory and algorithm (without cascading- C) https://gji.oxfordjournals.org/content/194/3/1701.full Samples the solution space with n_chains of Metropolis chains, where each chain has n_steps iterations. Once finished, the sampled traces are evaluated: (1) Based on the likelihoods of the final samples, chains are weighted (2) the weighted covariance of the ensemble is calculated and set as new proposal distribution (3) the variation in the ensemble is calculated and the next tempering parameter (beta) calculated (4) New n_chains Metropolis chains are seeded on the traces with high weight for n_steps iterations (5) Repeat until beta > 1. Parameters ---------- n_steps : int The number of samples to draw for each Markov-chain per stage step : function from TMCMC initialisation start : List of dicts with length(n_chains) Starting points in parameter space (or partial point) Defaults to random draws from variables (defaults to empty dict) trace : backend This should be a backend instance. Passing either "text" or "sqlite" is taken as a shortcut to set up the corresponding backend (with "mcmc" used as the base name). chain : int Chain number used to store sample in backend. If `njobs` is greater than one, chain numbers will start here. stage : int Stage where to start or continue the calculation. If None the start will be at stage = 0. njobs : int The number of cores to be used in parallel. Be aware that theano has internal parallelisation. Sometimes this is more efficient especially for simple models. step.n_chains / njobs has to be an integer number! tune : int Number of iterations to tune, if applicable (defaults to None) trace : result_folder for storing stages, will be created if not existing progressbar : bool Flag for progress bar model : Model (optional if in `with` context) has to contain deterministic variable 'name defined under step.likelihood_name' that contains model likelihood random_seed : int or list of ints A list is accepted if more if `njobs` is greater than one. 
Returns ------- MultiTrace object with access to sampling values """ model = pm.modelcontext(model) step.n_steps = int(n_steps) seed(random_seed) if n_steps < 1: raise ValueError('Argument `n_steps` should be above 0.') if step is None: raise Exception('Argument `step` has to be a TMCMC step object.') if trace is None: raise Exception('Argument `trace` should be either sqlite or text ' 'backend object.') if start is not None: if len(start) != step.n_chains: raise Exception('Argument `start` should have dicts equal the ' 'number of chains (step.N-chains)') else: step.population = start if stage is not None: step.stage = stage if not any( step.likelihood_name in var.name for var in model.deterministics): raise Exception('Model (deterministic) variables need to contain ' 'a variable `' + step.likelihood_name + '` as ' 'defined in `step`.') if progressbar: verbosity = 5 else: verbosity = 0 homepath = trace if not os.path.exists(homepath): os.mkdir(homepath) with model: with Parallel(n_jobs=njobs, verbose=verbosity) as parallel: while step.beta < 1.: print('Beta: ' + str(step.beta), ' Stage: ' + str(step.stage)) if step.stage == 0: # Initial stage print('Sample initial stage: ...') stage_path = homepath + '/stage_' + str(step.stage) trace = pm.backends.Text(stage_path, model=model) initial = _iter_initial(step, chain=chain, trace=trace) progress = pm.progressbar.progress_bar(step.n_chains) try: for i, strace in enumerate(initial): if progressbar: progress.update(i) except KeyboardInterrupt: strace.close() mtrace = pm.backends.base.MultiTrace([strace]) step.population, step.array_population, step.likelihoods = \ step.select_end_points(mtrace) step.beta, step.old_beta, step.weights = step.calc_beta() step.covariance = step.calc_covariance() step.res_indx = step.resample() step.stage += 1 del(strace, mtrace, trace) else: if progressbar and njobs > 1: progressbar = False # Metropolis sampling intermediate stages stage_path = homepath + '/stage_' + str(step.stage) step.proposal_dist = MvNPd(step.covariance) sample_args = { 'draws': n_steps, 'step': step, 'stage_path': stage_path, 'progressbar': progressbar, 'model': model} mtrace = _iter_parallel_chains(parallel, **sample_args) step.population, step.array_population, step.likelihoods = \ step.select_end_points(mtrace) step.beta, step.old_beta, step.weights = step.calc_beta() step.stage += 1 if step.beta > 1.: print('Beta > 1.: ' + str(step.beta)) step.beta = 1. break step.covariance = step.calc_covariance() step.res_indx = step.resample() # Metropolis sampling final stage print('Sample final stage') stage_path = homepath + '/stage_final' temp = np.exp((1 - step.old_beta) * \ (step.likelihoods - step.likelihoods.max())) step.weights = temp / np.sum(temp) step.covariance = step.calc_covariance() step.proposal_dist = MvNPd(step.covariance) step.res_indx = step.resample() sample_args['step'] = step sample_args['stage_path'] = stage_path mtrace = _iter_parallel_chains(parallel, **sample_args) return mtrace
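# Per the docstring above, the model must expose its likelihood as a
# Deterministic whose name equals step.likelihood_name; a sketch of that
# requirement (construction of the TMCMC step itself is omitted because its
# API is not shown here):
import numpy as np
import pymc3 as pm

data = np.random.normal(size=50)

with pm.Model() as model:
    theta = pm.Normal("theta", 0.0, 10.0)
    llk = pm.Normal.dist(mu=theta, sd=1.0).logp(data).sum()
    pm.Deterministic("like", llk)   # name must match step.likelihood_name

# With a configured step and a results folder (per the docstring, `trace` is a
# directory path here), sampling would then look roughly like:
# mtrace = ATMIP_sample(n_steps=200, step=step, trace='atmip_results',
#                       njobs=1, model=model)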