def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001,
             tune=None, tune_interval=100, model=None, mode=None, **kwargs):
    model = pm.modelcontext(model)

    if vars is None:
        vars = model.cont_vars
    vars = pm.inputvars(vars)

    if S is None:
        S = np.ones(model.ndim)

    if proposal_dist is not None:
        self.proposal_dist = proposal_dist(S)
    else:
        self.proposal_dist = UniformProposal(S)

    self.scaling = np.atleast_1d(scaling).astype('d')
    if lamb is None:
        lamb = 2.38 / np.sqrt(2 * model.ndim)
    self.lamb = float(lamb)
    if tune not in {None, 'scaling', 'lambda'}:
        raise ValueError('The parameter "tune" must be one of {None, scaling, lambda}')
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    self.mode = mode

    shared = pm.make_shared_replacements(vars, model)
    self.delta_logp = delta_logp(model.logpt, vars, shared)
    super().__init__(vars, shared)
def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001, tune=True, tune_interval=100, model=None, mode=None, **kwargs): model = pm.modelcontext(model) if vars is None: vars = model.cont_vars vars = pm.inputvars(vars) if S is None: S = np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) else: self.proposal_dist = UniformProposal(S) self.scaling = np.atleast_1d(scaling).astype('d') if lamb is None: lamb = 2.38 / np.sqrt(2 * S.size) self.lamb = float(lamb) self.tune = tune self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.accepted = 0 self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super().__init__(vars, shared)
def __init__(self, vars=None, S=None, proposal_dist=NormalProposal, scaling=1.,
             tune=True, tune_interval=100, model=None, **kwargs):
    model = pm.modelcontext(model)

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if S is None:
        S = np.ones(sum(v.dsize for v in vars))
    self.proposal_dist = proposal_dist(S)
    self.scaling = np.atleast_1d(scaling)
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    # Determine type of variables
    self.discrete = np.concatenate(
        [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars])
    self.any_discrete = self.discrete.any()
    self.all_discrete = self.discrete.all()

    shared = pm.make_shared_replacements(vars, model)
    self.delta_logp = delta_logp(model.logpt, vars, shared)
    super(Metropolis, self).__init__(vars, shared)
def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001, tune=True, tune_interval=100, model=None, mode=None, **kwargs): warnings.warn('Population based sampling methods such as DEMetropolis are experimental.' \ ' Use carefully and be extra critical about their results!') model = pm.modelcontext(model) if vars is None: vars = model.cont_vars vars = pm.inputvars(vars) if S is None: S = np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) else: self.proposal_dist = UniformProposal(S) self.scaling = np.atleast_1d(scaling).astype('d') if lamb is None: lamb = 2.38 / np.sqrt(2 * S.size) self.lamb = float(lamb) self.tune = tune self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.accepted = 0 self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super(DEMetropolis, self).__init__(vars, shared)
def __init__(self, *args, **kwargs):
    """
    Initialise MetropolisMLDA. This is a mix of the parent class's
    initialisation and some extra code specific to MLDA.
    """
    # flag that variance reduction is activated - forces MetropolisMLDA
    # to store quantities of interest in a register if True
    self.mlda_variance_reduction = kwargs.pop("mlda_variance_reduction", False)
    if self.mlda_variance_reduction:
        # Subsampling rate of MLDA sampler one level up
        self.mlda_subsampling_rate_above = kwargs.pop("mlda_subsampling_rate_above")
        self.sub_counter = 0
        self.Q_last = np.nan
        self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above

        # extract some necessary variables
        model = pm.modelcontext(kwargs.get("model", None))
        vars = kwargs.get("vars", None)
        if vars is None:
            vars = model.vars
        vars = pm.inputvars(vars)
        shared = pm.make_shared_replacements(vars, model)

    # call parent class __init__
    super().__init__(*args, **kwargs)

    # modify the delta function and point to model if VR is used
    if self.mlda_variance_reduction:
        self.delta_logp = delta_logp_inverse(model.logpt, vars, shared)
        self.model = model
def __init__(self, dir_priors, hmm_states, values=None, model=None, rng=None):
    r"""Initialize a `TransMatConjugateStep` object.

    Parameters
    ----------
    dir_priors: list of Dirichlets
        State-ordered from-to prior transition probabilities.
    hmm_states: random variable
        The HMM states variable using `dir_priors` as its transition matrix.

    """
    model = pm.modelcontext(model)

    dir_priors = list(chain.from_iterable([pm.inputvars(d) for d in dir_priors]))

    self.rng = rng
    self.dists = list(dir_priors)
    self.hmm_states = hmm_states.name
    # TODO: Perform a consistency check between `hmm_states.Gamma` and
    # `dir_priors`.

    super().__init__(dir_priors, [], allvars=True)
def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001,
             tune=True, tune_interval=100, model=None, mode=None, **kwargs):
    warnings.warn('Population based sampling methods such as DEMetropolis are experimental.'
                  ' Use carefully and be extra critical about their results!')

    model = pm.modelcontext(model)

    if vars is None:
        vars = model.cont_vars
    vars = pm.inputvars(vars)

    if S is None:
        S = np.ones(sum(v.dsize for v in vars))

    if proposal_dist is not None:
        self.proposal_dist = proposal_dist(S)
    else:
        self.proposal_dist = UniformProposal(S)

    self.scaling = np.atleast_1d(scaling).astype('d')
    if lamb is None:
        lamb = 2.38 / np.sqrt(2 * S.size)
    self.lamb = float(lamb)
    self.tune = tune
    self.tune_interval = tune_interval
    self.steps_until_tune = tune_interval
    self.accepted = 0

    self.mode = mode

    shared = pm.make_shared_replacements(vars, model)
    self.delta_logp = delta_logp(model.logpt, vars, shared)
    super().__init__(vars, shared)
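# Hedged usage sketch for the DEMetropolis constructor above (assumes the public
# PyMC3 >= 3.5 API; the model, variable names, and chain count are illustrative
# only). DE-MC crossover proposals draw difference vectors from the other chains
# in the population, so running more chains than free dimensions is recommended.
import pymc3 as pm

with pm.Model() as demo_model:
    x = pm.Normal("x", mu=0.0, sd=1.0, shape=5)
    step = pm.DEMetropolis()
    trace = pm.sample(2000, tune=1000, step=step, chains=20, cores=1)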
def __init__(self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001, tune='lambda', tune_interval=100, tune_drop_fraction: float = 0.9, model=None, mode=None, **kwargs): warnings.warn( 'The DEMetropolisZ implementation in PyMC3 is very young. You should be extra critical about its results.' ' See Pull Request #3784 for more information.') model = pm.modelcontext(model) if vars is None: vars = model.cont_vars vars = pm.inputvars(vars) if S is None: S = np.ones(model.ndim) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) else: self.proposal_dist = UniformProposal(S) self.scaling = np.atleast_1d(scaling).astype('d') if lamb is None: # default to the optimal lambda for normally distributed targets lamb = 2.38 / np.sqrt(2 * model.ndim) self.lamb = float(lamb) if tune not in {None, 'scaling', 'lambda'}: raise ValueError( 'The parameter "tune" must be one of {None, scaling, lambda}') self.tune = True self.tune_target = tune self.tune_interval = tune_interval self.tune_drop_fraction = tune_drop_fraction self.steps_until_tune = tune_interval self.accepted = 0 # cache local history for the Z-proposals self._history = [] # remember initial settings before tuning so they can be reset self._untuned_settings = dict(scaling=self.scaling, lamb=self.lamb, steps_until_tune=tune_interval, accepted=self.accepted) self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super().__init__(vars, shared)
def __init__( self, vars=None, S=None, proposal_dist=None, lamb=None, scaling=0.001, tune="lambda", tune_interval=100, tune_drop_fraction: float = 0.9, model=None, mode=None, **kwargs ): model = pm.modelcontext(model) initial_values = model.initial_point initial_values_size = sum(initial_values[n.name].size for n in model.value_vars) if vars is None: vars = model.cont_vars vars = pm.inputvars(vars) if S is None: S = np.ones(initial_values_size) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) else: self.proposal_dist = UniformProposal(S) self.scaling = np.atleast_1d(scaling).astype("d") if lamb is None: # default to the optimal lambda for normally distributed targets lamb = 2.38 / np.sqrt(2 * initial_values_size) self.lamb = float(lamb) if tune not in {None, "scaling", "lambda"}: raise ValueError('The parameter "tune" must be one of {None, scaling, lambda}') self.tune = True self.tune_target = tune self.tune_interval = tune_interval self.tune_drop_fraction = tune_drop_fraction self.steps_until_tune = tune_interval self.accepted = 0 # cache local history for the Z-proposals self._history = [] # remember initial settings before tuning so they can be reset self._untuned_settings = dict( scaling=self.scaling, lamb=self.lamb, steps_until_tune=tune_interval, accepted=self.accepted, ) self.mode = mode shared = pm.make_shared_replacements(initial_values, vars, model) self.delta_logp = delta_logp(initial_values, model.logpt, vars, shared) super().__init__(vars, shared)
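# Possible call pattern for the DEMetropolisZ variant above (written against the
# public PyMC3 3.x API; argument values are illustrative). The "lambda" tune
# target adapts the differential-evolution jump scale during tuning and then
# drops the first tune_drop_fraction of the tuning history.
import pymc3 as pm

with pm.Model() as demo_model:
    x = pm.Normal("x", mu=0.0, sd=1.0, shape=3)
    step = pm.DEMetropolisZ(tune="lambda", tune_interval=100, tune_drop_fraction=0.9)
    trace = pm.sample(3000, tune=1500, step=step, chains=4, cores=1)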
def __init__(self, vars=None, covariance=None, scaling=1., n_chains=100, tune=True, tune_interval=100, model=None, check_bound=True, likelihood_name='like', proposal_dist=MvNPd, coef_variation=1., **kwargs): model = pm.modelcontext(model) if vars is None: vars = model.vars vars = pm.inputvars(vars) if covariance is None: self.covariance = np.eye(sum(v.dsize for v in vars)) self.scaling = np.atleast_1d(scaling) self.tune = tune self.check_bnd = check_bound self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.proposal_dist = proposal_dist(self.covariance) self.proposal_samples_array = self.proposal_dist(n_chains) self.stage_sample = 0 self.accepted = 0 self.beta = 0 self.stage = 0 self.coef_variation = coef_variation self.n_chains = n_chains self.likelihoods = [] self.likelihood_name = likelihood_name self.discrete = np.concatenate( [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars]) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() # create initial population self.population = [] self.array_population = np.zeros(n_chains) for i in range(self.n_chains): dummy = pm.Point({v.name: v.random() for v in vars}, model=model) self.population.append(dummy) shared = make_shared_replacements(vars, model) self.logp_forw = logp_forw(model.logpt, vars, shared) self.check_bnd = logp_forw(model.varlogpt, vars, shared) self.delta_logp = pm.metropolis.delta_logp(model.logpt, vars, shared) super(ATMCMC, self).__init__(vars, shared)
def __init__(self, vars, model=None):
    model = pymc3.modelcontext(model)

    if len(vars) != 1:
        raise ValueError("Please provide only one variable.")

    vars = pymc3.inputvars(vars)
    self.__var = vars[0]
    self.__var_name = self.__var.name

    super(RandomFieldGibbs, self).__init__(vars, [model.fastlogp])
def __init__( self, vars=None, S=None, proposal_dist=None, scaling=1.0, tune=True, tune_interval=100, model=None, mode=None, **kwargs ): model = pm.modelcontext(model) if vars is None: vars = model.vars vars = pm.inputvars(vars) if S is None: S = np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) elif S.ndim == 1: self.proposal_dist = NormalProposal(S) elif S.ndim == 2: self.proposal_dist = MultivariateNormalProposal(S) else: raise ValueError("Invalid rank for variance: %s" % S.ndim) self.scaling = np.atleast_1d(scaling).astype("d") self.tune = tune self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.accepted = 0 # Determine type of variables self.discrete = np.concatenate( [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars] ) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() # remember initial settings before tuning so they can be reset self._untuned_settings = dict( scaling=self.scaling, steps_until_tune=tune_interval, accepted=self.accepted ) self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super().__init__(vars, shared)
def __init__(self, model, observed):
    self.model = model
    self.observed = observed

    vars = pm.inputvars(model.cont_vars)
    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)

    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
    self.num_vars = len(vars)
def __init__(self, vars=None, w=1., tune=True, model=None, max_iter=10, **kwargs):
    self.model = pm.modelcontext(model)
    self.w = w
    self.tune = tune
    self.w_sum = 0
    self.n_tunes = 0
    self.max_iter = max_iter

    if vars is None:
        vars = self.model.cont_vars
    vars = pm.inputvars(vars)

    super(RobustSlice, self).__init__(vars, [self.model.fastlogp], **kwargs)
def svgd(vars=None, n=5000, n_particles=100, jitter=.01, optimizer=adagrad, start=None, progressbar=True, random_seed=None, model=None): if random_seed is not None: seed(random_seed) model = modelcontext(model) if vars is None: vars = model.vars vars = pm.inputvars(vars) if start is None: start = model.test_point start = model.dict_to_array(start) # Initialize particles x0 = np.tile(start, (n_particles, 1)) x0 += np.random.normal(0, jitter, x0.shape) theta = theano.shared(x0) # Create theano svgd gradient expression and function logp_grad_vec = _make_vectorized_logp_grad(vars, model, theta) svgd_grad = -1 * _svgd_gradient(vars, model, theta, logp_grad_vec) # maximize svgd_updates = optimizer([svgd_grad], [theta], learning_rate=1e-3) i = tt.iscalar('i') svgd_step = theano.function([i], [i], updates=svgd_updates) # Run svgd optimization if progressbar: progress = tqdm(np.arange(n)) else: progress = np.arange(n) for ii in progress: svgd_step(ii) theta_val = theta.get_value() # Build trace strace = pm.backends.NDArray() strace.setup(theta_val.shape[0], 1) for p in theta_val: strace.record(model.bijection.rmap(p)) strace.close() trace = pm.backends.base.MultiTrace([strace]) return trace
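# Illustrative invocation of the svgd() helper defined above (assumes it is in
# scope together with a PyMC3 model; the particle count, iteration budget, and
# model are arbitrary stand-ins).
import numpy as np
import pymc3 as pm

with pm.Model() as demo_model:
    mu = pm.Normal("mu", mu=0.0, sd=10.0)
    pm.Normal("y", mu=mu, sd=1.0, observed=np.random.randn(30))

    trace = svgd(n=2000, n_particles=50, jitter=0.01, random_seed=42)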
def __init__(self, model_vars, values=None, model=None, rng=None): """Initialize a `TransMatConjugateStep` object.""" model = pm.modelcontext(model) if isinstance(model_vars, Variable): model_vars = [model_vars] model_vars = list(chain.from_iterable([pm.inputvars(v) for v in model_vars])) # TODO: Are the rows in this matrix our `dir_priors`? dir_priors = [] self.dir_priors_untrans = [] for d in model_vars: untrans_var = model.named_vars[get_untransformed_name(d.name)] if isinstance(untrans_var.distribution, pm.Dirichlet): self.dir_priors_untrans.append(untrans_var) dir_priors.append(d) state_seqs = [ v for v in model.vars + model.observed_RVs if isinstance(v.distribution, DiscreteMarkovChain) and all(d in graph_inputs([v.distribution.Gammas]) for d in dir_priors) ] if not self.dir_priors_untrans or not len(state_seqs) == 1: raise ValueError( "This step method requires a set of Dirichlet priors" " that comprise a single transition matrix" ) (state_seq,) = state_seqs Gamma = state_seq.distribution.Gammas self._set_row_mappings(Gamma, dir_priors, model) if len(self.row_remaps) != len(dir_priors): raise TypeError( "The Dirichlet priors could not be found" " in the graph for {}".format(state_seq.distribution.Gammas) ) if state_seq in model.observed_RVs: self.state_seq_obs = np.asarray(state_seq.distribution.data) self.rng = rng self.dists = list(dir_priors) self.state_seq_name = state_seq.name super().__init__(dir_priors, [], allvars=True)
def __init__(self, vars, data, model=None, **kwargs):
    model = pymc3.modelcontext(model)

    if len(vars) != 1:
        raise ValueError("Please provide only one variable.")

    vars = pymc3.inputvars(vars)
    self.__var = vars[0]
    self.__var_name = self.__var.name
    self.__data = data
    self.__alpha = kwargs.get("alpha", 1)
    self.__acceptance_rate = kwargs.get("acceptance_rate", .5)

    super(StructureMCMC, self).__init__(vars, [model.fastlogp])
def __init__(self, vars, proposal="uniform", order="random", model=None): model = pm.modelcontext(model) vars = pm.inputvars(vars) initial_point = model.initial_point dimcats = [] # The above variable is a list of pairs (aggregate dimension, number # of categories). For example, if vars = [x, y] with x being a 2-D # variable with M categories and y being a 3-D variable with N # categories, we will have dimcats = [(0, M), (1, M), (2, N), (3, N), (4, N)]. for v in vars: v_init_val = initial_point[v.name] rv_var = model.values_to_rvs[v] distr = getattr(rv_var.owner, "op", None) if isinstance(distr, CategoricalRV): k_graph = rv_var.owner.inputs[3].shape[-1] (k_graph,), _ = rvs_to_value_vars((k_graph,), apply_transforms=True) k = model.fn(k_graph)(initial_point) elif isinstance(distr, BernoulliRV): k = 2 else: raise ValueError( "All variables must be categorical or binary" + "for CategoricalGibbsMetropolis" ) start = len(dimcats) dimcats += [(dim, k) for dim in range(start, start + v_init_val.size)] if order == "random": self.shuffle_dims = True self.dimcats = dimcats else: if sorted(order) != list(range(len(dimcats))): raise ValueError("Argument 'order' has to be a permutation") self.shuffle_dims = False self.dimcats = [dimcats[j] for j in order] if proposal == "uniform": self.astep = self.astep_unif elif proposal == "proportional": # Use the optimized "Metropolized Gibbs Sampler" described in Liu96. self.astep = self.astep_prop else: raise ValueError("Argument 'proposal' should either be 'uniform' or 'proportional'") super().__init__(vars, [model.fastlogp])
def __init__(self, vars, values=None, model=None): if len(vars) > 1: raise ValueError("This sampler only takes one variable.") (var, ) = pm.inputvars(vars) if not isinstance(var.distribution, DiscreteMarkovChain): raise TypeError( "This sampler only samples `DiscreteMarkovChain`s.") model = pm.modelcontext(model) self.vars = [var] self.dependent_rvs = [ v for v in model.basic_RVs if v is not var and var in graph_inputs([v.logpt]) ] dep_comps_logp_stacked = [] for i, dependent_rv in enumerate(self.dependent_rvs): if isinstance(dependent_rv.distribution, SwitchingProcess): comp_logps = [] # Get the log-likelihoood sequences for each state in this # `SwitchingProcess` observations distribution for comp_dist in dependent_rv.distribution.comp_dists: comp_logps.append(comp_dist.logp(dependent_rv)) comp_logp_stacked = at.stack(comp_logps) else: raise TypeError( "This sampler only supports `SwitchingProcess` observations" ) dep_comps_logp_stacked.append(comp_logp_stacked) comp_logp_stacked = at.sum(dep_comps_logp_stacked, axis=0) (M, ) = draw_values([var.distribution.gamma_0.shape[-1]], point=model.test_point) N = model.test_point[var.name].shape[-1] self.alphas = np.empty((M, N), dtype=float) self.log_lik_states = model.fn(comp_logp_stacked) self.gamma_0_fn = model.fn(var.distribution.gamma_0) self.Gammas_fn = model.fn(var.distribution.Gammas)
def __init__(self, model):
    """
    Parameters
    ----------
    model : pymc3.Model
        The probability model, written with Theano shared variables to form
        any observations. The Theano shared variables are set during inference.
    """
    self.model = model
    vars = pm.inputvars(model.cont_vars)
    self.n_vars = len(vars)

    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
def __init__(self, model):
    """
    Parameters
    ----------
    model : pymc3.Model
        The probability model, written with Theano shared variables to form
        any observations. The Theano shared variables are set during inference.
    """
    self.model = model
    vars = pm.inputvars(model.cont_vars)
    self.num_vars = len(vars)

    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
def __init__(self, model):
    """
    Parameters
    ----------
    model : pymc3.Model
        The probability model, written with Theano shared variables to form
        any observations and with `transform=None` for any latent variables.
        The Theano shared variables are set during inference, and all latent
        variables live on their original (constrained) space.
    """
    self.model = model
    self.n_vars = None

    vars = pm.inputvars(model.cont_vars)
    bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
    self.logp = bij.mapf(model.fastlogp)
    self.dlogp = bij.mapf(model.fastdlogp(vars))
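# Hypothetical use of one of the model-wrapper classes above. The class name
# `ModelWrapper` is made up for illustration, and an older PyMC3 release that
# still exposes DictToArrayBijection/ArrayOrdering is assumed. The wrapped
# logp and dlogp take a flat parameter array in the bijection's ordering.
import numpy as np
import pymc3 as pm

with pm.Model() as m:
    mu = pm.Normal("mu", mu=0.0, sd=1.0)
    pm.Normal("y", mu=mu, sd=1.0, observed=np.random.randn(20))

wrapper = ModelWrapper(m)   # stand-in for whichever wrapper class above is in scope
x = np.zeros(1)             # one free continuous variable (mu)
print(wrapper.logp(x), wrapper.dlogp(x))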
def __init__(self, vars=None, S=None, proposal_dist=None, proposal_density = None, scaling=1., tune=True, tune_interval=100, model=None, mode=None, **kwargs): model = pm.modelcontext(model) if vars is None: vars = model.vars vars = pm.inputvars(vars) if S is None: S = np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist elif S.ndim == 1: self.proposal_dist = NormalProposal(S) elif S.ndim == 2: self.proposal_dist = MultivariateNormalProposal(S) else: raise ValueError("Invalid rank for variance: %s" % S.ndim) if proposal_density is not None: self.proposal_density = proposal_density else : raise ValueError("You must provide a proposal density to ensure unbiased samples") self.scaling = np.atleast_1d(scaling).astype('d') self.tune = tune self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.accepted = 0 # Determine type of variables self.discrete = np.concatenate( [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars]) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super(PIP_Metropolis, self).__init__(vars, shared)
def __init__(self, vars, proposal="uniform", order="random", model=None): model = pm.modelcontext(model) vars = pm.inputvars(vars) dimcats = [] # The above variable is a list of pairs (aggregate dimension, number # of categories). For example, if vars = [x, y] with x being a 2-D # variable with M categories and y being a 3-D variable with N # categories, we will have dimcats = [(0, M), (1, M), (2, N), (3, N), (4, N)]. for v in vars: distr = getattr(v.distribution, "parent_dist", v.distribution) if isinstance(distr, pm.Categorical): k = draw_values([distr.k])[0] elif isinstance(distr, pm.Bernoulli) or (v.dtype in pm.bool_types): k = 2 else: raise ValueError( "All variables must be categorical or binary" + "for CategoricalGibbsMetropolis") start = len(dimcats) dimcats += [(dim, k) for dim in range(start, start + v.dsize)] if order == "random": self.shuffle_dims = True self.dimcats = dimcats else: if sorted(order) != list(range(len(dimcats))): raise ValueError("Argument 'order' has to be a permutation") self.shuffle_dims = False self.dimcats = [dimcats[j] for j in order] if proposal == "uniform": self.astep = self.astep_unif elif proposal == "proportional": # Use the optimized "Metropolized Gibbs Sampler" described in Liu96. self.astep = self.astep_prop else: raise ValueError( "Argument 'proposal' should either be 'uniform' or 'proportional'" ) super().__init__(vars, [model.fastlogp])
def __init__(self, var, values=None, model=None):
    model = pm.modelcontext(model)

    (var,) = pm.inputvars(var)

    self.dependent_rvs = [
        v for v in model.basic_RVs
        if v is not var and var in tt.gof.graph.inputs([v.logpt])
    ]

    # We compile a function--from a Theano graph--that computes the
    # total log-likelihood values for each state in the sequence.
    dependents_log_lik = model.fn(
        tt.sum([v.logp_elemwiset for v in self.dependent_rvs], axis=0))

    self.gamma_0_fn = model.fn(var.distribution.gamma_0)
    self.Gammas_fn = model.fn(var.distribution.Gammas)

    super().__init__([var], [dependents_log_lik], allvars=True)
def __init__(self, vars, proposal='uniform', order='random', model=None): model = pm.modelcontext(model) vars = pm.inputvars(vars) dimcats = [] # The above variable is a list of pairs (aggregate dimension, number # of categories). For example, if vars = [x, y] with x being a 2-D # variable with M categories and y being a 3-D variable with N # categories, we will have dimcats = [(0, M), (1, M), (2, N), (3, N), (4, N)]. for v in vars: distr = getattr(v.distribution, 'parent_dist', v.distribution) if isinstance(distr, pm.Categorical): k = draw_values([distr.k])[0] elif isinstance(distr, pm.Bernoulli) or (v.dtype in pm.bool_types): k = 2 else: raise ValueError('All variables must be categorical or binary' + 'for CategoricalGibbsMetropolis') start = len(dimcats) dimcats += [(dim, k) for dim in range(start, start + v.dsize)] if order == 'random': self.shuffle_dims = True self.dimcats = dimcats else: if sorted(order) != list(range(len(dimcats))): raise ValueError('Argument \'order\' has to be a permutation') self.shuffle_dims = False self.dimcats = [dimcats[j] for j in order] if proposal == 'uniform': self.astep = self.astep_unif elif proposal == 'proportional': # Use the optimized "Metropolized Gibbs Sampler" described in Liu96. self.astep = self.astep_prop else: raise ValueError('Argument \'proposal\' should either be ' + '\'uniform\' or \'proportional\'') super(CategoricalGibbsMetropolis, self).__init__(vars, [model.fastlogp])
def __init__(self, vars=None, S=None, proposal_dist=None, scaling=1., tune=True, tune_interval=100, model=None, mode=None, **kwargs): model = pm.modelcontext(model) if vars is None: vars = model.vars vars = pm.inputvars(vars) if S is None: S = np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) elif S.ndim == 1: self.proposal_dist = NormalProposal(S) elif S.ndim == 2: self.proposal_dist = MultivariateNormalProposal(S) else: raise ValueError("Invalid rank for variance: %s" % S.ndim) self.scaling = np.atleast_1d(scaling).astype('d') self.tune = tune self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.accepted = 0 # Determine type of variables self.discrete = np.concatenate( [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars]) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super().__init__(vars, shared)
def __init__(self, *args, **kwargs): """ Initialise DEMetropolisZMLDA, uses parent class __init__ and extra code specific for use within MLDA. """ # flag used for signaling the end of tuning self.tuning_end_trigger = False model = pm.modelcontext(kwargs.get("model", None)) initial_values = model.initial_point # flag to that variance reduction is activated - forces DEMetropolisZMLDA # to store quantities of interest in a register if True self.mlda_variance_reduction = kwargs.pop("mlda_variance_reduction", False) if self.mlda_variance_reduction: # Subsampling rate of MLDA sampler one level up self.mlda_subsampling_rate_above = kwargs.pop("mlda_subsampling_rate_above") self.sub_counter = 0 self.Q_last = np.nan self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above # extract some necessary variables value_vars = kwargs.get("vars", None) if value_vars is None: value_vars = model.value_vars value_vars = pm.inputvars(value_vars) shared = pm.make_shared_replacements(initial_values, value_vars, model) # call parent class __init__ super().__init__(*args, **kwargs) # modify the delta function and point to model if VR is used if self.mlda_variance_reduction: self.delta_logp = delta_logp_inverse(initial_values, model.logpt, value_vars, shared) self.model = model
def __init__(self, vars=None, S=None, proposal_dist=None, scaling=1.0, tune=True, tune_interval=100, model=None, mode=None, **kwargs): """Create an instance of a Metropolis stepper Parameters ---------- vars: list List of variables for sampler S: standard deviation or covariance matrix Some measure of variance to parameterize proposal distribution proposal_dist: function Function that returns zero-mean deviates when parameterized with S (and n). Defaults to normal. scaling: scalar or array Initial scale factor for proposal. Defaults to 1. tune: bool Flag for tuning. Defaults to True. tune_interval: int The frequency of tuning. Defaults to 100 iterations. model: PyMC Model Optional model for sampling step. Defaults to None (taken from context). mode: string or `Mode` instance. compilation mode passed to Aesara functions """ model = pm.modelcontext(model) if vars is None: vars = model.vars vars = pm.inputvars(vars) if S is None: S = np.ones(sum(v.dsize for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) elif S.ndim == 1: self.proposal_dist = NormalProposal(S) elif S.ndim == 2: self.proposal_dist = MultivariateNormalProposal(S) else: raise ValueError("Invalid rank for variance: %s" % S.ndim) self.scaling = np.atleast_1d(scaling).astype("d") self.tune = tune self.tune_interval = tune_interval self.steps_until_tune = tune_interval self.accepted = 0 # Determine type of variables self.discrete = np.concatenate( [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars]) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() # remember initial settings before tuning so they can be reset self._untuned_settings = dict(scaling=self.scaling, steps_until_tune=tune_interval, accepted=self.accepted) self.mode = mode shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp(model.logpt, vars, shared) super().__init__(vars, shared)
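# Minimal usage sketch for the Metropolis stepper documented above (standard
# PyMC3 3.x API assumed; the data and priors are illustrative). `S` sets the
# proposal spread, and `scaling` is adapted every `tune_interval` iterations
# while tuning is active.
import numpy as np
import pymc3 as pm

with pm.Model() as demo_model:
    mu = pm.Normal("mu", mu=0.0, sd=10.0)
    pm.Normal("obs", mu=mu, sd=1.0, observed=np.random.randn(100))

    step = pm.Metropolis(vars=[mu], S=np.ones(1), tune=True, tune_interval=100)
    trace = pm.sample(2000, tune=1000, step=step, chains=2, cores=1)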
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. random_seed : int or None Seed to initialize random state. None uses current seed. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if not pm.model.all_continuous(vars): raise ValueError('Model should not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates) # Optimization loop elbos = np.empty(n) try: progress = trange(n) for i in progress: uw_i, e = f() elbos[i] = e if i % (n // 10) == 0 and i > 0: avg_elbo = elbos[i - n // 10:i].mean() progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo)) except KeyboardInterrupt: elbos = elbos[:i] avg_elbo = elbos[i - n // 10:].mean() pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( i, 100 * i // n, avg_elbo)) else: avg_elbo = elbos[-n // 10:].mean() pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def __init__( self, coarse_models: List[Model], vars: Optional[list] = None, base_sampler="DEMetropolisZ", base_S: Optional = None, base_proposal_dist: Optional[Type[Proposal]] = None, base_scaling: Optional = None, tune: bool = True, base_tune_target: str = "lambda", base_tune_interval: int = 100, base_lamb: Optional = None, base_tune_drop_fraction: float = 0.9, model: Optional[Model] = None, mode: Optional = None, subsampling_rates: List[int] = 5, base_blocked: bool = False, variance_reduction: bool = False, store_Q_fine: bool = False, adaptive_error_model: bool = False, **kwargs, ) -> None: # this variable is used to identify MLDA objects which are # not in the finest level (i.e. child MLDA objects) self.is_child = kwargs.get("is_child", False) if not self.is_child: warnings.warn( "The MLDA implementation in PyMC3 is still immature. You should be particularly critical of its results." ) if not isinstance(coarse_models, list): raise ValueError( "MLDA step method cannot use coarse_models if it is not a list" ) if len(coarse_models) == 0: raise ValueError("MLDA step method was given an empty " "list of coarse models. Give at least " "one coarse model.") # assign internal state model = pm.modelcontext(model) self.model = model self.coarse_models = coarse_models self.model_below = self.coarse_models[-1] self.num_levels = len(self.coarse_models) + 1 # set up variance reduction. self.variance_reduction = variance_reduction self.store_Q_fine = store_Q_fine # check that certain requirements hold # for the variance reduction feature to work if self.variance_reduction or self.store_Q_fine: if not hasattr(self.model, "Q"): raise AttributeError("Model given to MLDA does not contain" "variable 'Q'. You need to include" "the variable in the model definition" "for variance reduction to work or" "for storing the fine Q." "Use pm.Data() to define it.") if not isinstance(self.model.Q, tt.sharedvar.TensorSharedVariable): raise TypeError( "The variable 'Q' in the model definition is not of type " "'TensorSharedVariable'. Use pm.Data() to define the" "variable.") if self.is_child and self.variance_reduction: # this is the subsampling rate applied to the current level # it is stored in the level above and transferred here self.subsampling_rate_above = kwargs.pop("subsampling_rate_above", None) # set up adaptive error model self.adaptive_error_model = adaptive_error_model # check that certain requirements hold # for the adaptive error model feature to work if self.adaptive_error_model: if not hasattr(self.model_below, "mu_B"): raise AttributeError( "Model below in hierarchy does not contain" "variable 'mu_B'. You need to include" "the variable in the model definition" "for adaptive error model to work." "Use pm.Data() to define it.") if not hasattr(self.model_below, "Sigma_B"): raise AttributeError( "Model below in hierarchy does not contain" "variable 'Sigma_B'. You need to include" "the variable in the model definition" "for adaptive error model to work." "Use pm.Data() to define it.") if not (isinstance(self.model_below.mu_B, tt.sharedvar.TensorSharedVariable) and isinstance(self.model_below.Sigma_B, tt.sharedvar.TensorSharedVariable)): raise TypeError( "At least one of the variables 'mu_B' and 'Sigma_B' " "in the definition of the below model is not of type " "'TensorSharedVariable'. 
Use pm.Data() to define those " "variables.") # this object is used to recursively update the mean and # variance of the bias correction given new differences # between levels self.bias = RecursiveSampleMoments( self.model_below.mu_B.get_value(), self.model_below.Sigma_B.get_value()) # this list holds the bias objects from all levels # it is gradually constructed when MLDA objects are # created and then shared between all levels self.bias_all = kwargs.pop("bias_all", None) if self.bias_all is None: self.bias_all = [self.bias] else: self.bias_all.append(self.bias) # variables used for adaptive error model self.last_synced_output_diff = None self.adaptation_started = False # set up subsampling rates. if isinstance(subsampling_rates, int): self.subsampling_rates = [subsampling_rates] * len( self.coarse_models) else: if len(subsampling_rates) != len(self.coarse_models): raise ValueError( f"List of subsampling rates needs to have the same " f"length as list of coarse models but the lengths " f"were {len(subsampling_rates)}, {len(self.coarse_models)}" ) self.subsampling_rates = subsampling_rates self.subsampling_rate = self.subsampling_rates[-1] self.subchain_selection = None # set up base sampling self.base_sampler = base_sampler # VR is not compatible with compound base samplers so an automatic conversion # to a block sampler happens here if if self.variance_reduction and self.base_sampler == "Metropolis" and not base_blocked: warnings.warn( "Variance reduction is not compatible with non-blocked (compound) samplers." "Automatically switching to a blocked Metropolis sampler.") self.base_blocked = True else: self.base_blocked = base_blocked self.base_S = base_S self.base_proposal_dist = base_proposal_dist if base_scaling is None: if self.base_sampler == "Metropolis": self.base_scaling = 1.0 else: self.base_scaling = 0.001 else: self.base_scaling = float(base_scaling) self.tune = tune if not self.tune and self.base_sampler == "DEMetropolisZ": raise ValueError( f"The argument tune was set to False while using" f" a 'DEMetropolisZ' base sampler. 
'DEMetropolisZ' " f" tune needs to be True.") self.base_tune_target = base_tune_target self.base_tune_interval = base_tune_interval self.base_lamb = base_lamb self.base_tune_drop_fraction = float(base_tune_drop_fraction) self.base_tuning_stats = None self.mode = mode # Process model variables if vars is None: vars = model.vars vars = pm.inputvars(vars) self.vars = vars self.var_names = [var.name for var in self.vars] self.accepted = 0 # Construct theano function for current-level model likelihood # (for use in acceptance) shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp_inverse(model.logpt, vars, shared) # Construct theano function for below-level model likelihood # (for use in acceptance) model_below = pm.modelcontext(self.model_below) vars_below = [ var for var in model_below.vars if var.name in self.var_names ] vars_below = pm.inputvars(vars_below) shared_below = pm.make_shared_replacements(vars_below, model_below) self.delta_logp_below = delta_logp(model_below.logpt, vars_below, shared_below) super().__init__(vars, shared) # initialise complete step method hierarchy if self.num_levels == 2: with self.model_below: # make sure the correct variables are selected from model_below vars_below = [ var for var in self.model_below.vars if var.name in self.var_names ] # create kwargs if self.variance_reduction: base_kwargs = { "mlda_subsampling_rate_above": self.subsampling_rate, "mlda_variance_reduction": True, } else: base_kwargs = {} if self.base_sampler == "Metropolis": # MetropolisMLDA sampler in base level (level=0), targeting self.model_below self.step_method_below = pm.MetropolisMLDA( vars=vars_below, proposal_dist=self.base_proposal_dist, S=self.base_S, scaling=self.base_scaling, tune=self.tune, tune_interval=self.base_tune_interval, model=None, mode=self.mode, blocked=self.base_blocked, **base_kwargs, ) else: # DEMetropolisZMLDA sampler in base level (level=0), targeting self.model_below self.step_method_below = pm.DEMetropolisZMLDA( vars=vars_below, S=self.base_S, proposal_dist=self.base_proposal_dist, lamb=self.base_lamb, scaling=self.base_scaling, tune=self.base_tune_target, tune_interval=self.base_tune_interval, tune_drop_fraction=self.base_tune_drop_fraction, model=None, mode=self.mode, **base_kwargs, ) else: # drop the last coarse model coarse_models_below = self.coarse_models[:-1] subsampling_rates_below = self.subsampling_rates[:-1] with self.model_below: # make sure the correct variables are selected from model_below vars_below = [ var for var in self.model_below.vars if var.name in self.var_names ] # create kwargs if self.variance_reduction: mlda_kwargs = { "is_child": True, "subsampling_rate_above": self.subsampling_rate, } else: mlda_kwargs = {"is_child": True} if self.adaptive_error_model: mlda_kwargs = { **mlda_kwargs, **{ "bias_all": self.bias_all } } # MLDA sampler in some intermediate level, targeting self.model_below self.step_method_below = pm.MLDA( vars=vars_below, base_S=self.base_S, base_sampler=self.base_sampler, base_proposal_dist=self.base_proposal_dist, base_scaling=self.base_scaling, tune=self.tune, base_tune_target=self.base_tune_target, base_tune_interval=self.base_tune_interval, base_lamb=self.base_lamb, base_tune_drop_fraction=self.base_tune_drop_fraction, model=None, mode=self.mode, subsampling_rates=subsampling_rates_below, coarse_models=coarse_models_below, base_blocked=self.base_blocked, variance_reduction=self.variance_reduction, store_Q_fine=False, adaptive_error_model=self.adaptive_error_model, **mlda_kwargs, ) # 
instantiate the recursive DA proposal. # this is the main proposal used for # all levels (Recursive Delayed Acceptance) # (except for level 0 where the step method is MetropolisMLDA # or DEMetropolisZMLDA - not MLDA) self.proposal_dist = RecursiveDAProposal(self.step_method_below, self.model_below, self.tune, self.subsampling_rate) # set up data types of stats. if isinstance(self.step_method_below, MLDA): # get the stat types from the level below if that level is MLDA self.stats_dtypes = self.step_method_below.stats_dtypes else: # otherwise, set it up from scratch. self.stats_dtypes = [{ "accept": np.float64, "accepted": np.bool, "tune": np.bool }] if isinstance(self.step_method_below, MetropolisMLDA): self.stats_dtypes.append({"base_scaling": np.float64}) elif isinstance(self.step_method_below, DEMetropolisZMLDA): self.stats_dtypes.append({ "base_scaling": np.float64, "base_lambda": np.float64 }) elif isinstance(self.step_method_below, CompoundStep): for method in self.step_method_below.methods: if isinstance(method, MetropolisMLDA): self.stats_dtypes.append({"base_scaling": np.float64}) elif isinstance(method, DEMetropolisZMLDA): self.stats_dtypes.append({ "base_scaling": np.float64, "base_lambda": np.float64 }) # initialise necessary variables for doing variance reduction if self.variance_reduction: self.sub_counter = 0 self.Q_diff = [] if self.is_child: self.Q_reg = [np.nan] * self.subsampling_rate_above if self.num_levels == 2: self.Q_base_full = [] if not self.is_child: for level in range(self.num_levels - 1, 0, -1): self.stats_dtypes[0][f"Q_{level}_{level - 1}"] = object self.stats_dtypes[0]["Q_0"] = object # initialise necessary variables for doing variance reduction or storing fine Q if self.variance_reduction or self.store_Q_fine: self.Q_last = np.nan self.Q_diff_last = np.nan if self.store_Q_fine and not self.is_child: self.stats_dtypes[0][f"Q_{self.num_levels - 1}"] = object
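# Rough usage sketch for the MLDA constructor above (PyMC3 >= 3.10 API assumed;
# the two-level setup below is purely illustrative, with the coarse model built
# from a subsample of the data). Free variable names must match across levels.
import numpy as np
import pymc3 as pm

y = np.random.randn(100)

# Coarse model: a cheaper approximation of the target.
with pm.Model() as coarse_model:
    theta = pm.Normal("theta", mu=0.0, sd=10.0)
    pm.Normal("y", mu=theta, sd=1.0, observed=y[::10])

# Fine model: the full-resolution target.
with pm.Model() as fine_model:
    theta = pm.Normal("theta", mu=0.0, sd=10.0)
    pm.Normal("y", mu=theta, sd=1.0, observed=y)

    step = pm.MLDA(coarse_models=[coarse_model], subsampling_rates=5)
    trace = pm.sample(1000, step=step, chains=2, cores=1)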
def bijection(self):
    return pm.DictToArrayBijection(
        pm.ArrayOrdering(pm.inputvars(self.model.cont_vars)),
        self.model.test_point)
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, mode=None, tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. tol_obj : float Relative tolerance for testing convergence of ELBO. eval_elbo : int Window for checking convergence of ELBO. Convergence will be checked for every multiple of eval_elbo. random_seed : int or None Seed to initialize random state. None uses current seed. mode : string or `Mode` instance. Compilation mode passed to Theano functions progressbar : bool Whether or not to display a progress bar in the command line. The bar shows the percentage of completion, the sampling speed in samples per second (SPS), the estimated remaining time until completion ("expected time of arrival"; ETA), and the current ELBO. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" import warnings warnings.warn('Old ADVI interface and sample_vp is deprecated and will ' 'be removed in future, use pm.fit and pm.sample_approx instead', DeprecationWarning, stacklevel=2) model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if len(vars) == 0: raise ValueError('No free random variables to fit.') if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode) # For tracking convergence of ELBO window_size = int(max(0.1 * n // eval_elbo, 2.0)) circ_buff = deque([], maxlen=window_size) # Optimization loop elbos = np.empty(n) divergence_flag = False progress = trange(n) if progressbar else range(n) try: uw_i, elbo_current = f() if np.isnan(elbo_current): raise FloatingPointError('NaN occurred in ADVI optimization.') for i in progress: uw_i, e = f() if np.isnan(e): raise FloatingPointError('NaN occurred in ADVI optimization.') elbos[i] = e if progressbar: if n < 10: progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = infmean(elbos[i - n // 10:i]) progress.set_description( 'Average ELBO = {:,.5g}'.format(avg_elbo)) if i % eval_elbo == 0: elbo_prev = elbo_current elbo_current = elbos[i] delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev) circ_buff.append(delta_elbo) avg_delta = np.mean(circ_buff) med_delta = np.median(circ_buff) if i > 0 and avg_delta < tol_obj: pm._log.info('Mean ELBO converged.') elbos = elbos[:(i + 1)] break elif i > 0 and med_delta < tol_obj: pm._log.info('Median ELBO converged.') elbos = elbos[:(i + 1)] break if i > 10 * eval_elbo: if med_delta > 0.5 or avg_delta > 0.5: divergence_flag = True else: divergence_flag = False except KeyboardInterrupt: elbos = elbos[:i] if n < 10: pm._log.info('Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( i, 100 * i // n, elbos[i])) else: avg_elbo = infmean(elbos[i - n // 10:i]) pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format( i, 100 * i // n, avg_elbo)) else: if n < 10: pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) else: avg_elbo = infmean(elbos[-n // 10:]) pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) finally: if progressbar: progress.close() if divergence_flag: pm._log.info('Evidence of divergence detected, inspect ELBO.') # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False, optimizer=None, learning_rate=.001, epsilon=.1, mode=None, tol_obj=0.01, eval_elbo=100, random_seed=None, progressbar=True): """Perform automatic differentiation variational inference (ADVI). This function implements the meanfield ADVI, where the variational posterior distribution is assumed to be spherical Gaussian without correlation of parameters and fit to the true posterior distribution. The means and standard deviations of the variational posterior are referred to as variational parameters. The return value of this function is an :code:`ADVIfit` object, which has variational parameters. If you want to draw samples from the variational posterior, you need to pass the :code:`ADVIfit` object to :code:`pymc3.variational.sample_vp()`. The variational parameters are defined on the transformed space, which is required to do ADVI on an unconstrained parameter space as described in [KTR+2016]. The parameters in the :code:`ADVIfit` object are in the transformed space, while traces returned by :code:`sample_vp()` are in the original space as obtained by MCMC sampling methods in PyMC3. The variational parameters are optimized with given optimizer, which is a function that returns a dictionary of parameter updates as provided to Theano function. If no optimizer is provided, optimization is performed with a modified version of adagrad, where only the last (n_window) gradient vectors are used to control the learning rate and older gradient vectors are ignored. n_window denotes the size of time window and fixed to 10. Parameters ---------- vars : object Random variables. start : Dict or None Initial values of parameters (variational means). model : Model Probabilistic model. n : int Number of interations updating parameters. accurate_elbo : bool If true, 100 MC samples are used for accurate calculation of ELBO. optimizer : (loss, tensor) -> dict or OrderedDict A function that returns parameter updates given loss and parameter tensor. If :code:`None` (default), a default Adagrad optimizer is used with parameters :code:`learning_rate` and :code:`epsilon` below. learning_rate: float Base learning rate for adagrad. This parameter is ignored when optimizer is given. epsilon : float Offset in denominator of the scale of learning rate in Adagrad. This parameter is ignored when optimizer is given. tol_obj : float Relative tolerance for testing convergence of ELBO. eval_elbo : int Window for checking convergence of ELBO. Convergence will be checked for every multiple of eval_elbo. random_seed : int or None Seed to initialize random state. None uses current seed. mode : string or `Mode` instance. Compilation mode passed to Theano functions progressbar : bool Whether or not to display a progress bar in the command line. The bar shows the percentage of completion, the sampling speed in samples per second (SPS), the estimated remaining time until completion ("expected time of arrival"; ETA), and the current ELBO. Returns ------- ADVIFit Named tuple, which includes 'means', 'stds', and 'elbo_vals'. 'means' is the mean. 'stds' is the standard deviation. 'elbo_vals' is the trace of ELBO values during optimizaiton. References ---------- .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., and Blei, D. M. (2016). Automatic Differentiation Variational Inference. arXiv preprint arXiv:1603.00788. 
""" model = pm.modelcontext(model) if start is None: start = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if len(vars) == 0: raise ValueError('No free random variables to fit.') if not pm.model.all_continuous(vars): raise ValueError('Model can not include discrete RVs for ADVI.') n_mcsamples = 100 if accurate_elbo else 1 # Prepare optimizer if optimizer is None: optimizer = adagrad_optimizer(learning_rate, epsilon) # Create variational gradient tensor elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed) # Set starting values for var, share in shared.items(): share.set_value(start[str(var)]) order = pm.ArrayOrdering(vars) bij = pm.DictToArrayBijection(order, start) u_start = bij.map(start) w_start = np.zeros_like(u_start) uw = np.concatenate([u_start, w_start]) # Create parameter update function used in the training loop uw_shared = theano.shared(uw, 'uw_shared') elbo = pm.CallableTensor(elbo)(uw_shared) updates = optimizer(loss=-1 * elbo, param=[uw_shared]) f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode) # For tracking convergence of ELBO window_size = int(max(0.1 * n // eval_elbo, 2.0)) circ_buff = deque([], maxlen=window_size) # Optimization loop elbos = np.empty(n) divergence_flag = False progress = trange(n) if progressbar else range(n) try: uw_i, elbo_current = f() if np.isnan(elbo_current): raise FloatingPointError('NaN occurred in ADVI optimization.') for i in progress: uw_i, e = f() if np.isnan(e): raise FloatingPointError('NaN occurred in ADVI optimization.') elbos[i] = e if progressbar: if n < 10: progress.set_description('ELBO = {:,.5g}'.format(elbos[i])) elif i % (n // 10) == 0 and i > 0: avg_elbo = infmean(elbos[i - n // 10:i]) progress.set_description( 'Average ELBO = {:,.5g}'.format(avg_elbo)) if i % eval_elbo == 0: elbo_prev = elbo_current elbo_current = elbos[i] delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev) circ_buff.append(delta_elbo) avg_delta = np.mean(circ_buff) med_delta = np.median(circ_buff) if i > 0 and avg_delta < tol_obj: pm._log.info('Mean ELBO converged.') elbos = elbos[:(i + 1)] break elif i > 0 and med_delta < tol_obj: pm._log.info('Median ELBO converged.') elbos = elbos[:(i + 1)] break if i > 10 * eval_elbo: if med_delta > 0.5 or avg_delta > 0.5: divergence_flag = True else: divergence_flag = False except KeyboardInterrupt: elbos = elbos[:i] if n < 10: pm._log.info( 'Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format( i, 100 * i // n, elbos[i])) else: avg_elbo = infmean(elbos[i - n // 10:i]) pm._log.info( 'Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'. format(i, 100 * i // n, avg_elbo)) else: if n < 10: pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1])) else: avg_elbo = infmean(elbos[-n // 10:]) pm._log.info( 'Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo)) finally: if progressbar: progress.close() if divergence_flag: pm._log.info('Evidence of divergence detected, inspect ELBO.') # Estimated parameters l = int(uw_i.size / 2) u = bij.rmap(uw_i[:l]) w = bij.rmap(uw_i[l:]) # w is in log space for var in w.keys(): w[var] = np.exp(w[var]) return ADVIFit(u, w, elbos)
def __init__(self, vars, values=None, model=None): model = pm.modelcontext(model) if len(vars) > 1: raise ValueError("This sampler only takes one variable.") (beta, ) = pm.inputvars(vars) if not isinstance(beta.distribution, HorseShoe): raise TypeError("This sampler only samples `HorseShoe`s.") other_model_vars = [ value for attr, value in model.named_vars.items() if value != beta ] y_X_fn, X_fn = None, None for var in other_model_vars: # Look through all the attributes of the variable and see if any of # the parameters have a multiplication relationship with the # Horseshoe variable if hasattr(var, "distribution"): try: y_X_fn, eta = hs_regression_model(var.distribution, var, model) except NotImplementedError: continue elif isinstance(var, pm.model.DeterministicWrapper): eta = var.owner.inputs[0] dense_dot = eta.owner and isinstance(eta.owner.op, Dot) sparse_dot = eta.owner and isinstance(eta.owner.op, StructuredDot) dense_inputs = dense_dot and beta in eta.owner.inputs sparse_inputs = sparse_dot and beta in eta.owner.inputs[ 1].owner.inputs if not (dense_inputs or sparse_inputs): continue if not y_X_fn: # We don't have the observation distribution, so we need to # find it. This happens when a `Deterministic` bridges a # `Horseshoe` parameter with it's observation distribution's # mean. y_X_fn = None obs_mu = None for obs_rv in model.observed_RVs: try: y_X_fn, obs_mu = hs_regression_model( obs_rv.distribution, obs_rv, model) break except NotImplementedError: continue # The `Deterministic` should be the mean parameter of the # observed distribution if var != obs_mu: continue if dense_inputs: X_fn = model.fn(eta.owner.inputs[1].T) else: X_fn = model.fn(eta.owner.inputs[0]) if not (X_fn and y_X_fn): raise NotImplementedError( f"Cannot find a design matrix or dependent variable associated with {beta}" # noqa: E501 ) self.vars = [beta] M = model.test_point[beta.name].shape[-1] # if observation dist is normal then y_aug_fn = y_fn when it is NB # then, hs_regression_model, dispatch i.distribution... self.vi = np.full(M, 1) self.lambda2 = np.full(M, 1) self.beta = np.full(M, 1) self.tau2 = 1 self.xi = 1 self.y_X_fn = y_X_fn self.X_fn = X_fn