Example #1
def _make_elbo_t(
    observed_RVs, global_RVs, local_RVs, potentials, n_mcsamples, random_seed):
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])

    inarray_g, uw_g, replace_g, c_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l, c_l = _join_local_RVs(local_RVs, local_order)

    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, potentials)
    replace = replace_g
    replace.update(replace_l)
    logpt = theano.clone(logpt, replace, strict=False)

    elbo = _elbo_t(logpt, uw_g, uw_l, inarray_g, inarray_l, c_g, c_l,
                   n_mcsamples, random_seed)

    return elbo, uw_l, uw_g
Example #2
def _init_uw_global_shared(start, global_RVs):
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    start = {v.name: start[v.name] for v in global_RVs}
    bij = pm.DictToArrayBijection(global_order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw_start = floatX(np.concatenate([u_start, w_start]))
    uw_global_shared = theano.shared(uw_start, 'uw_global_shared')

    return uw_global_shared, bij
Example #3
    def __init__(self, model, observed):
        self.model = model
        self.observed = observed

        vars = pm.inputvars(model.cont_vars)

        bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
        self.logp = bij.mapf(model.fastlogp)
        self.dlogp = bij.mapf(model.fastdlogp(vars))

        self.num_vars = len(vars)
Example #4
    def __init__(self, model):
        """
        Parameters
        ----------
        model : pymc3.Model
            The probability model, written with Theano shared
            variables to form any observations. The Theano shared
            variables are set during inference.
        """
        self.model = model

        vars = pm.inputvars(model.cont_vars)
        self.n_vars = len(vars)

        bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
        self.logp = bij.mapf(model.fastlogp)
        self.dlogp = bij.mapf(model.fastdlogp(vars))
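As a rough usage sketch (hypothetical model; the older PyMC3 3.x API is assumed), the two `mapf` calls above turn the dict-based `fastlogp`/`fastdlogp` into callables that accept a single flat parameter array, which is the form external optimizers typically expect:

# Hypothetical sketch: bij.mapf wraps dict-based logp/dlogp into array-based callables.
import numpy as np
import pymc3 as pm

with pm.Model() as model:
    theta = pm.Normal('theta', mu=0., sd=1., shape=3)

vars = pm.inputvars(model.cont_vars)
bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)

logp = bij.mapf(model.fastlogp)             # flat array -> scalar log density
dlogp = bij.mapf(model.fastdlogp(vars))     # flat array -> gradient array

x = np.zeros(3)
print(logp(x), dlogp(x))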
Example #5
    def __init__(self, model):
        """
        Parameters
        ----------
        model : pymc3.Model
            The probability model, written with Theano shared
            variables to form any observations and with
            `transform=None` for any latent variables. The Theano
            shared variables are set during inference, and all latent
            variables live on their original (constrained) space.
        """
        self.model = model
        self.n_vars = None

        vars = pm.inputvars(model.cont_vars)
        bij = pm.DictToArrayBijection(pm.ArrayOrdering(vars), model.test_point)
        self.logp = bij.mapf(model.fastlogp)
        self.dlogp = bij.mapf(model.fastdlogp(vars))
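A hypothetical model written in the form this docstring requires, with observations held in a Theano shared variable (so they can be swapped in during inference) and `transform=None` so the latent variable stays on its original constrained space (a sketch against the older PyMC3 3.x API):

# Hypothetical sketch of a model matching the docstring's requirements.
import numpy as np
import theano
import pymc3 as pm

y_shared = theano.shared(np.zeros(10))        # observations are set later, during inference

with pm.Model() as model:
    # transform=None keeps sd on its original (constrained) space
    sd = pm.HalfNormal('sd', sd=1., transform=None)
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=sd, observed=y_shared)

y_shared.set_value(np.random.randn(10))       # swap in the actual data before fitting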
Example #6
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements meanfield ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian without
    correlation between parameters and is fit to the true posterior distribution.
    The means and standard deviations of the variational posterior are referred
    to as variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    contains the variational parameters. If you want to draw samples from the
    variational posterior, you need to pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which is
    a function that returns a dictionary of parameter updates as provided to a
    Theano function. If no optimizer is provided, optimization is performed
    with a modified version of Adagrad, where only the last n_window gradient
    vectors are used to control the learning rate and older gradient vectors
    are ignored. n_window denotes the size of the time window and is fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If True, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the Adagrad learning-rate scale.
        This parameter is ignored when an optimizer is given.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    'means' is the mean, 'stds' is the standard deviation, and 'elbo_vals' is
    the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if not pm.model.all_continuous(vars):
        raise ValueError('Model should not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    try:
        progress = trange(n)
        for i in progress:
            uw_i, e = f()
            elbos[i] = e
            if i % (n // 10) == 0 and i > 0:
                avg_elbo = elbos[i - n // 10:i].mean()
                progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo))
    except KeyboardInterrupt:
        elbos = elbos[:i]
        avg_elbo = elbos[i - n // 10:].mean()
        pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format(
            i, 100 * i // n, avg_elbo))
    else:
        avg_elbo = elbos[-n // 10:].mean()
        pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
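A hedged usage sketch of this function (older PyMC3 3.x, before `pm.fit` replaced these helpers; model and data here are illustrative): fit the meanfield approximation and then draw from it with `sample_vp`, which maps the draws back to the original parameter space:

# Hypothetical usage sketch for the old pymc3.variational.advi / sample_vp API.
import numpy as np
import pymc3 as pm

data = np.random.randn(100)

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    sd = pm.HalfNormal('sd', sd=1.)
    pm.Normal('obs', mu=mu, sd=sd, observed=data)

    # ADVIFit(means, stds, elbo_vals)
    v_params = pm.variational.advi(n=10000)
    # Draw samples from the fitted variational posterior
    trace = pm.variational.sample_vp(v_params, draws=1000)

print(v_params.means['mu'], v_params.stds['mu'])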
Example #7
def advi(vars=None,
         start=None,
         model=None,
         n=5000,
         accurate_elbo=False,
         optimizer=None,
         learning_rate=.001,
         epsilon=.1,
         mode=None,
         tol_obj=0.01,
         eval_elbo=100,
         random_seed=None,
         progressbar=True):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements meanfield ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian without
    correlation between parameters and is fit to the true posterior distribution.
    The means and standard deviations of the variational posterior are referred
    to as variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    contains the variational parameters. If you want to draw samples from the
    variational posterior, you need to pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which is
    a function that returns a dictionary of parameter updates as provided to a
    Theano function. If no optimizer is provided, optimization is performed
    with a modified version of Adagrad, where only the last n_window gradient
    vectors are used to control the learning rate and older gradient vectors
    are ignored. n_window denotes the size of the time window and is fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    accurate_elbo : bool
        If True, 100 MC samples are used for accurate calculation of ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the Adagrad learning-rate scale.
        This parameter is ignored when an optimizer is given.
    tol_obj : float
        Relative tolerance for testing convergence of ELBO.
    eval_elbo : int
        Window for checking convergence of ELBO. Convergence is checked at
        every multiple of eval_elbo iterations.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.
    mode : string or `Mode` instance
        Compilation mode passed to Theano functions.
    progressbar : bool
        Whether or not to display a progress bar in the command line. The
        bar shows the percentage of completion, the sampling speed in
        samples per second (SPS), the estimated remaining time until
        completion ("expected time of arrival"; ETA), and the current ELBO.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    'means' is the mean, 'stds' is the standard deviation, and 'elbo_vals' is
    the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if len(vars) == 0:
        raise ValueError('No free random variables to fit.')

    if not pm.model.all_continuous(vars):
        raise ValueError('Model cannot include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars,
                              model,
                              n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode)

    # For tracking convergence of ELBO
    window_size = int(max(0.1 * n // eval_elbo, 2.0))
    circ_buff = deque([], maxlen=window_size)

    # Optimization loop
    elbos = np.empty(n)
    divergence_flag = False
    progress = trange(n) if progressbar else range(n)
    try:
        uw_i, elbo_current = f()
        if np.isnan(elbo_current):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        for i in progress:
            uw_i, e = f()
            if np.isnan(e):
                raise FloatingPointError('NaN occurred in ADVI optimization.')
            elbos[i] = e

            if progressbar:
                if n < 10:
                    progress.set_description('ELBO = {:,.5g}'.format(elbos[i]))
                elif i % (n // 10) == 0 and i > 0:
                    avg_elbo = infmean(elbos[i - n // 10:i])
                    progress.set_description(
                        'Average ELBO = {:,.5g}'.format(avg_elbo))

            if i % eval_elbo == 0:
                elbo_prev = elbo_current
                elbo_current = elbos[i]
                delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev)
                circ_buff.append(delta_elbo)
                avg_delta = np.mean(circ_buff)
                med_delta = np.median(circ_buff)

                if i > 0 and avg_delta < tol_obj:
                    pm._log.info('Mean ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                elif i > 0 and med_delta < tol_obj:
                    pm._log.info('Median ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                if i > 10 * eval_elbo:
                    if med_delta > 0.5 or avg_delta > 0.5:
                        divergence_flag = True
                    else:
                        divergence_flag = False

    except KeyboardInterrupt:
        elbos = elbos[:i]
        if n < 10:
            pm._log.info(
                'Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format(
                    i, 100 * i // n, elbos[i]))
        else:
            avg_elbo = infmean(elbos[i - n // 10:i])
            pm._log.info(
                'Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.
                format(i, 100 * i // n, avg_elbo))
    else:
        if n < 10:
            pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1]))
        else:
            avg_elbo = infmean(elbos[-n // 10:])
            pm._log.info(
                'Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))
    finally:
        if progressbar:
            progress.close()

    if divergence_flag:
        pm._log.info('Evidence of divergence detected, inspect ELBO.')

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
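The convergence test this variant adds can be read on its own; a small standalone sketch of the same idea (hypothetical helper, not part of PyMC3): keep a bounded window of relative ELBO changes sampled every `eval_elbo` iterations and stop once their mean or median falls below `tol_obj`:

# Standalone sketch of the windowed relative-change convergence test used above.
from collections import deque
import numpy as np

def elbo_converged(elbo_trace, eval_elbo=100, tol_obj=0.01, window_size=10):
    """Return True if the mean or median relative ELBO change is below tol_obj."""
    checkpoints = elbo_trace[::eval_elbo]          # ELBO values sampled every eval_elbo steps
    deltas = deque(maxlen=window_size)             # bounded window, like circ_buff above
    for prev, curr in zip(checkpoints[:-1], checkpoints[1:]):
        deltas.append(abs((curr - prev) / prev))
    if not deltas:
        return False
    return np.mean(deltas) < tol_obj or np.median(deltas) < tol_obj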
Example #8
def advi_minibatch(vars=None,
                   start=None,
                   model=None,
                   n=5000,
                   n_mcsamples=1,
                   minibatch_RVs=None,
                   minibatch_tensors=None,
                   minibatches=None,
                   local_RVs=None,
                   observed_RVs=None,
                   encoder_params=None,
                   total_size=None,
                   optimizer=None,
                   learning_rate=.001,
                   epsilon=.1,
                   random_seed=None):
    """Perform mini-batch ADVI.

    This function implements mini-batch ADVI with the meanfield
    approximation. Autoencoding variational inference is also supported.

    The log probability terms for mini-batches, corresponding to RVs in
    minibatch_RVs, are scaled by (total_size) / (the number of samples in each
    mini-batch), where total_size is an argument giving the total data size.

    minibatch_tensors is a list of tensors (can be shared variables) to which
    mini-batch samples are set during the optimization. In most cases, these
    tensors are observations for RVs in the model.

    local_RVs and observed_RVs are used for autoencoding variational Bayes.
    Both kinds of RVs are associated with each of the given samples.
    The difference is that local_RVs are unknown and their posterior
    distributions are approximated.

    local_RVs is an OrderedDict whose keys are RVs and whose values are tuples
    of two objects. The first is the Theano expression of the variational
    parameters (mean and log of std) of the approximate posterior, which are
    encoded from the given samples by an arbitrary deterministic function,
    e.g., an MLP. The second is a scaling constant by which the log probability
    term corresponding to the RV is multiplied.

    observed_RVs is also an OrderedDict with RVs as the keys, but its values
    are only the scaling constants, as in local_RVs. In this case, total_size
    is ignored.

    If local_RVs is None (i.e., not using an autoencoder), the following two
    settings are equivalent:

    - observed_RVs=OrderedDict([(rv, total_size / minibatch_size)])
    - minibatch_RVs=[rv], total_size=total_size

    where minibatch_size is minibatch_tensors[0].shape[0].

    The variational parameters and the parameters of the autoencoder are
    simultaneously optimized with the given optimizer, which is a function that
    returns a dictionary of parameter updates as provided to a Theano function.
    See the docstring of pymc3.variational.advi().

    Parameters
    ----------
    vars : object
        List of random variables. If None, variational posteriors (normal
        distribution) are fit for all RVs in the given model.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating parameters.
    n_mcsamples : int
        Number of Monte Carlo samples to approximate ELBO.
    minibatch_RVs : list of ObservedRVs
        Random variables in the model for which mini-batch tensors are set.
        When this argument is given, both local_RVs and observed_RVs must be
        None.
    minibatch_tensors : list of (tensors or shared variables)
        Tensors used to create ObservedRVs in minibatch_RVs.
    minibatches : generator of list
        Generates a set of minibatches when calling next().
        The length of the returned list must match the number of tensors in
        `minibatch_tensors`.
    total_size : int
        Total size of training samples. This is used to appropriately scale the
        log likelihood terms corresponding to mini-batches in ELBO.
    local_RVs : OrderedDict
        Includes encoded variational parameters and a scaling constant for
        the corresponding RV. See the description above.
    observed_RVs : OrderedDict
        Includes a scaling constant for the corresponding RV. See the
        description above.
    encoder_params : list of theano shared variables
        Parameters of the encoder.
    optimizer : (loss, list of shared variables) -> dict or OrderedDict
        A function that returns parameter updates given loss and shared
        variables of parameters. If :code:`None` (default), a default
        Adagrad optimizer is used with parameters :code:`learning_rate`
        and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when
        an optimizer is given.
    epsilon : float
        Offset in the denominator of the Adagrad learning-rate scale.
        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.
    """
    theano.config.compute_test_value = 'ignore'

    model = pm.modelcontext(model)
    vars = inputvars(vars if vars is not None else model.vars)
    start = start if start is not None else model.test_point
    check_discrete_rvs(vars)
    _check_minibatches(minibatch_tensors, minibatches)

    if encoder_params is None:
        encoder_params = []

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # For backward compatibility in how input arguments are given
    local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                        minibatch_tensors, total_size)

    # Replace local_RVs with transformed variables
    ds = model.deterministics

    def get_transformed(v):
        if v in ds:
            return v.transformed
        return v

    local_RVs = OrderedDict([(get_transformed(v), (uw, s))
                             for v, (uw, s) in local_RVs.items()])

    # Get global variables
    global_RVs = list(set(vars) - set(list(local_RVs) + list(observed_RVs)))

    # Ordering for concatenation of random variables
    global_order = pm.ArrayOrdering([v for v in global_RVs])
    local_order = pm.ArrayOrdering([v for v in local_RVs])

    # ELBO wrt variational parameters
    inarray_g, uw_g, replace_g = _join_global_RVs(global_RVs, global_order)
    inarray_l, uw_l, replace_l = _join_local_RVs(local_RVs, local_order)
    logpt = _make_logpt(global_RVs, local_RVs, observed_RVs, model)
    replace = replace_g
    replace.update(replace_l)
    logp = theano.clone(logpt, replace, strict=False)
    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples,
                   random_seed)
    del logpt

    # Replacements tensors of variational parameters in the graph
    replaces = dict()

    # Variational parameters for global RVs
    if 0 < len(global_RVs):
        uw_global_shared, bij = _init_uw_global_shared(start, global_RVs,
                                                       global_order)
        replaces.update({uw_g: uw_global_shared})

    # Variational parameters for local RVs, encoded from samples in
    # mini-batches
    if 0 < len(local_RVs):
        uws = [uw for _, (uw, _) in local_RVs.items()]
        uw_local_encoded = tt.concatenate([uw[0].ravel() for uw in uws] +
                                          [uw[1].ravel() for uw in uws])
        replaces.update({uw_l: uw_local_encoded})

    # Replace tensors of variational parameters in ELBO
    elbo = theano.clone(elbo, OrderedDict(replaces), strict=False)

    # Replace input shared variables with tensors
    def is_shared(t):
        return isinstance(t, theano.compile.sharedvalue.SharedVariable)

    tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors]
    updates = OrderedDict(
        {t: t_
         for t, t_ in zip(minibatch_tensors, tensors) if is_shared(t)})
    elbo = theano.clone(elbo, updates, strict=False)

    # Create parameter update function used in the training loop
    params = encoder_params
    if 0 < len(global_RVs):
        params += [uw_global_shared]
    updates = OrderedDict(optimizer(loss=-1 * elbo, param=params))
    f = theano.function(tensors, elbo, updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    progress = tqdm.trange(n)
    for i in progress:
        e = f(*next(minibatches))
        elbos[i] = e
        if i % (n // 10) == 0 and i > 0:
            avg_elbo = elbos[i - n // 10:i].mean()
            progress.set_description('Average ELBO = {:,.2f}'.format(avg_elbo))

    pm._log.info('Finished minibatch ADVI: ELBO = {:,.2f}'.format(elbos[-1]))

    # Variational parameters of global RVs
    if 0 < len(global_RVs):
        l = int(uw_global_shared.get_value(borrow=True).size / 2)
        u = bij.rmap(uw_global_shared.get_value(borrow=True)[:l])
        w = bij.rmap(uw_global_shared.get_value(borrow=True)[l:])
        # w is in log space
        for var in w.keys():
            w[var] = np.exp(w[var])
    else:
        u = dict()
        w = dict()

    return ADVIFit(u, w, elbos)
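A hedged usage sketch of the mini-batch interface (older PyMC3 3.x API; data, batch size, and variable names are illustrative): the observed data lives in a Theano shared variable listed in `minibatch_tensors`, a generator yields one batch per iteration, and `total_size` rescales the mini-batch log likelihood:

# Hypothetical usage sketch for the old pymc3.variational.advi_minibatch API.
import numpy as np
import theano
import pymc3 as pm

data = np.random.randn(1000)
batch_size = 100
data_t = theano.shared(data[:batch_size])   # holds the current mini-batch

def create_minibatches():
    while True:
        idx = np.random.choice(len(data), batch_size, replace=False)
        yield [data[idx]]                   # one entry per tensor in minibatch_tensors

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    obs = pm.Normal('obs', mu=mu, sd=1., observed=data_t)

    v_params = pm.variational.advi_minibatch(
        n=5000,
        minibatch_tensors=[data_t],
        minibatch_RVs=[obs],
        minibatches=create_minibatches(),
        total_size=len(data))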
Example #9
    def bijection(self):
        return pm.DictToArrayBijection(
            pm.ArrayOrdering(pm.inputvars(self.model.cont_vars)),
            self.model.test_point)