Example #1
def delta_logp(logp, vars, shared):
    # Flatten the free variables into a single input vector and rewrite the
    # log-probability graph in terms of that vector.
    [logp0], inarray0 = pm.join_nonshared_inputs([logp], vars, shared)

    tensor_type = inarray0.type
    inarray1 = tensor_type("inarray1")

    # Clone the graph so the log-probability can be evaluated at a second point.
    logp1 = pm.CallableTensor(logp0)(inarray1)

    # Compiled function returning logp(point1) - logp(point0).
    f = aesara.function([inarray1, inarray0], logp1 - logp0)
    f.trust_input = True
    return f
Example #2
def delta_logp(point, logp, vars, shared):
    # Same idea as above, but an initial point is threaded through
    # join_nonshared_inputs and the function is compiled with
    # compile_rv_inplace, which handles random-variable updates.
    [logp0], inarray0 = pm.join_nonshared_inputs(point, [logp], vars, shared)

    tensor_type = inarray0.type
    inarray1 = tensor_type("inarray1")

    logp1 = pm.CallableTensor(logp0)(inarray1)

    f = compile_rv_inplace([inarray1, inarray0], logp1 - logp0)
    f.trust_input = True
    return f
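Both variants compile a function of two flattened parameter vectors that returns the difference of the model's joint log-probability, i.e. the log acceptance ratio used by Metropolis-style step methods (the second variant additionally threads an initial point through join_nonshared_inputs). A minimal stand-in sketch of what the compiled function computes, using an illustrative NumPy log-probability rather than a real model graph:

import numpy as np

# Stand-in joint log-probability (independent standard-normal terms); only
# meant to illustrate what the function returned by delta_logp computes.
def toy_logp(q):
    return -0.5 * np.sum(q ** 2)

q0 = np.array([0.0, 1.0])                              # current point, free variables flattened
q = q0 + np.random.normal(scale=0.5, size=q0.shape)    # proposed point

log_accept_ratio = toy_logp(q) - toy_logp(q0)          # what delta_logp(...)(q, q0) returns
print(log_accept_ratio)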
Example #3
def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=None):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements mean-field ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian with no
    correlation between parameters, and is fitted to the true posterior
    distribution. The means and standard deviations of the variational
    posterior are referred to as the variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    contains the variational parameters. If you want to draw samples from the
    variational posterior, you need to pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space, as with the MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which is
    a function that returns a dictionary of parameter updates in the form
    expected by a Theano function. If no optimizer is provided, optimization is
    performed with a modified version of Adagrad, where only the last n_window
    gradient vectors are used to control the learning rate and older gradient
    vectors are ignored. n_window denotes the size of the time window and is
    fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating the parameters.
    accurate_elbo : bool
        If True, 100 MC samples are used for accurate calculation of the ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the learning-rate scale in Adagrad.
        This parameter is ignored when an optimizer is given.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    'means' holds the variational means, 'stds' the standard deviations, and
    'elbo_vals' the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if not pm.model.all_continuous(vars):
        raise ValueError('Model should not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    # uw stacks the variational means (u) and log standard deviations (w)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates)

    # Optimization loop
    elbos = np.empty(n)
    try:
        progress = trange(n)
        for i in progress:
            uw_i, e = f()
            elbos[i] = e
            if i % (n // 10) == 0 and i > 0:
                avg_elbo = elbos[i - n // 10:i].mean()
                progress.set_description('Average ELBO = {:,.5g}'.format(avg_elbo))
    except KeyboardInterrupt:
        elbos = elbos[:i]
        avg_elbo = elbos[i - n // 10:].mean()
        pm._log.info('Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.format(
            i, 100 * i // n, avg_elbo))
    else:
        avg_elbo = elbos[-n // 10:].mean()
        pm._log.info('Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
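A minimal usage sketch for this version, assuming the PyMC3-era API in which the function is exposed as pymc3.variational.advi and samples are drawn from the fitted posterior with pymc3.variational.sample_vp; the model and data below are illustrative:

import numpy as np
import pymc3 as pm

data = np.random.randn(100)

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=10)
    pm.Normal('obs', mu=mu, sd=1, observed=data)
    # Fit the mean-field approximation, then draw samples from it.
    v_params = pm.variational.advi(n=10000)
    trace = pm.variational.sample_vp(v_params, draws=500)

print(v_params.means, v_params.stds)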
Example #4
def advi(vars=None,
         start=None,
         model=None,
         n=5000,
         accurate_elbo=False,
         optimizer=None,
         learning_rate=.001,
         epsilon=.1,
         mode=None,
         tol_obj=0.01,
         eval_elbo=100,
         random_seed=None,
         progressbar=True):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements mean-field ADVI, where the variational
    posterior distribution is assumed to be a spherical Gaussian with no
    correlation between parameters, and is fitted to the true posterior
    distribution. The means and standard deviations of the variational
    posterior are referred to as the variational parameters.

    The return value of this function is an :code:`ADVIFit` object, which
    contains the variational parameters. If you want to draw samples from the
    variational posterior, you need to pass the :code:`ADVIFit` object to
    :code:`pymc3.variational.sample_vp()`.

    The variational parameters are defined on the transformed space, which is
    required to do ADVI on an unconstrained parameter space as described in
    [KTR+2016]. The parameters in the :code:`ADVIFit` object are in the
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space, as with the MCMC sampling methods in PyMC3.

    The variational parameters are optimized with the given optimizer, which is
    a function that returns a dictionary of parameter updates in the form
    expected by a Theano function. If no optimizer is provided, optimization is
    performed with a modified version of Adagrad, where only the last n_window
    gradient vectors are used to control the learning rate and older gradient
    vectors are ignored. n_window denotes the size of the time window and is
    fixed to 10.

    Parameters
    ----------
    vars : object
        Random variables.
    start : Dict or None
        Initial values of parameters (variational means).
    model : Model
        Probabilistic model.
    n : int
        Number of iterations updating the parameters.
    accurate_elbo : bool
        If True, 100 MC samples are used for accurate calculation of the ELBO.
    optimizer : (loss, tensor) -> dict or OrderedDict
        A function that returns parameter updates given loss and parameter
        tensor. If :code:`None` (default), a default Adagrad optimizer is
        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate : float
        Base learning rate for Adagrad. This parameter is ignored when an
        optimizer is given.
    epsilon : float
        Offset in the denominator of the learning-rate scale in Adagrad.
        This parameter is ignored when an optimizer is given.
    tol_obj : float
        Relative tolerance for testing convergence of the ELBO.
    eval_elbo : int
        Window for checking convergence of the ELBO. Convergence is checked
        at every multiple of eval_elbo iterations.
    random_seed : int or None
        Seed to initialize random state. None uses current seed.
    mode : string or `Mode` instance
        Compilation mode passed to Theano functions.
    progressbar : bool
        Whether or not to display a progress bar in the command line. The
        bar shows the percentage of completion, the sampling speed in
        samples per second (SPS), the estimated remaining time until
        completion ("expected time of arrival"; ETA), and the current ELBO.

    Returns
    -------
    ADVIFit
        Named tuple, which includes 'means', 'stds', and 'elbo_vals'.

    'means' holds the variational means, 'stds' the standard deviations, and
    'elbo_vals' the trace of ELBO values during optimization.

    References
    ----------
    .. [KTR+2016] Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
        and Blei, D. M. (2016). Automatic Differentiation Variational
        Inference. arXiv preprint arXiv:1603.00788.
    """
    model = pm.modelcontext(model)
    if start is None:
        start = model.test_point

    if vars is None:
        vars = model.vars
    vars = pm.inputvars(vars)

    if len(vars) == 0:
        raise ValueError('No free random variables to fit.')

    if not pm.model.all_continuous(vars):
        raise ValueError('Model can not include discrete RVs for ADVI.')

    n_mcsamples = 100 if accurate_elbo else 1

    # Prepare optimizer
    if optimizer is None:
        optimizer = adagrad_optimizer(learning_rate, epsilon)

    # Create variational gradient tensor
    elbo, shared = _calc_elbo(vars,
                              model,
                              n_mcsamples=n_mcsamples,
                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
        share.set_value(start[str(var)])

    order = pm.ArrayOrdering(vars)
    bij = pm.DictToArrayBijection(order, start)
    u_start = bij.map(start)
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

    # Create parameter update function used in the training loop
    uw_shared = theano.shared(uw, 'uw_shared')
    elbo = pm.CallableTensor(elbo)(uw_shared)
    updates = optimizer(loss=-1 * elbo, param=[uw_shared])
    f = theano.function([], [uw_shared, elbo], updates=updates, mode=mode)

    # For tracking convergence of ELBO
    window_size = int(max(0.1 * n // eval_elbo, 2.0))
    circ_buff = deque([], maxlen=window_size)

    # Optimization loop
    elbos = np.empty(n)
    divergence_flag = False
    progress = trange(n) if progressbar else range(n)
    try:
        uw_i, elbo_current = f()
        if np.isnan(elbo_current):
            raise FloatingPointError('NaN occurred in ADVI optimization.')
        for i in progress:
            uw_i, e = f()
            if np.isnan(e):
                raise FloatingPointError('NaN occurred in ADVI optimization.')
            elbos[i] = e

            if progressbar:
                if n < 10:
                    progress.set_description('ELBO = {:,.5g}'.format(elbos[i]))
                elif i % (n // 10) == 0 and i > 0:
                    avg_elbo = infmean(elbos[i - n // 10:i])
                    progress.set_description(
                        'Average ELBO = {:,.5g}'.format(avg_elbo))

            if i % eval_elbo == 0:
                elbo_prev = elbo_current
                elbo_current = elbos[i]
                delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev)
                circ_buff.append(delta_elbo)
                avg_delta = np.mean(circ_buff)
                med_delta = np.median(circ_buff)

                if i > 0 and avg_delta < tol_obj:
                    pm._log.info('Mean ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                elif i > 0 and med_delta < tol_obj:
                    pm._log.info('Median ELBO converged.')
                    elbos = elbos[:(i + 1)]
                    break
                if i > 10 * eval_elbo:
                    divergence_flag = med_delta > 0.5 or avg_delta > 0.5

    except KeyboardInterrupt:
        elbos = elbos[:i]
        if n < 10:
            pm._log.info(
                'Interrupted at {:,d} [{:.0f}%]: ELBO = {:,.5g}'.format(
                    i, 100 * i // n, elbos[i]))
        else:
            avg_elbo = infmean(elbos[i - n // 10:i])
            pm._log.info(
                'Interrupted at {:,d} [{:.0f}%]: Average ELBO = {:,.5g}'.
                format(i, 100 * i // n, avg_elbo))
    else:
        if n < 10:
            pm._log.info('Finished [100%]: ELBO = {:,.5g}'.format(elbos[-1]))
        else:
            avg_elbo = infmean(elbos[-n // 10:])
            pm._log.info(
                'Finished [100%]: Average ELBO = {:,.5g}'.format(avg_elbo))
    finally:
        if progressbar:
            progress.close()

    if divergence_flag:
        pm._log.info('Evidence of divergence detected, inspect ELBO.')

    # Estimated parameters
    l = int(uw_i.size / 2)
    u = bij.rmap(uw_i[:l])
    w = bij.rmap(uw_i[l:])
    # w is in log space
    for var in w.keys():
        w[var] = np.exp(w[var])

    return ADVIFit(u, w, elbos)
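The convergence test in this version compares consecutive ELBO evaluations by their relative change and keeps the recent changes in a small circular buffer. A standalone sketch of that criterion in plain NumPy, with illustrative values for tol_obj, the window size, and the ELBO readings:

from collections import deque

import numpy as np

tol_obj = 0.01        # relative tolerance, as in the advi() signature above
window_size = 5       # illustrative; advi() derives it from n and eval_elbo
circ_buff = deque([], maxlen=window_size)

elbo_trace = [-1200.0, -1100.0, -1050.0, -1048.0, -1047.5]   # made-up ELBO readings
elbo_current = elbo_trace[0]
for elbo_next in elbo_trace[1:]:
    elbo_prev, elbo_current = elbo_current, elbo_next
    delta_elbo = abs((elbo_current - elbo_prev) / elbo_prev)  # relative change
    circ_buff.append(delta_elbo)

converged = np.mean(circ_buff) < tol_obj or np.median(circ_buff) < tol_obj
print(converged)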