def _statistical_inefficiency(time_series: np.ndarray,
                              minimum_samples: int = 3) -> float:
    """Calculates the statistical inefficiency of a time series.

    Parameters
    ----------
    time_series
        The time series to analyse with shape=(n_data_points, n_dimensions).
    minimum_samples: int
        The minimum number of data points to consider in the calculation.

    Notes
    -----
    This method is based on the paper by J. D. Chodera [1]_ and the implementation at
    https://github.com/choderalab/pymbar. Here the code is extended to support
    multidimensional data such as dipole moments.

    References
    ----------
    [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the
        weighted histogram analysis method for the analysis of simulated and parallel
        tempering simulations. JCTC 3(1):26-41, 2007.

    Returns
    -------
        The statistical inefficiency.
    """

    # Make sure the time series has a consistent shape of (n_data_points, n_dimensions)
    dimension = 1 if len(time_series.shape) == 1 else time_series.shape[1]
    standardised_time_series = time_series.reshape(
        (len(time_series), dimension))

    number_of_timesteps = standardised_time_series.shape[0]

    time_series_mean = standardised_time_series.mean(axis=0)
    time_series_shifted = standardised_time_series - time_series_mean

    sigma_squared = np.mean(np.sum(time_series_shifted * time_series_shifted,
                                   axis=1),
                            axis=0)

    if sigma_squared == 0:
        raise ParameterError(
            "Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency"
        )

    current_timestep = 1
    statistical_inefficiency = 1.0

    while current_timestep < number_of_timesteps - 1:

        autocorrelation_function = (np.sum(
            np.sum(
                time_series_shifted[0:(number_of_timesteps - current_timestep)]
                * time_series_shifted[current_timestep:number_of_timesteps],
                axis=1,
            ),
            axis=0,
        ) / (float(number_of_timesteps - current_timestep) * sigma_squared))

        if autocorrelation_function <= 0.0 and current_timestep > minimum_samples:
            break

        statistical_inefficiency += (
            2.0 * autocorrelation_function *
            (1.0 - float(current_timestep) / float(number_of_timesteps)))

        current_timestep += 1

    # Enforce a minimum autocorrelation time of 0.
    if statistical_inefficiency < 1.0:
        statistical_inefficiency = 1.0

    return statistical_inefficiency
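
# Illustrative usage (a minimal sketch, not part of the original module). It
# assumes ``numpy`` is already imported as ``np``, as the function above
# requires, and substitutes a synthetic AR(1) series for real multidimensional
# data such as a dipole-moment trajectory.
rng = np.random.default_rng(0)
noise = rng.normal(size=(5000, 3))
example_series = np.empty_like(noise)
example_series[0] = noise[0]
for i in range(1, len(noise)):
    # Exponentially correlated steps give the series a non-trivial correlation time.
    example_series[i] = 0.9 * example_series[i - 1] + noise[i]

g_example = _statistical_inefficiency(example_series)
n_effective = example_series.shape[0] / g_example  # effective number of uncorrelated samples
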
Example #2
def statisticalInefficiency(A_n, B_n=None, fast=False, mintime=3, fft=False):
    """Compute the (cross) statistical inefficiency of (two) timeseries.

    Parameters
    ----------
    A_n : np.ndarray, float
        A_n[n] is nth value of timeseries A.  Length is deduced from vector.
    B_n : np.ndarray, float, optional, default=None
        B_n[n] is nth value of timeseries B.  Length is deduced from vector.
        If supplied, the cross-correlation of timeseries A and B will be estimated instead of the
        autocorrelation of timeseries A.  
    fast : bool, optional, default=False
        If True, will use a faster (but less accurate) method to estimate the correlation
        time, described in Ref. [1] (default: False).  This is ignored
        when B_n=None and fft=True.
    mintime : int, optional, default=3
        minimum amount of correlation function to compute (default: 3)
        The algorithm terminates after computing the correlation time out to mintime when the
        correlation function first goes negative.  Note that this time may need to be increased
        if there is a strong initial negative peak in the correlation function.
    fft : bool, optional, default=False
        If fft=True and B_n=None, then use the fft based approach, as
        implemented in statisticalInefficiency_fft().

    Returns
    -------
    g : float
        g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the correlation time).
        We enforce g >= 1.0.

    Notes
    -----
    The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency.
    The fast method described in Ref [1] is used to compute g.

    References
    ----------
    [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted
        histogram analysis method for the analysis of simulated and parallel tempering simulations.
        JCTC 3(1):26-41, 2007.

    Examples
    --------

    Compute statistical inefficiency of timeseries data with known correlation time.  

    >>> from pymbar.testsystems import correlated_timeseries_example
    >>> A_n = correlated_timeseries_example(N=100000, tau=5.0)
    >>> g = statisticalInefficiency(A_n, fast=True)

    """

    # Create numpy copies of input arguments.
    A_n = np.array(A_n)

    if fft and B_n is None:
        return statisticalInefficiency_fft(A_n, mintime=mintime)

    if B_n is not None:
        B_n = np.array(B_n)
    else:
        B_n = np.array(A_n)

    # Get the length of the timeseries.
    N = A_n.size

    # Be sure A_n and B_n have the same dimensions.
    if (A_n.shape != B_n.shape):
        raise ParameterError('A_n and B_n must have same dimensions.')

    # Initialize statistical inefficiency estimate with uncorrelated value.
    g = 1.0

    # Compute mean of each timeseries.
    mu_A = A_n.mean()
    mu_B = B_n.mean()

    # Make temporary copies of fluctuation from mean.
    dA_n = A_n.astype(np.float64) - mu_A
    dB_n = B_n.astype(np.float64) - mu_B

    # Compute estimator of covariance of (A,B) using estimator that will ensure C(0) = 1.
    sigma2_AB = (dA_n * dB_n).mean()  # standard estimator to ensure C(0) = 1

    # Trap the case where this covariance is zero, and we cannot proceed.
    if (sigma2_AB == 0):
        raise ParameterError(
            'Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency'
        )

    # Accumulate the integrated correlation time by computing the normalized correlation time at
    # increasing values of t.  Stop accumulating if the correlation function goes negative, since
    # this is unlikely to occur unless the correlation function has decayed to the point where it
    # is dominated by noise and indistinguishable from zero.
    t = 1
    increment = 1
    while (t < N - 1):

        # compute normalized fluctuation correlation function at time t
        C = np.sum(dA_n[0:(N - t)] * dB_n[t:N] + dB_n[0:(N - t)] *
                   dA_n[t:N]) / (2.0 * float(N - t) * sigma2_AB)
        # Terminate if the correlation function has crossed zero and we've computed the correlation
        # function at least out to 'mintime'.
        if (C <= 0.0) and (t > mintime):
            break

        # Accumulate contribution to the statistical inefficiency.
        g += 2.0 * C * (1.0 - float(t) / float(N)) * float(increment)

        # Increment t and the amount by which we increment t.
        t += increment

        # Increase the interval if "fast mode" is on.
        if fast:
            increment += 1

    # g must be at least unity
    if (g < 1.0):
        g = 1.0

    # Return the computed statistical inefficiency.
    return g
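
# Illustrative follow-up (a minimal sketch, not part of the original module):
# g converts into an effective number of uncorrelated samples, N / g, and a
# crude way to decorrelate the data is to keep every ceil(g)-th frame. This
# assumes ``numpy`` is imported as ``np`` and that the test system used in the
# docstring example above is available.
from pymbar.testsystems import correlated_timeseries_example

A_example = correlated_timeseries_example(N=100000, tau=5.0)
g_example = statisticalInefficiency(A_example)
n_eff = A_example.size / g_example        # effective number of samples
stride = int(np.ceil(g_example))
A_uncorrelated = A_example[::stride]      # approximately uncorrelated subset
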
Example #3
def normalizedFluctuationCorrelationFunctionMultiple(A_kn,
                                                     B_kn=None,
                                                     N_max=None,
                                                     norm=True,
                                                     truncate=False):
    """Compute the normalized fluctuation (cross) correlation function of (two) timeseries from multiple timeseries samples.

    C(t) = (<A(t) B(t)> - <A><B>) / (<AB> - <A><B>)
    This may be useful in diagnosing odd time-correlations in timeseries data.

    Parameters
    ----------
    A_kn : Python list of numpy arrays
        A_kn[k] is the kth timeseries, and A_kn[k][n] is nth value of timeseries k.  Length is deduced from arrays.
    B_kn : Python list of numpy arrays
        B_kn[k] is the kth timeseries, and B_kn[k][n] is nth value of timeseries k.  B_kn[k] must have same length as A_kn[k]
    N_max : int, optional, default=None
        if specified, will only compute correlation function out to time lag of N_max
    norm: bool, optional, default=True
        if False, will return unnormalized D(t) = <A(t) B(t)>
    truncate: bool, optional, default=False
        if True, will stop calculating the correlation function when it goes below 0

    Returns
    -------
    C_n[n] : np.ndarray
        The normalized fluctuation auto- or cross-correlation function for timeseries A(t) and B(t).

    Notes
    -----
    The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency.
    This procedure may be slow.
    The statistical error in C_n[n] will grow with increasing n.  No effort is made here to estimate the uncertainty.

    References
    ----------
    [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted
    histogram analysis method for the analysis of simulated and parallel tempering simulations.
    JCTC 3(1):26-41, 2007.

    Examples
    --------

    Estimate a portion of the normalized fluctuation autocorrelation function from multiple timeseries of different length.

    >>> from pymbar import testsystems
    >>> N_k = [1000, 2000, 3000, 4000, 5000]
    >>> tau = 5.0 # exponential relaxation time
    >>> A_kn = [ testsystems.correlated_timeseries_example(N=N, tau=tau) for N in N_k ]
    >>> C_n = normalizedFluctuationCorrelationFunctionMultiple(A_kn, N_max=25)

    """

    # If B_kn is not specified, define it to be identical with A_kn.
    if B_kn is None:
        B_kn = A_kn

    # TODO: Change this to support other iterable types, like sets.
    # Make sure A_kn and B_kn are both lists
    if (type(A_kn) is not list) or (type(B_kn) is not list):
        raise ParameterError(
            "A_kn and B_kn must each be a list of numpy arrays.")

    # Ensure the same number of timeseries are stored in A_kn and B_kn.
    if (len(A_kn) != len(B_kn)):
        raise ParameterError(
            "A_kn and B_kn must contain corresponding timeseries -- different numbers of timeseries detected in each."
        )

    # Determine number of timeseries stored.
    K = len(A_kn)

    # Ensure both observable trajectories in each timeseries are of the same length.
    for k in range(K):
        A_n = A_kn[k]
        B_n = B_kn[k]
        if A_n.size != B_n.size:
            raise ParameterError(
                "A_kn and B_kn must contain corresponding timeseries -- lack of correspondence in timeseries lenghts detected."
            )

    # Get the length of each timeseries.
    N_k = np.zeros([K], np.int32)
    for k in range(K):
        N_k[k] = A_kn[k].size

    # Determine total number of samples.
    N = np.sum(N_k)

    # Set maximum time to compute the correlation function for.
    if (not N_max) or (N_max > max(N_k) - 1):
        N_max = max(N_k) - 1

    # Compute means.
    mu_A = 0.0
    mu_B = 0.0
    for k in range(K):
        mu_A += np.sum(A_kn[k])
        mu_B += np.sum(B_kn[k])
    mu_A /= float(N)
    mu_B /= float(N)

    # Compute fluctuation timeseries.
    dA_kn = list()
    dB_kn = list()
    for k in range(K):
        dA_n = A_kn[k] - mu_A
        dB_n = B_kn[k] - mu_B
        dA_kn.append(dA_n)
        dB_kn.append(dB_n)

    # Compute covariance.
    sigma2_AB = 0.0
    for k in range(K):
        sigma2_AB += np.sum(dA_kn[k] * dB_kn[k])
    sigma2_AB /= float(N)

    # allocate storage for normalized fluctuation correlation function
    C_n = np.zeros([N_max + 1], np.float64)

    # Accumulate the integrated correlation time by computing the normalized correlation time at
    # increasing values of t.  Stop accumulating if the correlation function goes negative, since
    # this is unlikely to occur unless the correlation function has decayed to the point where it
    # is dominated by noise and indistinguishable from zero.
    t = 0
    negative = False
    for t in range(0, N_max + 1):
        # compute unnormalized correlation function
        numerator = 0.0
        denominator = 0.0
        for k in range(K):
            if (t >= N_k[k]):
                continue  # skip this trajectory if t is longer than the timeseries
            numerator += np.sum(dA_kn[k][0:(N_k[k] - t)] * dB_kn[k][t:N_k[k]])
            denominator += float(N_k[k] - t)
            if truncate and numerator < 0:
                negative = True
        C = numerator / denominator

        # compute normalized fluctuation correlation function at time t
        C /= sigma2_AB

        # Store correlation function.
        C_n[t] = C

        if negative:
            break

    # Return the computed fluctuation correlation function.
    if norm:
        return C_n[:t]
    else:
        return C_n[:t] * sigma2_AB + mu_A * mu_B
Example #4
def normalizedFluctuationCorrelationFunction(A_n,
                                             B_n=None,
                                             N_max=None,
                                             norm=True):
    """Compute the normalized fluctuation (cross) correlation function of (two) stationary timeseries.

    C(t) = (<A(t) B(t)> - <A><B>) / (<AB> - <A><B>)

    This may be useful in diagnosing odd time-correlations in timeseries data.

    Parameters
    ----------
    A_n : np.ndarray
        A_n[n] is nth value of timeseries A.  Length is deduced from vector.
    B_n : np.ndarray
        B_n[n] is nth value of timeseries B.  Length is deduced from vector.
    N_max : int, default=None
        if specified, will only compute correlation function out to time lag of N_max
    norm: bool, optional, default=True
        if False will return the unnormalized correlation function D(t) = <A(t) B(t)>

    Returns
    -------
    C_n : np.ndarray
        C_n[n] is the normalized fluctuation auto- or cross-correlation function for timeseries A(t) and B(t).

    Notes
    -----
    The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency.
    This procedure may be slow.
    The statistical error in C_n[n] will grow with increasing n.  No effort is made here to estimate the uncertainty.

    References
    ----------
    [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted
    histogram analysis method for the analysis of simulated and parallel tempering simulations.
    JCTC 3(1):26-41, 2007.

    Examples
    --------

    Estimate normalized fluctuation correlation function.

    >>> from pymbar import testsystems
    >>> A_t = testsystems.correlated_timeseries_example(N=10000, tau=5.0)
    >>> C_t = normalizedFluctuationCorrelationFunction(A_t, N_max=25)

    """

    # If B_n is not specified, set it to be identical to A_n.
    if B_n is None:
        B_n = A_n

    # Create np copies of input arguments.
    A_n = np.array(A_n)
    B_n = np.array(B_n)

    # Get the length of the timeseries.
    N = A_n.size

    # Set maximum time to compute the correlation function for.
    if (not N_max) or (N_max > N - 1):
        N_max = N - 1

    # Be sure A_n and B_n have the same dimensions.
    if (A_n.shape != B_n.shape):
        raise ParameterError('A_n and B_n must have same dimensions.')

    # Initialize statistical inefficiency estimate with uncorrelated value.
    g = 1.0

    # Compute means and variance.
    mu_A = A_n.mean()
    mu_B = B_n.mean()

    # Make temporary copies at high precision with means subtracted off.
    dA_n = A_n.astype(np.float64) - mu_A
    dB_n = B_n.astype(np.float64) - mu_B

    # sigma2_AB = sum((A_n-mu_A) * (B_n-mu_B)) / (float(N)-1.0) # unbiased estimator
    sigma2_AB = (dA_n * dB_n).mean()  # standard estimator to ensure C(0) = 1
    if (sigma2_AB == 0):
        raise ParameterError(
            'Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency'
        )

    # allocate storage for normalized fluctuation correlation function
    C_n = np.zeros([N_max + 1], np.float64)

    # Compute normalized correlation function.
    t = 0
    for t in range(0, N_max + 1):
        # compute normalized fluctuation correlation function at time t
        C_n[t] = np.sum(dA_n[0:(N - t)] * dB_n[t:N] + dB_n[0:(N - t)] *
                        dA_n[t:N]) / (2.0 * float(N - t) * sigma2_AB)

    # Return the computed correlation function
    if norm:
        return C_n
    else:
        return C_n * sigma2_AB + mu_A * mu_B
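
# Illustrative cross-check (a minimal sketch, not part of the original module):
# summing the correlation function with the (1 - t/N) weights used by
# statisticalInefficiency above approximates the statistical inefficiency,
# g ~ 1 + 2 * sum_t (1 - t/N) C(t); statisticalInefficiency additionally stops
# the sum once C(t) first goes negative. Assumes ``numpy`` is imported as
# ``np`` and the pymbar test systems are available.
from pymbar import testsystems

A_t_example = testsystems.correlated_timeseries_example(N=10000, tau=5.0)
C_t_example = normalizedFluctuationCorrelationFunction(A_t_example, N_max=100)
N_example = A_t_example.size
t_lags = np.arange(1, C_t_example.size)
g_from_C = 1.0 + 2.0 * np.sum((1.0 - t_lags / float(N_example)) * C_t_example[1:])
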
Example #5
def BAR(w_F, w_R, DeltaF=0.0, compute_uncertainty=True, uncertainty_method='BAR',maximum_iterations=500, relative_tolerance=1.0e-12, verbose=False, method='false-position', iterated_solution=True, return_dict=False):
    """Compute free energy difference using the Bennett acceptance ratio (BAR) method.

    Parameters
    ----------
    w_F : np.ndarray
        w_F[t] is the forward work value from snapshot t.
        t = 0...(T_F-1)  Length T_F is deduced from vector.
    w_R : np.ndarray
        w_R[t] is the reverse work value from snapshot t.
        t = 0...(T_R-1)  Length T_R is deduced from vector.
    DeltaF : float, optional, default=0.0
        DeltaF can be set to initialize the free energy difference with a guess
    compute_uncertainty : bool, optional, default=True
        if False, only the free energy is returned
    uncertainty_method: string, optional, default=BAR
        There are two possible uncertainty estimates for BAR.  One agrees with MBAR for two states exactly;
        The other only agrees with MBAR in the limit of good overlap. See below.
    maximum_iterations : int, optional, default=500
        can be set to limit the maximum number of iterations performed
    relative_tolerance : float, optional, default=1.0e-12
        can be set to determine the relative tolerance convergence criterion
    verbose : bool
        should be set to True if verbose debug output is desired (default False)
    method : str, optional, default='false-position'
        choice of method to solve BAR nonlinear equations, one of 'self-consistent-iteration' or 'false-position' (default: 'false-position')
    iterated_solution : bool, optional, default=True
        whether to fully solve the optimized BAR equation to consistency, or to stop after one step, to be 
        equivalent to transition matrix sampling.
    return_dict : bool, default False
        If true, returns are a dict, else they are a tuple

    Returns
    -------
    'Delta_f' : float
        Free energy difference
        If return_dict, key is 'Delta_f'
    'dDelta_f': float
        Estimated standard deviation of free energy difference
        If return_dict, key is 'dDelta_f'


    References
    ----------

    [1] Shirts MR, Bair E, Hooker G, and Pande VS. Equilibrium free energies from nonequilibrium
    measurements using maximum-likelihood methods. PRL 91(14):140601, 2003.

    Notes
    -----
    The false position method is used to solve the implicit equation.

    Examples
    --------
    Compute free energy difference between two specified samples of work values.

    >>> from pymbar import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> results = BAR(w_F, w_R, return_dict=True)
    >>> print('Free energy difference is {:.3f} +- {:.3f} kT'.format(results['Delta_f'], results['dDelta_f']))
    Free energy difference is 1.088 +- 0.050 kT

    Test completion of various other schemes.

    >>> results = BAR(w_F, w_R, method='self-consistent-iteration', return_dict=True)
    >>> results = BAR(w_F, w_R, method='false-position', return_dict=True)
    >>> results = BAR(w_F, w_R, method='bisection', return_dict=True)

    """

    result_vals = dict()
    # if computing nonoptimized, one step value, we set the max-iterations
    # to 1, and the method to 'self-consistent-iteration'

    if not iterated_solution:
        maximum_iterations = 1
        method = 'self-consistent-iteration'
        DeltaF_initial = DeltaF

    if method == 'self-consistent-iteration':
        nfunc = 0

    if method == 'bisection' or method == 'false-position':
        UpperB = EXP(w_F, return_dict=True)['Delta_f']
        LowerB = -EXP(w_R, return_dict=True)['Delta_f']

        FUpperB = BARzero(w_F, w_R, UpperB)
        FLowerB = BARzero(w_F, w_R, LowerB)
        nfunc = 2

        if (np.isnan(FUpperB) or np.isnan(FLowerB)):
            # this data set is returning NAN -- will likely not work.  Return 0, print a warning:
            # consider returning more information about failure
            print("Warning: BAR is likely to be inaccurate because of poor overlap. Improve the sampling, or decrease the spacing betweeen states.  For now, guessing that the free energy difference is 0 with no uncertainty.")
            if compute_uncertainty:
                result_vals['Delta_f'] = 0.0 
                result_vals['dDelta_f'] = 0.0
                if return_dict:
                    return result_vals
                return 0.0, 0.0
            else:
                result_vals['Delta_f'] = 0.0
                if return_dict:
                    return result_vals
                return 0.0

        while FUpperB * FLowerB > 0:
            # if they have the same sign, they do not bracket.  Widen the bracket until they have opposite signs.
            # There may be a better way to do this, and the above bracket should rarely fail.
            if verbose:
                print('Initial brackets did not actually bracket, widening them')
            FAve = (UpperB + LowerB) / 2
            UpperB = UpperB - max(abs(UpperB - FAve), 0.1)
            LowerB = LowerB + max(abs(LowerB - FAve), 0.1)
            FUpperB = BARzero(w_F, w_R, UpperB)
            FLowerB = BARzero(w_F, w_R, LowerB)
            nfunc += 2

    # Iterate to convergence or until maximum number of iterations has been exceeded.

    for iteration in range(maximum_iterations):

        DeltaF_old = DeltaF

        if method == 'false-position':
            # Predict the new value
            if (LowerB == 0.0) and (UpperB == 0.0):
                DeltaF = 0.0
                FNew = 0.0
            else:
                DeltaF = UpperB - FUpperB * (UpperB - LowerB) / (FUpperB - FLowerB)
                FNew = BARzero(w_F, w_R, DeltaF)
            nfunc += 1

            if FNew == 0:
                # Convergence is achieved.
                if verbose:
                    print('Convergence achieved.')
                relative_change = 10 ** (-15)
                break

        if method == 'bisection':
            # Predict the new value
            DeltaF = (UpperB + LowerB) / 2
            FNew = BARzero(w_F, w_R, DeltaF)
            nfunc += 1

        if method == 'self-consistent-iteration':
            DeltaF = -BARzero(w_F, w_R, DeltaF) + DeltaF
            nfunc += 1

        # Check for convergence.
        if (DeltaF == 0.0):
            # The free energy difference appears to be zero -- return.
            if verbose:
                print('The free energy difference appears to be zero.')
            break

        if iterated_solution:
            relative_change = abs((DeltaF - DeltaF_old) / DeltaF)
            if verbose:
                print("relative_change = {:12.3f}".format(relative_change))

            if ((iteration > 0) and (relative_change < relative_tolerance)):
                # Convergence is achieved.
                if verbose:
                    print("Convergence achieved.")
                break

        if method == 'false-position' or method == 'bisection':
            if FUpperB * FNew < 0:
                # these two now bracket the root
                LowerB = DeltaF
                FLowerB = FNew
            elif FLowerB * FNew <= 0:
                # these two now bracket the root
                UpperB = DeltaF
                FUpperB = FNew
            else:
                message = 'WARNING: Cannot determine bound on free energy'
                raise BoundsError(message)

        if verbose:
            print("iteration {:5d}: DeltaF = {:16.3f}".format(iteration, DeltaF))

    # Report convergence, or warn user if not achieved.
    if iterated_solution:
        if iteration < maximum_iterations:
            if verbose:
                print('Converged to tolerance of {:e} in {:d} iterations ({:d} function evaluations)'.format(relative_change, iteration, nfunc))
        else:
            message = 'WARNING: Did not converge to within specified tolerance. max_delta = {:f}, TOLERANCE = {:f}, MAX_ITS = {:d}'.format(relative_change, relative_tolerance, maximum_iterations)
            raise ConvergenceError(message)

    if compute_uncertainty:

        '''

        Compute the asymptotic variance estimate using Eq. 10a of Bennett,
        1976 (except with n_1<f>_1^2 in the second denominator; that is
        an error in the original). NOTE: The 'BAR' and 'MBAR' estimators
        do not agree for poor overlap. This is not because of
        numerical precision, but because they are fundamentally
        different estimators. For poor overlap, 'MBAR' diverges high,
        and 'BAR' diverges by being too low. In situations where they are
        noticeably different from each other, they are also pretty different
        from the true answer (obtained by calculating the standard
        deviation over lots of realizations).

        First, we examine the 'BAR' equation. Rederive from Bennett, substituting (8) into (7)

        (8)    -> W = [q0/n0 exp(-U1) + q1/n1 exp(-U0)]^-1
                    <(W exp(-U1))^2 >_0         <(W exp(-U0))^2 >_1
        (7)    -> -----------------------  +   -----------------------   - 1/n0 - 1/n1
                   n_0 [<(W exp(-U1)>_0]^2      n_1 [<(W exp(-U0)>_1]^2

            Const cancels out of top and bottom.   Wexp(-U0) = [q0/n0 exp(-(U1-U0)) + q1/n1]^-1
                                                             =  n1/q1 [n1/n0 q0/q1 exp(-(U1-U0)) + 1]^-1
                                                             =  n1/q1 [exp (M+(F1-F0)-(U1-U0)+1)^-1]
                                                             =  n1/q1 f(x)
                                                   Wexp(-U1) = [q0/n0 + q1/n1 exp(-(U0-U1))]^-1
                                                             =  n0/q0 [1 + n0/n1 q1/q0 exp(-(U0-U1))]^-1
                                                             =  n0/q0 [1 + exp(-M+[F0-F1)-(U0-U1))]^-1
                                                             =  n0/q0 f(-x)


                  <(W exp(-U1))^2 >_0          <(W exp(-U0))^2 >_1
         (7) -> -----------------------   +  -----------------------   - 1/n0 - 1/n1
                n_0 [<(W exp(-U1)>_0]^2      n_1 [<(W exp(-U0)>_1]^2

                   <[n0/q0 f(-x)]^2>_0        <[n1/q1 f(x)]^2>_1
                -----------------------  +  ------------------------   -1/n0 -1/n1
                  n_0 <n0/q0 f(-x)>_0^2      n_1 <n1/q1 f(x)>_1^2

               1      <[f(-x)]^2>_0                 1        <[f(x)]^2>_1
               -  [-----------------------  - 1]  + -  [------------------------  - 1]
               n0      <f(-x)>_0^2                  n1      n_1<f(x)>_1^2

        where f = the fermi function, 1/(1+exp(-x))

        This formula, the 'BAR' equation, works for free
        energies (F0-F1) that don't satisfy the BAR equation.  The
        'MBAR' equation, detailed below, only works for free energies
        that satisfy the equation.


        Now, let's look at the MBAR version of the uncertainty.  This
        is written (from Shirts and Chodera, JPC, 129, 124105, Equation E9) as

              [ n0<f(x)f(-x)>_0 + n1<f(x)f(-x)>_1 ]^-1 - n0^-1 - n1^-1

              we note that f(-x) + f(x) = 1, and change this to:

              [ n0<(1-f(-x))f(-x)>_0 + n1<f(x)(1-f(x))>_1 ]^-1 - n0^-1 - n1^-1

              [ n0<f(-x)-f(-x)^2>_0 + n1<f(x)-f(x)^2>_1 ]^-1 - n0^-1 - n1^-1

                                                1                                         1     1
              --------------------------------------------------------------------    -  --- - ---
                 n0 <f(-x)>_0 - n0 <[f(-x)]^2>_0 + n1 <f(x)>_1 - n1 <[f(x)]^2>_1          n0    n1


        Removing the factor of - (T_F + T_R)/(T_F*T_R)) from both, we compare:

                  <[f(-x)]^2>_0          <[f(x)]^2>_1
              [------------------]  + [---------------]
                 n0 <f(-x)>_0^2          n1 <f(x)>_1^2

                                                1
              --------------------------------------------------------------------
                 n0 <f(-x)>_0 - n0 <[f(-x)]^2>_0 + n1 <f(x)>_1 - n1 <[f(x)]^2>_1

        denote: <f(-x)>_0 = afF
                <f(-x)^2>_0 = afF2
                <f(x)>_1 = afR
                <f(x)^2>_1 = afR2

        Then we can look at both of these as:

        variance_BAR = (afF2/afF**2)/T_F + (afR2/afR**2)/T_R
        variance_MBAR = 1/(afF*T_F - afF2*T_F + afR*T_R - afR2*T_R)

        Rearranging:

        variance_BAR = (afF2/afF**2)/T_F + (afR2/afR**2)/T_R
        variance_MBAR = 1/(afF*T_F + afR*T_R - (afF2*T_F +  afR2*T_R))

        # check the steps below?  Not quite sure.
        variance_BAR = (afF2/afF**2) + (afR2/afR**2)  = (afF2 + afR2)/afR**2
        variance_MBAR = 1/(afF + afR - (afF2 +  afR2)) = 1/(2*afR-(afF2+afR2))

        Definitely not the same.  Now, the reason that they both work
        for high overlap is still not clear. We will determine the
        difference at some point.

        see https://github.com/choderalab/pymbar/issues/281 for more information.

        Now implement the two computations.
        '''

        # Determine number of forward and reverse work values provided.
        T_F = float(w_F.size)  # number of forward work values
        T_R = float(w_R.size)  # number of reverse work values

        # Compute log ratio of forward and reverse counts.
        M = np.log(T_F / T_R)

        if iterated_solution:
            C = M - DeltaF
        else:
            C = M - DeltaF_initial

        # In theory, overflow handling should not be needed now, because we use numlogexp or a custom routine?

        # fF = 1 / (1 + np.exp(w_F + C)), but we need to handle overflows
        exp_arg_F = (w_F + C)
        max_arg_F  = np.max(exp_arg_F)
        log_fF = - np.log(np.exp(-max_arg_F) + np.exp(exp_arg_F - max_arg_F))
        afF  = np.exp(logsumexp(log_fF)-max_arg_F)/T_F

        # fR = 1 / (1 + np.exp(w_R - C)), but we need to handle overflows
        exp_arg_R = (w_R - C)
        max_arg_R  = np.max(exp_arg_R)
        log_fR = - np.log(np.exp(-max_arg_R) + np.exp(exp_arg_R - max_arg_R))
        afR = np.exp(logsumexp(log_fR)-max_arg_R)/T_R

        afF2 = np.exp(logsumexp(2*log_fF)-2*max_arg_F)/T_F
        afR2 = np.exp(logsumexp(2*log_fR)-2*max_arg_R)/T_R

        nrat = (T_F + T_R)/(T_F * T_R)   # same for both methods

        if uncertainty_method == 'BAR':
            variance = (afF2/afF**2)/T_F + (afR2/afR**2)/T_R - nrat
            dDeltaF = np.sqrt(variance)
        elif uncertainty_method == 'MBAR':
            # OR equivalently
            vartemp = ((afF - afF2)*T_F + (afR - afR2)*T_R)
            dDeltaF = np.sqrt(1.0/vartemp - nrat)
        else:
            message = 'ERROR: BAR uncertainty method {:s} is not defined'.format(uncertainty_method)
            raise ParameterError(message)

        if verbose:
            print("DeltaF = {:8.3f} +- {:8.3f}".format(DeltaF, dDeltaF))
        result_vals['Delta_f'] = DeltaF
        result_vals['dDelta_f'] = dDeltaF
        if return_dict:
            return result_vals
        return DeltaF, dDeltaF
    else:
        if verbose:
            print("DeltaF = {:8.3f}".format(DeltaF))
        result_vals['Delta_f'] = DeltaF
        if return_dict:
            return result_vals
        return DeltaF
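
# Illustrative comparison (a minimal sketch, not part of pymbar): the 'BAR' and
# 'MBAR' uncertainty estimators discussed in the uncertainty_method notes above
# can be compared on the same work values. Assumes the test system used in the
# docstring example is available.
from pymbar import testsystems

w_F_example, w_R_example = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
df_bar, ddf_bar = BAR(w_F_example, w_R_example, uncertainty_method='BAR')
df_mbar, ddf_mbar = BAR(w_F_example, w_R_example, uncertainty_method='MBAR')
# For well-overlapping forward and reverse work distributions the two error
# estimates are close; they diverge when the overlap is poor.
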
Example #6
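# Note: this script fragment starts mid-file. It assumes that ``numpy``,
# ``ParameterError``, ``GetAnalytical`` and the quantities ``beta``, ``K_k``,
# ``O_k`` and ``N_k`` have been imported or defined earlier in the original
# script; those definitions are not part of this excerpt.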
O_extra = numpy.array([ 0.5, 1.5, 2.5, 3.5, 4.5])
observables = ['position','position^2','potential energy','RMS displacement']

seed = None
# Uncomment the following line to seed the random number generated to produce reproducible output.
seed = 0
numpy.random.seed(seed)

#=============================================================================================
# MAIN
#=============================================================================================

# Determine number of simulations.
K = numpy.size(N_k)
if numpy.shape(K_k) != numpy.shape(N_k):
  raise ParameterError("K_k %s and N_k %s must have same dimensions." % (numpy.shape(K_k), numpy.shape(N_k)))
if numpy.shape(O_k) != numpy.shape(N_k):
  raise ParameterError("O_k %s and N_k %s must have same dimensions." % (numpy.shape(O_k), numpy.shape(N_k)))

# Determine maximum number of samples to be drawn for any state.
N_max = numpy.max(N_k)

(f_k_analytical, Delta_f_ij_analytical, A_k_analytical, A_ij_analytical) = GetAnalytical(beta,K_k,O_k,observables)

print("This script will draw samples from %d harmonic oscillators." % (K))
print("The harmonic oscillators have equilibrium positions")
print(O_k)
print("and spring constants")
print(K_k)
print("and the following number of samples will be drawn from each (can be zero if no samples drawn):")
print(N_k)