def calculate_statistical_inefficiency(time_series, minimum_samples=3):
    """Calculates the statistical inefficiency of a time series.

    Notes
    -----
    The statistical inefficiency g is related to the autocorrelation time tau
    by g = 1 + 2*tau.

    This method is based on the paper by J. D. Chodera [1], and the
    implementation at https://github.com/choderalab/pymbar - extending the
    code to support multidimensional data.

    References
    ----------
    [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill.
        Use of the weighted histogram analysis method for the analysis of
        simulated and parallel tempering simulations. JCTC 3(1):26-41, 2007.

    Parameters
    ----------
    time_series: np.ndarray, shape=(num_frames, num_dimensions), dtype=float
        The time series to calculate the statistical inefficiency of.
    minimum_samples: int
        The minimum number of data points to consider in the calculation.

    Returns
    -------
    float:
        The statistical inefficiency.
    """

    # Make sure the time series has a consistent shape of (N_frames, dimension).
    dimension = 1 if len(time_series.shape) == 1 else time_series.shape[1]
    standardised_time_series = time_series.reshape((len(time_series), dimension))

    number_of_timesteps = standardised_time_series.shape[0]

    time_series_mean = standardised_time_series.mean(axis=0)
    time_series_shifted = standardised_time_series - time_series_mean

    sigma_squared = np.mean(
        np.sum(time_series_shifted * time_series_shifted, axis=1), axis=0
    )

    if sigma_squared == 0:
        raise ParameterError(
            'Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency'
        )

    current_timestep = 1
    statistical_inefficiency = 1.0

    while current_timestep < number_of_timesteps - 1:

        autocorrelation_function = np.sum(
            np.sum(
                time_series_shifted[0:(number_of_timesteps - current_timestep)] *
                time_series_shifted[current_timestep:number_of_timesteps],
                axis=1
            ),
            axis=0
        ) / (float(number_of_timesteps - current_timestep) * sigma_squared)

        if autocorrelation_function <= 0.0 and current_timestep > minimum_samples:
            break

        statistical_inefficiency += (
            2.0 * autocorrelation_function *
            (1.0 - float(current_timestep) / float(number_of_timesteps))
        )

        current_timestep += 1

    # Enforce a minimum statistical inefficiency of 1.0 (i.e. a minimum
    # autocorrelation time of 0).
    if statistical_inefficiency < 1.0:
        statistical_inefficiency = 1.0

    return statistical_inefficiency
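# A brief usage sketch (illustrative, not part of the original source): estimate
# g for a two-dimensional time series.  The AR(1) process below is purely an
# example of correlated data; any (num_frames, num_dimensions) array works.  It
# assumes numpy has been imported as np at module level, as the function above does.

def _example_calculate_statistical_inefficiency():
    rng = np.random.RandomState(0)

    # Build a correlated series x_t = 0.9 * x_{t-1} + noise in two dimensions.
    time_series = np.zeros((10000, 2))
    for frame in range(1, len(time_series)):
        time_series[frame] = 0.9 * time_series[frame - 1] + rng.normal(size=2)

    g = calculate_statistical_inefficiency(time_series, minimum_samples=3)

    # The number of effectively uncorrelated samples is N / g.
    return g, len(time_series) / g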
Deltaf_ij_analytical = numpy.zeros([K, K], dtype=numpy.float64)
for i in range(0, K):
    for j in range(0, K):
        Deltaf_ij_analytical[i, j] = f_k_analytical[j] - f_k_analytical[i]

# Compute ensemble averages analytically
if observe == 'RMS displacement':
    A_k_analytical = sigma_k  # root mean squared displacement
elif observe == 'potential energy':
    A_k_analytical = 1/(2*beta)*numpy.ones([K], float)  # by equipartition
elif observe == 'position':
    A_k_analytical = O_k  # observable is the position
elif observe == 'position^2':
    A_k_analytical = (1 + beta*K_k*O_k**2)/(beta*K_k)  # observable is the position^2
else:
    raise ParameterError("Observable %s not known." % observe)

# DEBUG info
print("This script will perform %d replicates of an experiment where samples are drawn from %d harmonic oscillators." % (nreplicates, K))
print("The harmonic oscillators have equilibrium positions")
print(O_k)
print("and spring constants")
print(K_k)
print("and the following number of samples will be drawn from each (can be zero if no samples drawn):")
print(N_k)
print("")

# Conduct a number of replicates of the same experiment
replicates_observable = []  # storage for one hash for each replicate
replicates_standobservable = []  # storage for one hash for each replicate
replicates_df = []  # storage for one hash for each replicate
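# A minimal sketch (not part of the original script) of where the analytical
# reference free energies come from.  For a harmonic oscillator with potential
# U_k(x) = (K_k/2) * (x - O_k)**2, the configurational partition function is
# Z_k = sqrt(2*pi/(beta*K_k)), so the dimensionless free energy is
# f_k = -ln(Z_k).  The function name below is hypothetical; the script itself
# obtains these values from its own analytical routines.

def harmonic_f_k_analytical(beta, K_k):
    # Standard deviation of the position distribution in state k.
    sigma_k = numpy.sqrt(1.0 / (beta * K_k))
    # f_k = -ln(Z_k) = -ln(sqrt(2*pi) * sigma_k)
    return -numpy.log(numpy.sqrt(2.0 * numpy.pi) * sigma_k)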
def statisticalInefficiency(A_n, B_n=None, fast=False, mintime=3):
    """
    Compute the (cross) statistical inefficiency of (two) timeseries.

    REQUIRED ARGUMENTS
      A_n (numpy array) - A_n[n] is nth value of timeseries A.  Length is deduced from vector.

    OPTIONAL ARGUMENTS
      B_n (numpy array) - B_n[n] is nth value of timeseries B.  Length is deduced from vector.
        If supplied, the cross-correlation of timeseries A and B will be estimated instead of the
        autocorrelation of timeseries A.
      fast (boolean) - if True, will use faster (but less accurate) method to estimate correlation
        time, described in Ref. [1] (default: False)
      mintime (int) - minimum amount of correlation function to compute (default: 3)
        The algorithm terminates after computing the correlation time out to mintime when the
        correlation function first goes negative.  Note that this time may need to be increased
        if there is a strong initial negative peak in the correlation function.

    RETURNS
      g is the estimated statistical inefficiency (equal to 1 + 2 tau, where tau is the
      correlation time).  We enforce g >= 1.0.

    NOTES
      The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical
      inefficiency.  If fast=True, the fast method described in Ref [1] is used to compute g.

    REFERENCES
      [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted
      histogram analysis method for the analysis of simulated and parallel tempering simulations.
      JCTC 3(1):26-41, 2007.

    EXAMPLES

    Compute statistical inefficiency of timeseries data with known correlation time.

    >>> from pymbar.testsystems import correlated_timeseries_example
    >>> A_n = correlated_timeseries_example(N=100000, tau=5.0)
    >>> g = statisticalInefficiency(A_n, fast=True)

    """

    # Create numpy copies of input arguments.
    A_n = numpy.array(A_n)
    if B_n is not None:
        B_n = numpy.array(B_n)
    else:
        B_n = numpy.array(A_n)

    # Get the length of the timeseries.
    N = A_n.size

    # Be sure A_n and B_n have the same dimensions.
    if A_n.shape != B_n.shape:
        raise ParameterError('A_n and B_n must have same dimensions.')

    # Initialize statistical inefficiency estimate with uncorrelated value.
    g = 1.0

    # Compute mean of each timeseries.
    mu_A = A_n.mean()
    mu_B = B_n.mean()

    # Make temporary copies of fluctuation from mean.
    dA_n = A_n.astype(numpy.float64) - mu_A
    dB_n = B_n.astype(numpy.float64) - mu_B

    # Compute estimator of covariance of (A,B) using estimator that will ensure C(0) = 1.
    sigma2_AB = (dA_n * dB_n).mean()  # standard estimator to ensure C(0) = 1

    # Trap the case where this covariance is zero, and we cannot proceed.
    if sigma2_AB == 0:
        raise ParameterError('Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency')

    # Accumulate the integrated correlation time by computing the normalized correlation time at
    # increasing values of t.  Stop accumulating if the correlation function goes negative, since
    # this is unlikely to occur unless the correlation function has decayed to the point where it
    # is dominated by noise and indistinguishable from zero.
    t = 1
    increment = 1
    while t < N - 1:

        # compute normalized fluctuation correlation function at time t
        C = numpy.sum(dA_n[0:(N-t)]*dB_n[t:N] + dB_n[0:(N-t)]*dA_n[t:N]) / (2.0 * float(N-t) * sigma2_AB)

        # Terminate if the correlation function has crossed zero and we've computed the correlation
        # function at least out to 'mintime'.
        if (C <= 0.0) and (t > mintime):
            break

        # Accumulate contribution to the statistical inefficiency.
        g += 2.0 * C * (1.0 - float(t)/float(N)) * float(increment)

        # Increment t and the amount by which we increment t.
        t += increment

        # Increase the interval if "fast mode" is on.
        if fast:
            increment += 1

    # g must be at least unity.
    if g < 1.0:
        g = 1.0

    # Return the computed statistical inefficiency.
    return g
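# A short usage sketch (illustrative, not part of the original module): compare
# the exact and fast estimates of g on synthetic data with a known correlation
# time.  With tau = 5.0 the expected statistical inefficiency is roughly
# g = 1 + 2*tau = 11, and N/g gives the effective number of uncorrelated samples.

def _demo_statisticalInefficiency():
    from pymbar.testsystems import correlated_timeseries_example

    A_n = correlated_timeseries_example(N=100000, tau=5.0)
    g_exact = statisticalInefficiency(A_n)
    g_fast = statisticalInefficiency(A_n, fast=True)
    print('g (exact) = %.2f, g (fast) = %.2f' % (g_exact, g_fast))
    print('effective samples: %.0f of %d' % (A_n.size / g_exact, A_n.size))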
def normalizedFluctuationCorrelationFunctionMultiple(A_kn, B_kn=None, N_max=None):
    """
    Compute the normalized fluctuation (cross) correlation function of (two) timeseries
    from multiple timeseries samples.

    C(t) = (<A(t) B(t)> - <A><B>) / (<AB> - <A><B>)

    This may be useful in diagnosing odd time-correlations in timeseries data.

    REQUIRED ARGUMENTS
      A_kn (Python list of numpy arrays) - A_kn[k] is the kth timeseries, and A_kn[k][n] is nth value of timeseries k.  Length is deduced from arrays.
      B_kn (Python list of numpy arrays) - B_kn[k] is the kth timeseries, and B_kn[k][n] is nth value of timeseries k.  B_kn[k] must have same length as A_kn[k]

    OPTIONAL ARGUMENTS
      N_max - if specified, will only compute correlation function out to time lag of N_max

    RETURNS
      C_n[n] is the normalized fluctuation auto- or cross-correlation function for timeseries A(t) and B(t).

    NOTES
      The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency.
      This procedure may be slow.
      The statistical error in C_n[n] will grow with increasing n.  No effort is made here to estimate the uncertainty.

    REFERENCES
      [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted
      histogram analysis method for the analysis of simulated and parallel tempering simulations.
      JCTC 3(1):26-41, 2007.

    EXAMPLES

    Estimate a portion of the normalized fluctuation autocorrelation function from multiple
    timeseries of different length.

    >>> from pymbar import testsystems
    >>> N_k = [1000, 2000, 3000, 4000, 5000]
    >>> tau = 5.0 # exponential relaxation time
    >>> A_kn = [ testsystems.correlated_timeseries_example(N=N, tau=tau) for N in N_k ]
    >>> C_n = normalizedFluctuationCorrelationFunctionMultiple(A_kn, N_max=25)

    """

    # If B_kn is not specified, define it to be identical with A_kn.
    if B_kn is None:
        B_kn = A_kn

    # TODO: Change this to support other iterable types, like sets.
    # Make sure A_kn and B_kn are both lists.
    if (type(A_kn) is not list) or (type(B_kn) is not list):
        raise ParameterError("A_kn and B_kn must each be a list of numpy arrays.")

    # Ensure the same number of timeseries are stored in A_kn and B_kn.
    if len(A_kn) != len(B_kn):
        raise ParameterError("A_kn and B_kn must contain corresponding timeseries -- different numbers of timeseries detected in each.")

    # Determine number of timeseries stored.
    K = len(A_kn)

    # Ensure both observable trajectories in each timeseries are of the same length.
    for k in range(K):
        A_n = A_kn[k]
        B_n = B_kn[k]
        if A_n.size != B_n.size:
            raise ParameterError("A_kn and B_kn must contain corresponding timeseries -- lack of correspondence in timeseries lengths detected.")

    # Get the length of each timeseries.
    N_k = numpy.zeros([K], numpy.int32)
    for k in range(K):
        N_k[k] = A_kn[k].size

    # Determine total number of samples.
    N = numpy.sum(N_k)

    # Set maximum time to compute correlation function for.
    if (not N_max) or (N_max > max(N_k) - 1):
        N_max = max(N_k) - 1

    # Compute means.
    mu_A = 0.0
    mu_B = 0.0
    for k in range(K):
        mu_A += numpy.sum(A_kn[k])
        mu_B += numpy.sum(B_kn[k])
    mu_A /= float(N)
    mu_B /= float(N)

    # Compute fluctuation timeseries.
    dA_kn = list()
    dB_kn = list()
    for k in range(K):
        dA_n = A_kn[k] - mu_A
        dB_n = B_kn[k] - mu_B
        dA_kn.append(dA_n)
        dB_kn.append(dB_n)

    # Compute covariance.
    sigma2_AB = 0.0
    for k in range(K):
        sigma2_AB += numpy.sum(dA_kn[k] * dB_kn[k])
    sigma2_AB /= float(N)

    # Allocate storage for normalized fluctuation correlation function.
    C_n = numpy.zeros([N_max+1], numpy.float64)

    # Compute the normalized fluctuation correlation function at increasing lag times t,
    # averaging over all timeseries that are long enough to contribute at each lag.
    for t in range(0, N_max+1):
        # compute unnormalized correlation function
        numerator = 0.0
        denominator = 0.0
        for k in range(K):
            if t >= N_k[k]:
                continue  # skip this trajectory if t is longer than the timeseries
            numerator += numpy.sum(dA_kn[k][0:(N_k[k]-t)] * dB_kn[k][t:N_k[k]])
            denominator += float(N_k[k]-t)
        C = numerator / denominator

        # compute normalized fluctuation correlation function at time t
        C /= sigma2_AB

        # Store correlation function.
        C_n[t] = C

    # Return the computed fluctuation correlation function.
    return C_n
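# A small illustrative sketch (not part of the original module): once C_n has
# been estimated, the statistical inefficiency can be approximated by summing
# the correlation function until it first crosses zero, since g = 1 + 2*tau
# with tau = sum_t C(t).  This omits the (1 - t/N) finite-length weighting
# used by statisticalInefficiency above, so it is only an approximation.

def _g_from_correlation_function(C_n):
    tau = 0.0
    for t in range(1, len(C_n)):
        if C_n[t] <= 0.0:
            break  # stop once the estimate is dominated by noise
        tau += C_n[t]
    # Enforce g >= 1, matching the convention used elsewhere in this module.
    return max(1.0, 1.0 + 2.0 * tau)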
def normalizedFluctuationCorrelationFunction(A_n, B_n=None, N_max=None):
    """
    Compute the normalized fluctuation (cross) correlation function of (two) stationary timeseries.

    C(t) = (<A(t) B(t)> - <A><B>) / (<AB> - <A><B>)

    This may be useful in diagnosing odd time-correlations in timeseries data.

    REQUIRED ARGUMENTS
      A_n[n] is nth value of timeseries A.  Length is deduced from vector.
      B_n[n] is nth value of timeseries B.  Length is deduced from vector.

    OPTIONAL ARGUMENTS
      N_max - if specified, will only compute correlation function out to time lag of N_max

    RETURNS
      C_n[n] is the normalized fluctuation auto- or cross-correlation function for timeseries A(t) and B(t).

    NOTES
      The same timeseries can be used for both A_n and B_n to get the autocorrelation statistical inefficiency.
      This procedure may be slow.
      The statistical error in C_n[n] will grow with increasing n.  No effort is made here to estimate the uncertainty.

    REFERENCES
      [1] J. D. Chodera, W. C. Swope, J. W. Pitera, C. Seok, and K. A. Dill. Use of the weighted
      histogram analysis method for the analysis of simulated and parallel tempering simulations.
      JCTC 3(1):26-41, 2007.

    EXAMPLES

    Estimate normalized fluctuation correlation function.

    >>> from pymbar import testsystems
    >>> A_t = testsystems.correlated_timeseries_example(N=10000, tau=5.0)
    >>> C_t = normalizedFluctuationCorrelationFunction(A_t, N_max=25)

    """

    # If B_n is not specified, set it to be identical to A_n.
    if B_n is None:
        B_n = A_n

    # Create numpy copies of input arguments.
    A_n = numpy.array(A_n)
    B_n = numpy.array(B_n)

    # Get the length of the timeseries.
    N = A_n.size

    # Set maximum time to compute correlation function for.
    if (not N_max) or (N_max > N-1):
        N_max = N-1

    # Be sure A_n and B_n have the same dimensions.
    if A_n.shape != B_n.shape:
        raise ParameterError('A_n and B_n must have same dimensions.')

    # Compute means.
    mu_A = A_n.mean()
    mu_B = B_n.mean()

    # Make temporary copies at high precision with means subtracted off.
    dA_n = A_n.astype(numpy.float64) - mu_A
    dB_n = B_n.astype(numpy.float64) - mu_B

    # sigma2_AB = sum((A_n-mu_A) * (B_n-mu_B)) / (float(N)-1.0) # unbiased estimator
    sigma2_AB = (dA_n * dB_n).mean()  # standard estimator to ensure C(0) = 1
    if sigma2_AB == 0:
        raise ParameterError('Sample covariance sigma_AB^2 = 0 -- cannot compute statistical inefficiency')

    # Allocate storage for normalized fluctuation correlation function.
    C_n = numpy.zeros([N_max+1], numpy.float64)

    # Compute normalized correlation function.
    for t in range(0, N_max+1):
        # compute normalized fluctuation correlation function at time t
        C_n[t] = numpy.sum(dA_n[0:(N-t)]*dB_n[t:N] + dB_n[0:(N-t)]*dA_n[t:N]) / (2.0 * float(N-t) * sigma2_AB)

    # Return the computed correlation function.
    return C_n
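# An illustrative sketch (not from the original source): if the correlation
# function is approximately a single exponential, C(t) ~ exp(-t/tau), the
# relaxation time can be estimated from a log-linear fit over the first few
# lags.  n_fit is a hypothetical parameter, and the fit assumes C_n stays
# positive over the fitted range.

def _estimate_tau_from_C(C_n, n_fit=10):
    t = numpy.arange(1, n_fit)
    # ln C(t) ~ -t/tau, so the slope of a linear fit gives -1/tau.
    slope = numpy.polyfit(t, numpy.log(C_n[1:n_fit]), 1)[0]
    return -1.0 / slope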
def BAR(w_F, w_R, DeltaF=0.0, compute_uncertainty=True, uncertainty_method='BAR',
        maximum_iterations=500, relative_tolerance=1.0e-12, verbose=False,
        method='false-position', iterated_solution=True, return_dict=False):
    """Compute free energy difference using the Bennett acceptance ratio (BAR) method.

    Parameters
    ----------
    w_F : np.ndarray
        w_F[t] is the forward work value from snapshot t.
        t = 0...(T_F-1)  Length T_F is deduced from vector.
    w_R : np.ndarray
        w_R[t] is the reverse work value from snapshot t.
        t = 0...(T_R-1)  Length T_R is deduced from vector.
    DeltaF : float, optional, default=0.0
        DeltaF can be set to initialize the free energy difference with a guess
    compute_uncertainty : bool, optional, default=True
        if False, only the free energy is returned
    uncertainty_method : string, optional, default='BAR'
        There are two possible uncertainty estimates for BAR.  One agrees with MBAR
        for two states exactly; the other only agrees with MBAR in the limit of good
        overlap.  See the notes below.
    maximum_iterations : int, optional, default=500
        can be set to limit the maximum number of iterations performed
    relative_tolerance : float, optional, default=1.0e-12
        can be set to determine the relative tolerance convergence criterion
    verbose : bool, optional, default=False
        should be set to True if verbose debug output is desired
    method : str, optional, default='false-position'
        choice of method to solve the BAR nonlinear equations: one of
        'self-consistent-iteration', 'false-position', or 'bisection'
    iterated_solution : bool, optional, default=True
        whether to fully solve the optimized BAR equation to consistency, or to stop
        after one step, to be equivalent to transition matrix sampling.
    return_dict : bool, optional, default=False
        If True, returns are a dict, else they are a tuple

    Returns
    -------
    'Delta_f' : float
        Free energy difference
        If return_dict, key is 'Delta_f'
    'dDelta_f' : float
        Estimated standard deviation of free energy difference
        If return_dict, key is 'dDelta_f'

    References
    ----------
    [1] Shirts MR, Bair E, Hooker G, and Pande VS. Equilibrium free energies from
    nonequilibrium measurements using maximum-likelihood methods.
    PRL 91(14):140601, 2003.

    Notes
    -----
    The false position method is used to solve the implicit equation.

    Examples
    --------
    Compute free energy difference between two specified samples of work values.

    >>> from pymbar import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> results = BAR(w_F, w_R, return_dict=True)
    >>> print('Free energy difference is {:.3f} +- {:.3f} kT'.format(results['Delta_f'], results['dDelta_f']))
    Free energy difference is 1.088 +- 0.050 kT

    Test completion of various other schemes.
    >>> results = BAR(w_F, w_R, method='self-consistent-iteration', return_dict=True)
    >>> results = BAR(w_F, w_R, method='false-position', return_dict=True)
    >>> results = BAR(w_F, w_R, method='bisection', return_dict=True)

    """

    result_vals = dict()

    # if computing nonoptimized, one step value, we set the max-iterations
    # to 1, and the method to 'self-consistent-iteration'
    if not iterated_solution:
        maximum_iterations = 1
        method = 'self-consistent-iteration'
        DeltaF_initial = DeltaF

    if method == 'self-consistent-iteration':
        nfunc = 0

    if method == 'bisection' or method == 'false-position':
        UpperB = EXP(w_F, return_dict=True)['Delta_f']
        LowerB = -EXP(w_R, return_dict=True)['Delta_f']

        FUpperB = BARzero(w_F, w_R, UpperB)
        FLowerB = BARzero(w_F, w_R, LowerB)
        nfunc = 2

        if np.isnan(FUpperB) or np.isnan(FLowerB):
            # this data set is returning NaN -- will likely not work.  Return 0, print a warning:
            # consider returning more information about failure
            print("Warning: BAR is likely to be inaccurate because of poor overlap. Improve the sampling, or decrease the spacing between states. For now, guessing that the free energy difference is 0 with no uncertainty.")
            if compute_uncertainty:
                result_vals['Delta_f'] = 0.0
                result_vals['dDelta_f'] = 0.0
                if return_dict:
                    return result_vals
                return 0.0, 0.0
            else:
                result_vals['Delta_f'] = 0.0
                if return_dict:
                    return result_vals
                return 0.0

        while FUpperB * FLowerB > 0:
            # if they have the same sign, they do not bracket.  Widen the bracket until they
            # have opposite signs.  There may be a better way to do this, and the above
            # bracket should rarely fail.
            if verbose:
                print('Initial brackets did not actually bracket, widening them')
            FAve = (UpperB + LowerB) / 2
            UpperB = UpperB - max(abs(UpperB - FAve), 0.1)
            LowerB = LowerB + max(abs(LowerB - FAve), 0.1)
            FUpperB = BARzero(w_F, w_R, UpperB)
            FLowerB = BARzero(w_F, w_R, LowerB)
            nfunc += 2

    # Iterate to convergence or until maximum number of iterations has been exceeded.
    for iteration in range(maximum_iterations):

        DeltaF_old = DeltaF

        if method == 'false-position':
            # Predict the new value
            if (LowerB == 0.0) and (UpperB == 0.0):
                DeltaF = 0.0
                FNew = 0.0
            else:
                DeltaF = UpperB - FUpperB * (UpperB - LowerB) / (FUpperB - FLowerB)
                FNew = BARzero(w_F, w_R, DeltaF)
                nfunc += 1

            if FNew == 0:
                # Convergence is achieved.
                if verbose:
                    print('Convergence achieved.')
                relative_change = 10 ** (-15)
                break

        if method == 'bisection':
            # Predict the new value
            DeltaF = (UpperB + LowerB) / 2
            FNew = BARzero(w_F, w_R, DeltaF)
            nfunc += 1

        if method == 'self-consistent-iteration':
            DeltaF = -BARzero(w_F, w_R, DeltaF) + DeltaF
            nfunc += 1

        # Check for convergence.
        if DeltaF == 0.0:
            # The free energy difference appears to be zero -- return.
            if verbose:
                print('The free energy difference appears to be zero.')
            break

        if iterated_solution:
            relative_change = abs((DeltaF - DeltaF_old) / DeltaF)
            if verbose:
                print("relative_change = {:12.3f}".format(relative_change))

            if (iteration > 0) and (relative_change < relative_tolerance):
                # Convergence is achieved.
                if verbose:
                    print("Convergence achieved.")
                break

        if method == 'false-position' or method == 'bisection':
            if FUpperB * FNew < 0:
                # these two now bracket the root
                LowerB = DeltaF
                FLowerB = FNew
            elif FLowerB * FNew <= 0:
                # these two now bracket the root
                UpperB = DeltaF
                FUpperB = FNew
            else:
                message = 'WARNING: Cannot determine bound on free energy'
                raise BoundsError(message)

        if verbose:
            print("iteration {:5d}: DeltaF = {:16.3f}".format(iteration, DeltaF))

    # Report convergence, or warn user if not achieved.
    if iterated_solution:
        if iteration < maximum_iterations:
            if verbose:
                print('Converged to tolerance of {:e} in {:d} iterations ({:d} function evaluations)'.format(relative_change, iteration, nfunc))
        else:
            message = 'WARNING: Did not converge to within specified tolerance. max_delta = {:f}, TOLERANCE = {:f}, MAX_ITS = {:d}'.format(relative_change, relative_tolerance, maximum_iterations)
            raise ConvergenceError(message)

    if compute_uncertainty:
        '''
        Compute asymptotic variance estimate using Eq. 10a of Bennett, 1976 (except with
        n_1<f>_1^2 in the second denominator; it is an error in the original).

        NOTE: The 'BAR' and 'MBAR' estimators do not agree for poor overlap. This is not
        because of numerical precision, but because they are fundamentally different
        estimators. For poor overlap, 'MBAR' diverges high, and 'BAR' diverges by being
        too low. In situations where they are noticeably different from each other, they
        are also pretty different from the true answer (obtained by calculating the
        standard deviation over lots of realizations).

        First, we examine the 'BAR' equation. Rederive from Bennett, substituting (8)
        into (7):

        (8) ->  W = [q0/n0 exp(-U1) + q1/n1 exp(-U0)]^-1

                  <(W exp(-U1))^2>_0         <(W exp(-U0))^2>_1
        (7) ->  ------------------------ + ------------------------ - 1/n0 - 1/n1
                 n_0 [<W exp(-U1)>_0]^2     n_1 [<W exp(-U0)>_1]^2

        The constant cancels out of top and bottom.

        W exp(-U0) = [q0/n0 exp(-(U1-U0)) + q1/n1]^-1
                   = n1/q1 [n1/n0 q0/q1 exp(-(U1-U0)) + 1]^-1
                   = n1/q1 [exp(M+(F1-F0)-(U1-U0)) + 1]^-1
                   = n1/q1 f(x)

        W exp(-U1) = [q0/n0 + q1/n1 exp(-(U0-U1))]^-1
                   = n0/q0 [1 + n0/n1 q1/q0 exp(-(U0-U1))]^-1
                   = n0/q0 [1 + exp(-M+(F0-F1)-(U0-U1))]^-1
                   = n0/q0 f(-x)

        Substituting back into (7):

                  <(W exp(-U1))^2>_0         <(W exp(-U0))^2>_1
                ------------------------ + ------------------------ - 1/n0 - 1/n1
                 n_0 [<W exp(-U1)>_0]^2     n_1 [<W exp(-U0)>_1]^2

                 <[n0/q0 f(-x)]^2>_0       <[n1/q1 f(x)]^2>_1
              ------------------------ + ----------------------- - 1/n0 - 1/n1
               n_0 <n0/q0 f(-x)>_0^2      n_1 <n1/q1 f(x)>_1^2

               1    <[f(-x)]^2>_0          1    <[f(x)]^2>_1
              --- [--------------- - 1] + --- [-------------- - 1]
              n0     <f(-x)>_0^2           n1    <f(x)>_1^2

        where f is the Fermi function, f(x) = 1/(1+exp(-x)).

        This formula, the 'BAR' equation, works for free energies (F0-F1) that don't
        satisfy the BAR equation. The 'MBAR' equation, detailed below, only works for
        free energies that satisfy the equation.

        Now, let's look at the MBAR version of the uncertainty. This is written (from
        Shirts and Chodera, JPC, 129, 124105, Equation E9) as

          [n0 <f(x)f(-x)>_0 + n1 <f(x)f(-x)>_1]^-1 - n0^-1 - n1^-1

        We note that f(-x) + f(x) = 1, and change this to:

          [n0 <(1-f(-x))f(-x)>_0 + n1 <f(x)(1-f(x))>_1]^-1 - n0^-1 - n1^-1

          [n0 <f(-x)-f(-x)^2>_0 + n1 <f(x)-f(x)^2>_1]^-1 - n0^-1 - n1^-1

                                        1                                      1     1
          ---------------------------------------------------------------- - --- - ---
          n0 <f(-x)>_0 - n0 <[f(-x)]^2>_0 + n1 <f(x)>_1 - n1 <[f(x)]^2>_1     n0    n1

        Removing the factor of -(T_F + T_R)/(T_F*T_R) from both, we compare:

           <[f(-x)]^2>_0     <[f(x)]^2>_1
          --------------- + --------------
          n0 <f(-x)>_0^2    n1 <f(x)>_1^2

                                        1
          ----------------------------------------------------------------
          n0 <f(-x)>_0 - n0 <[f(-x)]^2>_0 + n1 <f(x)>_1 - n1 <[f(x)]^2>_1

        Denote:

          <f(-x)>_0   = afF
          <f(-x)^2>_0 = afF2
          <f(x)>_1    = afR
          <f(x)^2>_1  = afR2

        Then we can look at both of these as:

          variance_BAR  = (afF2/afF**2)/T_F + (afR2/afR**2)/T_R
          variance_MBAR = 1/(afF*T_F - afF2*T_F + afR*T_R - afR2*T_R)

        Rearranging:

          variance_BAR  = (afF2/afF**2)/T_F + (afR2/afR**2)/T_R
          variance_MBAR = 1/(afF*T_F + afR*T_R - (afF2*T_F + afR2*T_R))

        # check the steps below?  Not quite sure.
          variance_BAR  = (afF2/afF**2) + (afR2/afR**2) = (afF2 + afR2)/afR**2
          variance_MBAR = 1/(afF + afR - (afF2 + afR2)) = 1/(2*afR - (afF2 + afR2))

        Definitely not the same.  Now, the reason that they both work for high overlap
        is still not clear.  We will determine the difference at some point.

        See https://github.com/choderalab/pymbar/issues/281 for more information.

        Now implement the two computations.
        '''

        # Determine number of forward and reverse work values provided.
        T_F = float(w_F.size)  # number of forward work values
        T_R = float(w_R.size)  # number of reverse work values

        # Compute log ratio of forward and reverse counts.
        M = np.log(T_F / T_R)

        if iterated_solution:
            C = M - DeltaF
        else:
            C = M - DeltaF_initial

        # In theory, overflow handling should not be needed now, because we use
        # logsumexp or a custom routine?

        # fF = 1 / (1 + np.exp(w_F + C)), but we need to handle overflows
        exp_arg_F = (w_F + C)
        max_arg_F = np.max(exp_arg_F)
        log_fF = -np.log(np.exp(-max_arg_F) + np.exp(exp_arg_F - max_arg_F))
        afF = np.exp(logsumexp(log_fF) - max_arg_F) / T_F

        # fR = 1 / (1 + np.exp(w_R - C)), but we need to handle overflows
        exp_arg_R = (w_R - C)
        max_arg_R = np.max(exp_arg_R)
        log_fR = -np.log(np.exp(-max_arg_R) + np.exp(exp_arg_R - max_arg_R))
        afR = np.exp(logsumexp(log_fR) - max_arg_R) / T_R

        afF2 = np.exp(logsumexp(2*log_fF) - 2*max_arg_F) / T_F
        afR2 = np.exp(logsumexp(2*log_fR) - 2*max_arg_R) / T_R

        nrat = (T_F + T_R) / (T_F * T_R)  # same for both methods

        if uncertainty_method == 'BAR':
            variance = (afF2/afF**2)/T_F + (afR2/afR**2)/T_R - nrat
            dDeltaF = np.sqrt(variance)
        elif uncertainty_method == 'MBAR':
            # OR equivalently
            vartemp = ((afF - afF2)*T_F + (afR - afR2)*T_R)
            dDeltaF = np.sqrt(1.0/vartemp - nrat)
        else:
            message = 'ERROR: BAR uncertainty method {:s} is not defined'.format(uncertainty_method)
            raise ParameterError(message)

        if verbose:
            print("DeltaF = {:8.3f} +- {:8.3f}".format(DeltaF, dDeltaF))

        result_vals['Delta_f'] = DeltaF
        result_vals['dDelta_f'] = dDeltaF
        if return_dict:
            return result_vals
        return DeltaF, dDeltaF
    else:
        if verbose:
            print("DeltaF = {:8.3f}".format(DeltaF))
        result_vals['Delta_f'] = DeltaF
        if return_dict:
            return result_vals
        return DeltaF
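# A brief usage sketch (illustrative, not part of the original module): compare
# the two BAR uncertainty estimates on the same synthetic work distributions
# used in the docstring example.  With good overlap they should agree closely;
# with poor overlap they diverge in opposite directions, as discussed above.

def _compare_BAR_uncertainties():
    from pymbar import testsystems

    w_F, w_R = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    df_bar, ddf_bar = BAR(w_F, w_R, uncertainty_method='BAR')
    df_mbar, ddf_mbar = BAR(w_F, w_R, uncertainty_method='MBAR')
    print('BAR  uncertainty: {:.3f} +- {:.3f} kT'.format(df_bar, ddf_bar))
    print('MBAR uncertainty: {:.3f} +- {:.3f} kT'.format(df_mbar, ddf_mbar))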
O_extra = numpy.array([0.5, 1.5, 2.5, 3.5, 4.5])
observables = ['position', 'position^2', 'potential energy', 'RMS displacement']

seed = None
# Uncomment the following line to seed the random number generator to produce reproducible output.
seed = 0
numpy.random.seed(seed)

#=============================================================================================
# MAIN
#=============================================================================================

# Determine number of simulations.
K = numpy.size(N_k)
if numpy.shape(K_k) != numpy.shape(N_k):
    raise ParameterError("K_k (%s) and N_k (%s) must have same dimensions." % (numpy.shape(K_k), numpy.shape(N_k)))
if numpy.shape(O_k) != numpy.shape(N_k):
    raise ParameterError("O_k (%s) and N_k (%s) must have same dimensions." % (numpy.shape(O_k), numpy.shape(N_k)))

# Determine maximum number of samples to be drawn for any state.
N_max = numpy.max(N_k)

(f_k_analytical, Delta_f_ij_analytical, A_k_analytical, A_ij_analytical) = GetAnalytical(beta, K_k, O_k, observables)

print("This script will draw samples from %d harmonic oscillators." % (K))
print("The harmonic oscillators have equilibrium positions")
print(O_k)
print("and spring constants")
print(K_k)
print("and the following number of samples will be drawn from each (can be zero if no samples drawn):")
print(N_k)