def _detect_equilibration(self, A_t):
    """
    Automatically detect equilibrated region.

    Every candidate start index t is tried; the one whose remaining data
    A_t[t:] gives the largest effective sample count is returned.

    ARGUMENTS

    A_t (numpy.array) - timeseries

    RETURNS

    t (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples

    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)

    g_t = numpy.ones([T - 1], numpy.float32)
    Neff_t = numpy.ones([T - 1], numpy.float32)
    # The per-iteration debug prints of T and t were removed: they emitted
    # one line per candidate start index.
    for t in range(T - 1):
        g_t[t] = timeseries.statisticalInefficiency(A_t[t:T])
        # NOTE(review): A_t[t:T] holds T - t samples; the "+ 1" is kept
        # unchanged from the original -- confirm it is intended.
        Neff_t[t] = (T - t + 1) / g_t[t]

    Neff_max = Neff_t.max()
    t = Neff_t.argmax()
    g = g_t[t]
    return (t, g, Neff_max)
def detect_equilibration(A_t):
    """
    Locate the start of the equilibrated part of a timeseries.

    Each possible start index is scored by the effective number of
    uncorrelated samples in the data that follows it; the best one wins.

    ARGUMENTS

    A_t (numpy.array) - timeseries

    RETURNS

    t (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples

    """
    n_frames = A_t.size

    # A constant series has nothing to equilibrate.
    if A_t.std() == 0.0:
        return (0, 1, n_frames)

    n_origins = n_frames - 1
    inefficiency = numpy.ones([n_origins], numpy.float32)
    n_effective = numpy.ones([n_origins], numpy.float32)
    for start in range(n_origins):
        inefficiency[start] = timeseries.statisticalInefficiency(
            A_t[start:n_frames])
        n_effective[start] = (n_frames - start + 1) / inefficiency[start]

    best = n_effective.argmax()
    return (best, inefficiency[best], n_effective.max())
def subsample(Q_n, localQ):
    """Return the columns of localQ at the uncorrelated sample indices of Q_n.

    The statistical inefficiency of Q_n is estimated, the matching
    subsample indices are obtained from timeseries.subsampleCorrelatedData,
    and those indices select columns (second axis) of localQ.
    """
    print('Subsampling the data')
    inefficiency = timeseries.statisticalInefficiency(Q_n)
    keep = numpy.array(timeseries.subsampleCorrelatedData(Q_n, inefficiency))
    print('%i uncorrelated samples found of %i original samples'
          % (len(keep), len(Q_n)))
    return localQ[:, keep]
def EXPgauss(w_F, compute_uncertainty=True, is_timeseries=False):
    """
    Estimate free energy difference using gaussian approximation to one-sided (unidirectional) exponential averaging.

    ARGUMENTS
      w_F (numpy array) - w_F[t] is the forward work value from snapshot t.  t = 0...(T-1)  Length T is deduced from vector.

    OPTIONAL ARGUMENTS
      compute_uncertainty (boolean) - if False, will disable computation of the statistical uncertainty (default: True)
      is_timeseries (boolean) - if True, correlation in data is corrected for by estimation of statistical inefficiency (default: False)
                                Use this option if you are providing correlated timeseries data and have not subsampled the data to produce uncorrelated samples.

    RETURNS
      DeltaF (float) - DeltaF is the free energy difference between the two states.
      dDeltaF (float) - dDeltaF is the uncertainty, and is only returned if compute_uncertainty is set to True

    NOTE

      If you are providing correlated timeseries data, be sure to set the 'is_timeseries' flag to True

    EXAMPLES

    Compute the free energy difference given a sample of forward work values.

    >>> import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> [DeltaF, dDeltaF] = EXPgauss(w_F)
    >>> print 'Forward Gaussian approximated free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF)
    Forward Gaussian approximated free energy difference is 1.049 +- 0.089 kT
    >>> [DeltaF, dDeltaF] = EXPgauss(w_R)
    >>> print 'Reverse Gaussian approximated free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF)
    Reverse Gaussian approximated free energy difference is -1.073 +- 0.080 kT

    """
    n_work = float(np.size(w_F))  # number of work measurements
    work_variance = np.var(w_F)

    # Gaussian approximation: dG = <U> - 0.5*var(U)
    DeltaF = np.average(w_F) - 0.5 * work_variance

    if not compute_uncertainty:
        return DeltaF

    # Effective number of uncorrelated samples.
    if is_timeseries:
        import timeseries
        inefficiency = timeseries.statisticalInefficiency(w_F, w_F)
        n_effective = n_work / inefficiency
    else:
        n_effective = n_work

    # Squared standard error: mean term plus variance term.
    squared_error = (work_variance / n_effective
                     + 0.5 * work_variance * work_variance / (n_effective - 1))
    return (DeltaF, np.sqrt(squared_error))
def EXPgauss(w_F, compute_uncertainty=True, is_timeseries=False):
    """
    Gaussian-approximation estimate of a free energy difference from one-sided (unidirectional) work values.

    ARGUMENTS
      w_F (numpy array) - w_F[t] is the forward work value from snapshot t.  t = 0...(T-1)  Length T is deduced from vector.

    OPTIONAL ARGUMENTS
      compute_uncertainty (boolean) - if False, will disable computation of the statistical uncertainty (default: True)
      is_timeseries (boolean) - if True, correlation in data is corrected for by estimation of statistical inefficiency (default: False)
                                Use this option if you are providing correlated timeseries data and have not subsampled the data to produce uncorrelated samples.

    RETURNS
      DeltaF (float) - DeltaF is the free energy difference between the two states.
      dDeltaF (float) - dDeltaF is the uncertainty, and is only returned if compute_uncertainty is set to True

    NOTE

      If you are providing correlated timeseries data, be sure to set the 'is_timeseries' flag to True

    EXAMPLES

    Compute the free energy difference given a sample of forward work values.

    >>> import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> [DeltaF, dDeltaF] = EXPgauss(w_F)
    >>> print 'Forward Gaussian approximated free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF)
    Forward Gaussian approximated free energy difference is 1.049 +- 0.089 kT
    >>> [DeltaF, dDeltaF] = EXPgauss(w_R)
    >>> print 'Reverse Gaussian approximated free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF)
    Reverse Gaussian approximated free energy difference is -1.073 +- 0.080 kT

    """
    sample_count = float(np.size(w_F))
    sigma2 = np.var(w_F)
    mean_work = np.average(w_F)

    # dG = <U> - 0.5*var(U)
    DeltaF = mean_work - 0.5 * sigma2

    if compute_uncertainty:
        effective_count = sample_count
        if is_timeseries:
            # Shrink the sample count by the statistical inefficiency.
            import timeseries
            effective_count = sample_count / timeseries.statisticalInefficiency(w_F, w_F)

        mean_term = sigma2 / effective_count
        variance_term = 0.5 * sigma2 * sigma2 / (effective_count - 1)
        dDeltaF = np.sqrt(mean_term + variance_term)
        return (DeltaF, dDeltaF)

    return DeltaF
def getNkandUkln(do_dhdl=False):
    """Identifies uncorrelated samples and updates the arrays of the reduced potential energy and dhdlt retaining data entries of these samples only.
       Assumes that 'dhdlt' and 'u_klt' are in memory, as well as proper values for 'sta' and 'fin', i.e. the starting and
       final snapshot positions to be read, both are arrays of dimension K.

       Returns (N_k, u_kln), or (dhdl, N_k, u_kln) when do_dhdl is True.
       All console output is produced only when do_dhdl is True."""
    longest_window = max(fin - sta)
    u_kln = numpy.zeros([K, K, longest_window], numpy.float64)  # u_kln[k,m,n] is the reduced potential energy of uncorrelated sample index n from state k evaluated at state m
    N_k = numpy.zeros(K, int)  # N_k[k] is the number of uncorrelated samples from state k
    g = numpy.zeros(K, float)  # autocorrelation times for the data
    if do_dhdl:
        dhdl = numpy.zeros([K, n_components, longest_window], float)  # dhdl is value for dhdl for each component in the file at each time.
        print("\n\nNumber of correlated and uncorrelated samples:\n\n%6s %12s %12s %12s\n" % ('State', 'N', 'N_k', 'N/N_k'))
    for k in range(K):
        # Sum up over the energy components; notice, that only the relevant data is being used in the third dimension.
        dhdl_sum = numpy.sum(dhdlt[k, :, sta[k]:fin[k]], axis=0)
        # Determine indices of uncorrelated samples from potential autocorrelation analysis at state k
        # (alternatively, could use the energy differences -- here, we will use total dhdl).
        g[k] = timeseries.statisticalInefficiency(dhdl_sum)
        # The subsample indices are relative to the sta[k]:fin[k] window;
        # shift them by sta[k] so they address the un-windowed u_klt/dhdlt.
        indices = sta[k] + numpy.array(timeseries.subsampleCorrelatedData(dhdl_sum, g=g[k]))
        N = len(indices)  # number of uncorrelated samples
        # Handle case where we end up with too few.
        if N < 50:
            if do_dhdl:
                print("WARNING: Only %s uncorrelated samples found at lambda number %s; proceeding with analysis using correlated samples..." % (N, k))
            indices = sta[k] + numpy.arange(len(dhdl_sum))
            N = len(indices)
        N_k[k] = N  # Store the number of uncorrelated samples from state k.
        # Kept as an explicit loop: u_klt[k, :, indices] would return the
        # axes in transposed order.
        for l in range(K):
            u_kln[k, l, 0:N] = u_klt[k, l, indices]
        if do_dhdl:
            print("%6s %12s %12s %12.2f" % (k, fin[k], N_k[k], g[k]))
            for n in range(n_components):
                dhdl[k, n, 0:N] = dhdlt[k, n, indices]
    if do_dhdl:
        return (dhdl, N_k, u_kln)
    return (N_k, u_kln)
def compute_stat_inefficiency2D(pos_xkn, pos_ykn, N_k):
    '''
    Compute statistical inefficiencies of the x and y series of each row.

    For row i only the first N_k[i] entries of pos_xkn[i] and pos_ykn[i]
    are used.

    Returns (ineff, ineffx, ineffy): three length-K float arrays, where
    ineff[i] is the larger of ineffx[i] and ineffy[i].
    '''
    logger.info("computing IACTS")
    K = pos_xkn.shape[0]
    # Builtin float replaces the np.float alias, which was removed in
    # NumPy 1.24 and always meant the same dtype.
    ineffx = np.zeros(K, dtype=float)
    ineffy = np.zeros(K, dtype=float)

    for i in range(K):
        ineffx[i] = timeseries.statisticalInefficiency(pos_xkn[i, 0:N_k[i]])
        ineffy[i] = timeseries.statisticalInefficiency(pos_ykn[i, 0:N_k[i]])

    # Same selection rule as the original if/else: take x only when it is
    # strictly greater than y.
    ineff = np.where(ineffx > ineffy, ineffx, ineffy)

    logger.info("IACTS computed")
    return ineff, ineffx, ineffy
def subsample_series(series, g_t=None, return_g_t=False):
    """Subsample a correlated series down to uncorrelated entries.

    Parameters:
        series: indexable timeseries; it is indexed with the list of
            retained indices, so a numpy array is expected.
        g_t: statistical inefficiency to use. When None it is estimated
            with timeseries.statisticalInefficiency(series).
        return_g_t: when True, g_t is appended to the returned tuple.

    Returns:
        (state_indices, transfer_series) or, with return_g_t,
        (state_indices, transfer_series, g_t).
    """
    if g_t is None:
        g_t = timeseries.statisticalInefficiency(series)
    state_indices = timeseries.subsampleCorrelatedData(
        series, g=g_t, conservative=True)
    # The unused local N_k = len(state_indices) was dropped.
    transfer_series = series[state_indices]
    if return_g_t:
        return state_indices, transfer_series, g_t
    return state_indices, transfer_series
def getefficiency(N_k, U_kn, V_kn, N_kn, type):
    """Return the per-state statistical inefficiency to use for subsampling.

    Inefficiencies are computed for the energy (unless type is 'volume'
    or 'number'), for the volume (when type is in requireV) and for the
    particle number (when type is in requireN). Quantities that are not
    computed stay at 1. The result for each state is the largest of the
    three. Every array that is computed is also printed.
    """
    n_states = len(N_k)

    def _inefficiencies(data_kn):
        # One inefficiency per state, from that state's first N_k samples.
        values = numpy.ones(n_states)
        for state in range(n_states):
            values[state] = timeseries.statisticalInefficiency(
                data_kn[state, 0:N_k[state]], fast=False)
        return values

    def _report(opening, values, closing):
        print(opening)
        for value in values:
            print(" %.3f " % (value))
        print(closing)

    ge = numpy.ones(n_states)
    gv = numpy.ones(n_states)
    gn = numpy.ones(n_states)

    if type not in ('volume', 'number'):
        ge = _inefficiencies(U_kn)
        _report("Calculating [", ge,
                "] as the statistical inefficiencies of the energy")

    if type in requireV:
        gv = _inefficiencies(V_kn)
        _report("Calculating [", gv,
                "] as the statistical inefficiencies of the volume")

    if type in requireN:
        gn = _inefficiencies(N_kn)
        _report("Calculating [", gn,
                "] as the statistical inefficiencies of the particle number")

    g = numpy.ones(n_states)
    for state in range(n_states):
        g[state] = numpy.max([ge[state], gv[state], gn[state]])
    _report("Using [", g, "] as the statistical inefficiencies")
    return g
def compute_stat_inefficiency1D(pos_kn, N_k):
    '''
    Compute the statistical inefficiency of each row of pos_kn.

    For row i only the first N_k[i] entries are used. Returns a length-K
    float array, K being pos_kn.shape[0].
    '''
    logger.info("computing IACTS")
    K = pos_kn.shape[0]
    # Builtin float replaces the np.float alias, which was removed in
    # NumPy 1.24 and always meant the same dtype.
    ineff = np.zeros(K, dtype=float)
    for i in range(K):
        ineff[i] = timeseries.statisticalInefficiency(pos_kn[i, 0:N_k[i]])
        logger.debug("%d %f\n", i, ineff[i])
    logger.info("IACTS computed")
    return ineff
def getefficiency(N_k,U_kn,V_kn,N_kn,type):
    """Pick, per state, the largest statistical inefficiency among the
    energy, volume and particle-number series that `type` calls for.

    Energy is analysed unless type is 'volume' or 'number'; volume when
    type is in requireV; particle number when type is in requireN.
    Series that are not analysed contribute an inefficiency of 1.
    Each computed set, and the final choice, is printed.
    """
    nsets = len(N_k)
    energy_ineff = numpy.ones(nsets)
    volume_ineff = numpy.ones(nsets)
    number_ineff = numpy.ones(nsets)

    skip_energy = (type == 'volume') or (type == 'number')
    if not skip_energy:
        for i in range(nsets):
            energy_ineff[i] = timeseries.statisticalInefficiency(U_kn[i, 0:N_k[i]], fast=False)
        print("Calculating [")
        for value in energy_ineff:
            print(" %.3f " % (value))
        print("] as the statistical inefficiencies of the energy")

    if type in requireV:
        for i in range(nsets):
            volume_ineff[i] = timeseries.statisticalInefficiency(V_kn[i, 0:N_k[i]], fast=False)
        print("Calculating [")
        for value in volume_ineff:
            print(" %.3f " % (value))
        print("] as the statistical inefficiencies of the volume")

    if type in requireN:
        for i in range(nsets):
            number_ineff[i] = timeseries.statisticalInefficiency(N_kn[i, 0:N_k[i]], fast=False)
        print("Calculating [")
        for value in number_ineff:
            print(" %.3f " % (value))
        print("] as the statistical inefficiencies of the particle number")

    # Worst case across the three observables.
    combined = numpy.ones(nsets)
    for i, triple in enumerate(zip(energy_ineff, volume_ineff, number_ineff)):
        combined[i] = numpy.max(triple)

    print("Using [")
    for value in combined:
        print(" %.3f " % (value))
    print("] as the statistical inefficiencies")
    return combined
def compute_stat_inefficiency(observ):
    '''
    Compute the statistical inefficiency of every column of every
    simulation in observ.

    observ is an iterable of 2-D arrays; column j of each array is passed
    to timeseries.statisticalInefficiency. Returns a float array with one
    row per simulation and one entry per column.
    '''
    logger.info("computing IACTS")
    per_simulation = [
        [timeseries.statisticalInefficiency(sim[:, j])
         for j in range(sim.shape[1])]
        for sim in observ
    ]
    # Builtin float replaces the np.float alias, which was removed in
    # NumPy 1.24 and always meant the same dtype.
    ineff = np.array(per_simulation, dtype=float)
    logger.info("IACTS computed")
    return ineff
def subsample(U_kn, Q_kn, K, N_max, assume_uncorrelated=False):
    """Subsample each of the K rows of U_kn and Q_kn to uncorrelated data.

    Parameters:
        U_kn, Q_kn: 2-D arrays with one row per state. They are modified
            in place: the retained samples of row k are packed into the
            first N_k[k] entries; entries beyond that are left untouched.
        K: number of rows to process.
        N_max: sample count reported for every row when
            assume_uncorrelated is True.
        assume_uncorrelated: when True, skip the analysis and report N_max
            samples per row. Defaults to False, the value that used to be
            hard-coded inside the function.

    Returns:
        (U_kn, Q_kn, N_k), N_k being an int32 array of length K.
    """
    N_k = numpy.zeros(K, numpy.int32)
    if assume_uncorrelated:
        print('Assuming data is uncorrelated')
        N_k[:] = N_max
        return U_kn, Q_kn, N_k

    print('Subsampling the data...')
    g = numpy.zeros(K, numpy.float64)
    for k in range(K):
        # Correlation is judged on Q_kn; the same indices are applied to
        # both arrays.
        g[k] = timeseries.statisticalInefficiency(Q_kn[k])
        indices = numpy.array(
            timeseries.subsampleCorrelatedData(Q_kn[k], g=g[k]))
        N_k[k] = len(indices)  # number of uncorrelated samples
        U_kn[k, 0:N_k[k]] = U_kn[k, indices]
        Q_kn[k, 0:N_k[k]] = Q_kn[k, indices]
    return U_kn, Q_kn, N_k
def find_g_t_states(u_kln, states=None, nequil=None):
    """Compute the statistical inefficiency of several states' own energies.

    For each requested state s the series u_kln[s, s, nequil:] is analysed.

    Parameters:
        u_kln: 3-D array indexed as [state, state, sample].
        states: sequence of state indices to analyse. Defaults to every
            index along the first axis of u_kln.
        nequil: per-state number of leading samples to discard, same
            length as states. Defaults to zero for every state.

    Returns:
        (g_t, Neff_max): two arrays of len(states), aligned with states.
        Neff_max[i] is (number of analysed samples + 1) / g_t[i], the same
        convention the equilibration detection in this code base uses.

    Raises:
        ValueError: if nequil is given with a length different from states.
    """
    # Subsample multiple states, this assumes you want to subsample
    # independent of what was fed in.
    if states is None:
        states = numpy.arange(u_kln.shape[0])
    num_sample = len(states)

    if nequil is None:
        nequil = numpy.zeros(num_sample, dtype=numpy.int32)
    elif len(nequil) != num_sample:
        raise ValueError(
            "nequil length needs to be the same as length as states!")

    g_t = numpy.zeros([num_sample])
    Neff_max = numpy.zeros([num_sample])
    # Outputs are indexed by position so an arbitrary subset of states
    # cannot run past the end of the arrays.
    for i, state in enumerate(states):
        series = u_kln[state, state, nequil[i]:]
        g_t[i] = timeseries.statisticalInefficiency(series)
        Neff_max[i] = (series.size + 1) / g_t[i]
    return g_t, Neff_max
def EXP(w_F, compute_uncertainty=True, is_timeseries=False):
    """Estimate free energy difference using one-sided (unidirectional) exponential averaging (EXP).

    Parameters
    ----------
    w_F : np.ndarray, float
        w_F[t] is the forward work value from snapshot t.  t = 0...(T-1)  Length T is deduced from vector.
    compute_uncertainty : bool, optional, default=True
        if False, will disable computation of the statistical uncertainty (default: True)
    is_timeseries : bool, default=False
        if True, correlation in data is corrected for by estimation of statistical inefficiency (default: False)
        Use this option if you are providing correlated timeseries data and have not subsampled the data to produce uncorrelated samples.

    Returns
    -------
    DeltaF : float
        DeltaF is the free energy difference between the two states.
    dDeltaF : float
        dDeltaF is the uncertainty, and is only returned if compute_uncertainty is set to True

    Notes
    -----
    If you are providing correlated timeseries data, be sure to set the 'is_timeseries' flag to True

    Examples
    --------

    Compute the free energy difference given a sample of forward work values.

    >>> from pymbar import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> [DeltaF, dDeltaF] = EXP(w_F)
    >>> print('Forward free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF))
    Forward free energy difference is 1.088 +- 0.076 kT
    >>> [DeltaF, dDeltaF] = EXP(w_R)
    >>> print('Reverse free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF))
    Reverse free energy difference is -1.073 +- 0.082 kT

    """
    # Get number of work measurements.
    T = float(np.size(w_F))  # number of work measurements

    # Estimate free energy difference by exponential averaging using DeltaF = - log < exp(-w_F) >
    DeltaF = - (logsumexp(- w_F) - np.log(T))

    if not compute_uncertainty:
        return DeltaF

    # Compute x_i = np.exp(-w_F_i - max_arg); the shift keeps exp() from
    # overflowing and cancels in the ratio dx / Ex below.
    max_arg = np.max(-w_F)  # maximum argument
    x = np.exp(-w_F - max_arg)

    # Compute E[x] = <x> and dx
    Ex = x.mean()

    # Compute effective number of uncorrelated samples.
    g = 1.0  # statistical inefficiency
    if is_timeseries:
        # Estimate statistical inefficiency of x timeseries.
        # A bare "import timeseries" only resolves the sibling module on
        # Python 2; fall back to the package-qualified import so this
        # branch also works on Python 3.
        try:
            import timeseries
        except ImportError:
            from pymbar import timeseries
        g = timeseries.statisticalInefficiency(x, x)

    # Estimate standard error of E[x].
    dx = np.std(x) / np.sqrt(T / g)

    # dDeltaF = <x>^-1 dx
    dDeltaF = (dx / Ex)

    # Return estimate of free energy difference and uncertainty.
    return (DeltaF, dDeltaF)
# Load the energies of the two highest entries of T, keep the second half
# of each trajectory, and subsample both to uncorrelated samples.
T_k = T[-2::]
files = ['%s/energy%i.npy' % (direc, T[-2]), '%s/energy%i.npy' % (direc, T[-1])]
#file=['/home/edz3fz/checkensemble_high/CE_high.txt','/home/edz3fz/checkensemble_low/CE_low.txt']
#file=[direc+'/energy426.txt',direc+'/energy442.txt']
#file = ['/home/edz3fz/surface_replica_exchange/replica0/energy300.txt', '/home/edz3fz/surface_replica_exchange/replica3/energy356.txt']
down = load(files[0])
up = load(files[1])
length = len(down)
# Floor division keeps the slice bound an integer on Python 2 and 3.
down = down[length // 2::]
up = up[length // 2::]
#up=up[-50000::]
#down=down[-50000::]
#up=up[::100]
#down=down[::100]
g_up = timeseries.statisticalInefficiency(up)
indices_up = numpy.array(timeseries.subsampleCorrelatedData(up, g=g_up))
print('%s samples' % len(indices_up))
g_down = timeseries.statisticalInefficiency(down)
# Subsample `down` itself; the original passed `up` here, so the indices
# were built for the wrong series.
indices_down = numpy.array(timeseries.subsampleCorrelatedData(down, g=g_down))
print('%s samples' % len(indices_down))
type = 'total'
# Wide enough for either series; equals len(up) when both have equal length.
U_kn = zeros([2, max(len(up), len(down))])
U_kn[0, 0:len(indices_down)] = down[indices_down]
U_kn[1, 0:len(indices_up)] = up[indices_up]
#T_k=array([300.,336.8472786])
#T_k=array([426.81933819,442.13650313])
# dtau_end = tau_end_i.std() / numpy.sqrt(float(nblocks)) # Print. print "tau_end = %.3f+-%.3f iterations" % (tau_end, dtau_end) del states # Compute statistical inefficiency for reduced potential energies = ncfile.variables['energies'][:, :, :].copy() states = ncfile.variables['states'][:, :].copy() u_n = numpy.zeros([niterations], numpy.float64) for iteration in range(niterations): u_n[iteration] = 0.0 for replica in range(nstates): state = states[iteration, replica] u_n[iteration] += energies[iteration, replica, state] del energies, states g_u = timeseries.statisticalInefficiency(u_n) tau_u = (g_u - 1.0) / 2.0 print "g_u = %8.1f iterations" % g_u print "tau_u = %8.1f iterations" % tau_u # DEBUG for lactalbumin #continue # Compute torsions. print "Computing torsions..." positions = ncfile.variables['positions'][:, :, :, :] coordinates = units.Quantity( numpy.zeros([natoms, ndim], numpy.float32), units.angstroms) phi_it = units.Quantity( numpy.zeros([nstates, niterations], numpy.float32), units.radians) psi_it = units.Quantity(
)  # N_k[k] is the number of uncorrelated samples from simulation index k
# One zero-filled (K, N_samples) array per expectation column.
reduced_expectation_data = []
if len(expectation_columns) > 0:
    for i in range(len(expectation_columns)):
        reduced_expectation_data.append(
            numpy.zeros([K, N_samples], numpy.float64))
# One zero-filled (K, N_samples) array per FEP column.
reduced_fep_data = []
if len(fep_columns) > 0:
    for i in range(len(fep_columns)):
        reduced_fep_data.append(numpy.zeros([K, N_samples], numpy.float64))
for k in range(K):
    # Extract timeseries.
    A_t = biasing_variable_kt[0][k, :]
    # Compute statistical inefficiency.
    # NOTE(review): when this call raises, the error and the series are
    # printed but g is not assigned; the code below then uses the g of the
    # previous k (or fails with NameError for k == 0). Confirm the intended
    # fallback.
    try:
        g = timeseries.statisticalInefficiency(A_t)
    except Exception as e:
        print str(e)
        print A_t
    # Subsample data.
    if subsample_trajectories:
        indices = timeseries.subsampleCorrelatedData(A_t, g=g)
    else:
        # g=1 is passed when subsampling is disabled.
        indices = timeseries.subsampleCorrelatedData(A_t, g=1)
    N = len(indices)  # number of uncorrelated samples
    print "k = %5d : g = %.1f, N = %d" % (k, g, N)
    # Pack the retained samples into the first N slots of row k.
    for i in range(nbiases):
        biasing_variable_kn[i][k, 0:N] = biasing_variable_kt[i][k, indices]
    for i in range(nperturbations + 1):
        U_kn[i][k, 0:N] = U_kt[i][k, indices]
#!/usr/bin/python
# Print the mean and three standard errors of the data in t1.dat and
# t2.dat, with the standard error corrected for time correlation.
import string
import os
import sys

import numpy as np
import timeseries

dat1 = np.loadtxt("t1.dat", delimiter=None)
dat2 = np.loadtxt("t2.dat", delimiter=None)

g1 = timeseries.statisticalInefficiency(dat1, fast=True)
g2 = timeseries.statisticalInefficiency(dat2, fast=True)

# Effective number of uncorrelated samples in each data set.
nsamp1 = dat1.size / g1
# The original divided dat1.size by g2, mixing the two data sets.
nsamp2 = dat2.size / g2

avg1 = np.mean(dat1)
avg2 = np.mean(dat2)

# Standard error of the mean: sqrt(variance / effective sample count).
sig1 = np.power(np.var(dat1) / nsamp1, 0.5)
sig2 = np.power(np.var(dat2) / nsamp2, 0.5)

print("%s %s" % (avg1, 3. * sig1))
print("%s %s" % (avg2, 3. * sig2))
def _subsample_kln(self, u_kln):
    """Discard equilibration and subsample u_kln to uncorrelated samples.

    First obtains self.nequil and self.g_t: loaded from (or computed and
    saved to) an .npz cache file when self.save_equil_data is set,
    otherwise computed only when self.nequil is None. Then fills
    self.u_kln, self.N_k, self.retained_indices and self.retained_iters,
    per state when self.subsample_method == 'per-state' and from the
    single series self.u_n otherwise. Returns None.
    """
    #Try to load in the data
    if self.save_equil_data:  #Check if we want to save/load equilibration data
        try:
            equil_data = numpy.load(
                os.path.join(
                    self.source_directory, self.save_prefix + self.phase +
                    '_equil_data_%s.npz' % self.subsample_method))
            if self.nequil is None:
                self.nequil = equil_data['nequil']
            elif type(self.nequil
                      ) is int and self.subsample_method == 'per-state':
                print "WARRNING: Per-state subsampling requested with only single value for equilibration..."
                try:
                    self.nequil = equil_data['nequil']
                    print "Loading equilibration from file with %i states read" % self.nstates
                except:
                    # No stored value: repeat the single int for every state.
                    print "Assuming equal equilibration per state of %i" % self.nequil
                    self.nequil = numpy.array([self.nequil] * self.nstates)
            self.g_t = equil_data['g_t']
            Neff_max = equil_data['Neff_max']
            #Do equilibration if we have not already
            if self.subsample_method == 'per-state' and (
                    len(self.g_t) < self.nstates or
                    len(self.nequil) < self.nstates):
                equil_loaded = False
                # Deliberately jump to the except branch below to recompute.
                raise IndexError
            else:
                equil_loaded = True
        except:
            # NOTE(review): this bare except handles the IndexError raised
            # above but also any other failure in the try block (missing
            # file, missing key, ...).
            if self.subsample_method == 'per-state':
                self.nequil = numpy.zeros([self.nstates], dtype=numpy.int32)
                self.g_t = numpy.zeros([self.nstates])
                Neff_max = numpy.zeros([self.nstates])
                for k in xrange(self.nstates):
                    if self.verbose:
                        print "Computing timeseries for state %i/%i" % (
                            k, self.nstates - 1)
                    # No equilibration detection here: nothing is discarded.
                    self.nequil[k] = 0
                    self.g_t[k] = timeseries.statisticalInefficiency(
                        u_kln[k, k, :])
                    Neff_max[k] = (u_kln[k, k, :].size + 1) / self.g_t[k]
                    #[self.nequil[k], self.g_t[k], Neff_max[k]] = self._detect_equilibration(u_kln[k,k,:])
            else:
                if self.nequil is None:
                    [self.nequil, self.g_t,
                     Neff_max] = self._detect_equilibration(self.u_n)
                else:
                    # A preset self.nequil is kept; the detected start goes
                    # to self.nequil_timeseries instead.
                    [self.nequil_timeseries, self.g_t,
                     Neff_max] = self._detect_equilibration(self.u_n)
            equil_loaded = False
        if not equil_loaded:
            # Save the freshly computed values to the cache file.
            numpy.savez(os.path.join(
                self.source_directory, self.save_prefix + self.phase +
                '_equil_data_%s.npz' % self.subsample_method),
                        nequil=self.nequil,
                        g_t=self.g_t,
                        Neff_max=Neff_max)
    elif self.nequil is None:
        if self.subsample_method == 'per-state':
            self.nequil = numpy.zeros([self.nstates], dtype=numpy.int32)
            self.g_t = numpy.zeros([self.nstates])
            Neff_max = numpy.zeros([self.nstates])
            for k in xrange(self.nstates):
                [self.nequil[k], self.g_t[k],
                 Neff_max[k]] = self._detect_equilibration(u_kln[k, k, :])
                if self.verbose:
                    print "State %i equilibrated with %i samples" % (
                        k, int(Neff_max[k]))
        else:
            [self.nequil, self.g_t,
             Neff_max] = self._detect_equilibration(self.u_n)
    # NOTE(review): when save_equil_data is false and self.nequil was preset,
    # neither branch above runs, so Neff_max is unbound here and self.g_t is
    # not assigned by this method -- confirm callers never hit that path.
    if self.verbose:
        print [self.nequil, Neff_max]
    # 1) Discard equilibration data
    # 2) Subsample data to obtain uncorrelated samples
    self.N_k = numpy.zeros(self.nstates, numpy.int32)
    if self.subsample_method == 'per-state':
        # Discard samples
        nsamples_equil = self.niterations - self.nequil
        self.u_kln = numpy.zeros(
            [self.nstates, self.nstates, nsamples_equil.max()])
        for k in xrange(self.nstates):
            self.u_kln[k, :, :nsamples_equil[k]] = u_kln[k, :, self.nequil[k]:]
        #Subsample
        transfer_retained_indices = numpy.zeros(
            [self.nstates, nsamples_equil.max()], dtype=numpy.int32)
        for k in xrange(self.nstates):
            state_indices = timeseries.subsampleCorrelatedData(
                self.u_kln[k, k, :], g=self.g_t[k])
            self.N_k[k] = len(state_indices)
            transfer_retained_indices[k, :self.N_k[k]] = state_indices
        # Arrays trimmed to the largest retained count across states.
        transfer_kln = numpy.zeros(
            [self.nstates, self.nstates, self.N_k.max()])
        self.retained_indices = numpy.zeros(
            [self.nstates, self.N_k.max()], dtype=numpy.int32)
        for k in xrange(self.nstates):
            self.retained_indices[
                k, :self.N_k[k]] = transfer_retained_indices[
                    k, :self.N_k[k]]  #Memory reduction
            transfer_kln[k, :, :self.N_k[k]] = self.u_kln[
                k, :, self.retained_indices[k, :self.N_k[
                    k]]].T  #Have to transpose since indexing in this way causes issues
        #Cut down on memory, once function is done, transfer_kln should be released
        self.u_kln = transfer_kln
        self.retained_iters = self.N_k
    else:
        #Discard Samples
        self.u_kln = u_kln[:, :, self.nequil:]
        self.u_n = self.u_n[self.nequil:]
        #Subsamples
        indices = timeseries.subsampleCorrelatedData(
            self.u_n, g=self.g_t)  # indices of uncorrelated samples
        self.u_kln = self.u_kln[:, :, indices]
        self.N_k[:] = len(indices)
        self.retained_indices = indices
        self.retained_iters = len(indices)
    return
mask_kt[k,0:T_k[k]] = True # Create a list from this mask. all_data_indices = where(mask_kt) # Construct equal-frequency extension bins print "binning data..." bin_kt = zeros([K, T_max], int32) (bin_left_boundary_i, bin_center_i, bin_width_i, bin_assignments) = construct_nonuniform_bins(x_kt[all_data_indices], nbins) bin_kt[all_data_indices] = bin_assignments # Compute correlation times. N_max = 0 g_k = zeros([K], float64) for k in range(K): # Compute statistical inefficiency for extension timeseries g = timeseries.statisticalInefficiency(x_kt[k,0:T_k[k]], x_kt[k,0:T_k[k]]) # store statistical inefficiency g_k[k] = g print "timeseries %d : g = %.1f, %.0f uncorrelated samples (of %d total samples)" % (k+1, g, floor(T_k[k] / g), T_k[k]) N_max = max(N_max, ceil(T_k[k] / g) + 1) # Subsample trajectory position data. x_kn = zeros([K, N_max], float64) bin_kn = zeros([K, N_max], int32) N_k = zeros([K], int32) for k in range(K): # Compute correlation times for potential energy and chi timeseries. indices = timeseries.subsampleCorrelatedData(x_kt[k,0:T_k[k]]) # Store subsampled positions. N_k[k] = len(indices) x_kn[k,0:N_k[k]] = x_kt[k,indices]
#!/usr/bin/env python
#Usage: calc_acf file [colnr [ndiscard] ]
# Prints the statistical inefficiency of one column of a text file.
# colnr is 1-based (default 1); ndiscard is the number of leading
# non-comment lines to skip (default 0).
import sys
import timeseries
from numpy import *

colnr = int(sys.argv[2]) if len(sys.argv) > 2 else 1
ndiscard = int(sys.argv[3]) if len(sys.argv) > 3 else 0

# Read inside a with-block so the file is closed; lines starting with '#'
# are comments.
with open(sys.argv[1]) as infile:
    lines = [line for line in infile if line[0] != '#']

# Blank lines are skipped after the discard so they still count toward
# ndiscard but no longer crash the column lookup.
f = array([float(line.split()[colnr - 1])
           for line in lines[ndiscard:] if line.strip()])
print(timeseries.statisticalInefficiency(f))
# Read the energy file for umbrella k; the with-block closes it even if
# reading fails.
with open(filename, 'r') as infile:
    lines = infile.readlines()
# Parse data.
n = 0
for line in lines:
    # Lines starting with '#' or '@' are header lines.
    if line[0] != '#' and line[0] != '@':
        tokens = line.split()
        u_kn[k,n] = beta_k[k] * (float(tokens[2]) - float(tokens[1])) # reduced potential energy without umbrella restraint
        n += 1
# Compute correlation times for potential energy and chi
# timeseries.  If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
if (DifferentTemperatures):
    # Both arguments must be the same series; the original passed the full
    # row u_kn[k,:] against its first N_k[k] entries, which differ in length.
    energies = u_kn[k,0:N_k[k]]
    g_k[k] = timeseries.statisticalInefficiency(energies, energies)
    print("Correlation time for set %5d is %10.3f" % (k,g_k[k]))
    # Reuse the g just computed instead of estimating it a second time.
    indices = timeseries.subsampleCorrelatedData(energies, g=g_k[k])
else:
    chi_radians = chi_kn[k,0:N_k[k]]/(180.0/numpy.pi)
    g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
    g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
    print("g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin))
    # Use the slower-decorrelating of the two projections.
    g_k[k] = max(g_cos, g_sin)
    print("Correlation time for set %5d is %10.3f" % (k,g_k[k]))
    indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])
# Subsample data.
N_k[k] = len(indices)
u_kn[k,0:N_k[k]] = u_kn[k,indices]
chi_kn[k,0:N_k[k]] = chi_kn[k,indices]
# infile = open(filename, 'r') # lines = infile.readlines() # infile.close() # Parse data. # n = 0 # for line in lines: # if line[0] != '#' and line[0] != '@': # tokens = line.split() # u_kn[k,n] = beta_k[k] * (float(tokens[2]) - float(tokens[1])) # reduced potential energy without umbrella restraint # n += 1 # Compute correlation times for potential energy and val # timeseries. If the temperatures differ, use energies to determine samples; otherwise, use the cosine of val if (DifferentTemperatures): g_k[k] = timeseries.statisticalInefficiency(u_kn[k,:], u_kn[k,:]) print "Correlation time for set %5d is %10.3f" % (k,g_k[k]) indices = timeseries.subsampleCorrelatedData(u_kn[k,:]) else: #g_k[k] = timeseries.statisticalInefficiency(val_kn[k,:], val_kn[k,:]) #print "Correlation time for set %5d is %10.3f" % (k,g_k[k]) indices = timeseries.subsampleCorrelatedData(val_kn[k,0:N_k[k]], fast=True, verbose=True) # Subsample data. N_k[k] = len(indices) u_kn[k,0:N_k[k]] = u_kn[k,indices] val_kn[k,0:N_k[k]] = val_kn[k,indices] # print val_kn[k,0:N_k[k]] # Set zero of u_kn -- this is arbitrary. u_kn -= u_kn.min()
cluster_bin_kn = -1*numpy.ones([K,N_samples], numpy.int32) # cluster_bin_kn[k,n] is the cluster bin index of snapshot n of umbrella simulation k N_k = numpy.zeros([K], numpy.int32) # N_k[k] is the number of uncorrelated samples from simulation index k reduced_expectation_data = [] if len(expectation_columns) > 0: for i in range(len(expectation_columns)): reduced_expectation_data.append(numpy.zeros([K,N_samples], numpy.float64)) reduced_fep_data = [] if len(fep_columns) > 0: for i in range(len(fep_columns)): reduced_fep_data.append(numpy.zeros([K,N_samples], numpy.float64)) for k in range(K): # Extract timeseries. A_t = biasing_variable_kt[0][k,:] # Compute statistical inefficiency. try: g = timeseries.statisticalInefficiency(A_t) except Exception as e: print str(e) print A_t # Subsample data. if subsample_trajectories: indices = timeseries.subsampleCorrelatedData(A_t, g=g) else: indices = timeseries.subsampleCorrelatedData(A_t, g=1) N = len(indices) # number of uncorrelated samples print "k = %5d : g = %.1f, N = %d" % (k, g, N) for i in range(nbiases): biasing_variable_kn[i][k,0:N] = biasing_variable_kt[i][k,indices] for i in range(nperturbations+1): U_kn[i][k,0:N] = U_kt[i][k,indices]
'%s/energy%i.npy' % (direc, T[-1]) ] #file=['/home/edz3fz/checkensemble_high/CE_high.txt','/home/edz3fz/checkensemble_low/CE_low.txt'] #file=[direc+'/energy426.txt',direc+'/energy442.txt'] #file = ['/home/edz3fz/surface_replica_exchange/replica0/energy300.txt', '/home/edz3fz/surface_replica_exchange/replica3/energy356.txt'] down = load(files[0]) up = load(files[1]) length = len(down) down = down[length / 2::] up = up[length / 2::] #up=up[-50000::] #down=down[-50000::] #up=up[::100] #down=down[::100] g_up = timeseries.statisticalInefficiency(up) indices_up = numpy.array(timeseries.subsampleCorrelatedData(up, g=g_up)) print len(indices_up), 'samples' g_down = timeseries.statisticalInefficiency(down) indices_down = numpy.array(timeseries.subsampleCorrelatedData(up, g=g_down)) print len(indices_down), 'samples' type = 'total' U_kn = zeros([2, len(up)]) U_kn[0, 0:len(indices_down)] = down[indices_down] U_kn[1, 0:len(indices_up)] = up[indices_up] #T_k=array([300.,336.8472786]) #T_k=array([426.81933819,442.13650313]) #T_k=array([424.67492585,450]) #T_k=array([437.99897735,450])
# bp holds one row per sample and one column per bin.
n_samples, n_bins = bp.shape
bi = np.arange(n_bins)
pmf = -0.6*np.log(bp)
pmf_mean = pmf.mean(axis=0)
pmf_mean -= np.min(pmf_mean)
pmf_std = pmf.std(axis=0)
# Calculate statistical inefficiency
# The per-bin values are cached in stat_ineff_file; any failure to load the
# cache triggers a recomputation. Catching Exception instead of a bare
# except lets KeyboardInterrupt and SystemExit propagate.
try:
    g = np.load(stat_ineff_file)
except Exception:
    g = np.zeros((n_bins,))
    for k in range(n_bins):
        g[k] = timeseries.statisticalInefficiency(pmf[:,k])
    np.save(stat_ineff_file, g)
# 95% interval of the mean: 1.96 * std / sqrt(n_samples / g). The effective
# sample count is n_samples / g, so the error scales with sqrt(g); the
# original multiplied by g itself.
pmf_err = 1.96*pmf_std*np.sqrt(g/n_samples)
# Shift so the lower edge of the error band touches zero.
offset = np.min(pmf_mean - pmf_err)
pmf_mean -= offset
fig = plt.figure(1, figsize=fsize)
ax = fig.add_subplot(111)
ax.plot(bi, pmf_mean, color='black', lw=2, zorder=1000)
ax.fill_between(bi, pmf_mean + pmf_err, pmf_mean - pmf_err, alpha=.4, facecolor='gray', edgecolor='black', lw=1, zorder=900)
ax.set_xlabel(r'$\alpha$')
ax.set_ylabel(r'$G_{\alpha}$ (kcal/mol)')
def EXP(w_F, compute_uncertainty=True, is_timeseries=False):
    """
    One-sided (unidirectional) exponential averaging (EXP) estimate of a free energy difference.

    ARGUMENTS
      w_F (numpy array) - w_F[t] is the forward work value from snapshot t, t = 0...(T-1).
        The number of samples T is taken from the size of the array.

    OPTIONAL ARGUMENTS
      compute_uncertainty (boolean) - if False, the statistical uncertainty is not computed (default: True)
      is_timeseries (boolean) - if True, correlation in the data is corrected for by estimating the
        statistical inefficiency (default: False)
        Use this option when passing correlated timeseries data that has not been subsampled
        to produce uncorrelated samples.

    RETURNS
      DeltaF (float) - the free energy difference between the two states.
      dDeltaF (float) - the uncertainty; the pair (DeltaF, dDeltaF) is returned only when
        compute_uncertainty is True, otherwise DeltaF alone is returned.

    NOTE
      If you are providing correlated timeseries data, be sure to set 'is_timeseries' to True.

    EXAMPLES

    Compute the free energy difference given a sample of forward work values.

    >>> import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> [DeltaF, dDeltaF] = EXP(w_F)
    >>> print 'Forward free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF)
    Forward free energy difference is 1.088 +- 0.076 kT
    >>> [DeltaF, dDeltaF] = EXP(w_R)
    >>> print 'Reverse free energy difference is %.3f +- %.3f kT' % (DeltaF, dDeltaF)
    Reverse free energy difference is -1.073 +- 0.082 kT

    """
    n_work = float(np.size(w_F))

    # DeltaF = - log < exp(-w_F) >, with the average formed in log space.
    DeltaF = -(_logsum(-w_F) - np.log(n_work))

    if not compute_uncertainty:
        return DeltaF

    # Boltzmann factors shifted by the largest exponent; the common scale
    # factor cancels in the ratio dx / <x> used below.
    max_arg = np.max(-w_F)
    x = np.exp(-w_F - max_arg)
    Ex = x.mean()

    # Statistical inefficiency: 1 for uncorrelated samples, estimated from the
    # x timeseries otherwise.
    g = 1.0
    if is_timeseries:
        import timeseries
        g = timeseries.statisticalInefficiency(x, x)

    # Standard error of <x> from the effective number of samples T / g, then
    # first-order propagation through the logarithm: dDeltaF = dx / <x>.
    n_effective = n_work / g
    dx = np.std(x) / np.sqrt(n_effective)
    dDeltaF = (dx / Ex)

    return (DeltaF, dDeltaF)
def main():
    """
    Subsample replica data, reweight it with MBAR and save folding / heat-capacity curves.

    Reads the temperature list from options.tfile and the energies (U_kn) and
    Q values (Q_kn) via read_data, subsamples every state using the statistical
    inefficiency of its Q timeseries, optionally inserts intermediate
    temperatures spaced dT apart, builds reduced potentials, runs pymbar.MBAR
    and computes expectations of E, E^2 and Q.  The Q curve is plotted to
    <options.direc>/foldingcurve.png, and the Q and Cv curves are saved with
    numpy.save.  Nothing is returned.
    """
    options = parse_args()
    # NOTE(review): 0.00831447 is the gas constant in kJ/(mol*K); dividing by
    # 4.184 (kJ per kcal) suggests the value is actually in kcal/(mol*K) --
    # confirm against the energy units of the input data.
    kB = 0.00831447/4.184 #Boltzmann constant (Gas constant) in kJ/(mol*K)
    dT = 2.5 # Temperature increment for calculating Cv(T)
    T = numpy.loadtxt(options.tfile)
    print 'Initial temperature states are', T
    K = len(T)
    U_kn, Q_kn, N_max = read_data(options,T,K)
    print 'Subsampling Q...'
    N_k = numpy.zeros(K,numpy.int32)
    g = numpy.zeros(K,numpy.float64)
    for k in range(K): # subsample the energies
        # The inefficiency is estimated from Q, and the same indices are then
        # applied to both U and Q of this state.
        g[k] = timeseries.statisticalInefficiency(Q_kn[k])#,suppress_warning=True)
        indices = numpy.array(timeseries.subsampleCorrelatedData(Q_kn[k],g=g[k])) # indices of uncorrelated samples
        N_k[k] = len(indices) # number of uncorrelated samples
        print '%i uncorrelated samples out of %i total samples' %(len(indices),options.N_max/options.skip)
        # Pack the retained samples into the leading N_k[k] columns in place.
        U_kn[k,0:N_k[k]] = U_kn[k,indices]
        Q_kn[k,0:N_k[k]] = Q_kn[k,indices]
    # Hard-coded switch: with insert = True the `else` branches below are never taken.
    insert = True
    if insert:
        #------------------------------------------------------------------------
        # Insert Intermediate T's and corresponding blank U's and E's
        #------------------------------------------------------------------------
        # Set up variables
        Temp_k = T
        currentT = T[0] + dT
        maxT = T[-1]
        i = 1
        print("--Inserting intermediate temperatures...")
        # Loop, inserting T's at which we are interested in the properties
        while (currentT < maxT) :
            if (currentT < Temp_k[i]):
                Temp_k = numpy.insert(Temp_k, i, currentT)
                currentT = currentT + dT
            else:
                # Step past the existing entry and continue dT above it.
                currentT = Temp_k[i] + dT
                i = i + 1
        # Update number of states
        K = len(Temp_k)
        print("--Inserting blank energies to match up with inserted temperatures...")
        # Loop, inserting E's into blank matrix (leaving blanks only where new Ts are inserted)
        Q_fromfile = Q_kn
        Nall_k = numpy.zeros([K], numpy.int32) # Number of samples (n) for each state (k) = number of iterations/energies
        E_kn = numpy.zeros([K, N_max], numpy.float64)
        Q_kn = numpy.zeros([K, N_max], numpy.float64)
        # i walks the original (sampled) temperatures; rows of inserted
        # temperatures keep zero samples.
        i = 0
        for k in range(K):
            if (Temp_k[k] == T[i]):
                E_kn[k,0:N_k[i]] = U_kn[i,0:N_k[i]]
                Q_kn[k,0:N_k[i]] = Q_fromfile[i,0:N_k[i]]
                Nall_k[k] = N_k[i]
                i = i + 1
    else:
        print 'Not inserting intermediate temperatures'
        Temp_k = T
        E_kn = U_kn
        Nall_k = N_k
    #------------------------------------------------------------------------
    # Compute inverse temperatures
    #------------------------------------------------------------------------
    beta_k = 1 / (kB * Temp_k)
    #------------------------------------------------------------------------
    # Compute reduced potential energies
    #------------------------------------------------------------------------
    print "--Computing reduced energies..."
    u_kln = numpy.zeros([K,K,N_max], numpy.float64) # u_kln is reduced pot. ener. of segment n of temp k evaluated at temp l
    for k in range(K):
        for l in range(K):
            u_kln[k,l,0:Nall_k[k]] = beta_k[l] * E_kn[k,0:Nall_k[k]]
    #------------------------------------------------------------------------
    # Initialize MBAR
    #------------------------------------------------------------------------
    # Initialize MBAR with Newton-Raphson
    print ""
    print "Initializing MBAR:"
    print "--K = number of Temperatures"
    print "--L = number of Temperatures"
    print "--N = number of Energies per Temperature"
    # Use Adaptive Method (Both Newton-Raphson and Self-Consistent, testing which is better)
    if insert:
        mbar = pymbar.MBAR(u_kln, Nall_k, method = 'adaptive', verbose=True, relative_tolerance=1e-12)
    else:
        # Seed MBAR with free energies from histogram WHAM.
        f_k = wham.histogram_wham(beta_k, U_kn, Nall_k, relative_tolerance = 1.0e-4)
        mbar = pymbar.MBAR(u_kln, Nall_k, initial_f_k = f_k, verbose=True)
    #------------------------------------------------------------------------
    # Compute Expectations for E_kt and E2_kt as E_expect and E2_expect
    #------------------------------------------------------------------------
    print ""
    print "Computing Expectations for E..."
    # The (value, uncertainty) pair returned by computeExpectations is scaled
    # as a whole by the beta_k factor before unpacking
    # (presumably relies on numpy broadcasting over the returned pair -- confirm).
    (E_expect, dE_expect) = mbar.computeExpectations(u_kln)*(beta_k)**(-1)
    print "Computing Expectations for E^2..."
    (E2_expect,dE2_expect) = mbar.computeExpectations(u_kln*u_kln)*(beta_k)**(-2)
    print "Computing Expectations for Q..."
    (Q,dQ) = mbar.computeExpectations(Q_kn)
    #------------------------------------------------------------------------
    # Compute Cv for NVT simulations as <E^2> - <E>^2 / (RT^2)
    #------------------------------------------------------------------------
    #print ""
    #print "Computing Heat Capacity as ( <E^2> - <E>^2 ) / ( R*T^2 )..."
    Cv_expect = numpy.zeros([K], numpy.float64)
    dCv_expect = numpy.zeros([K], numpy.float64)
    for i in range(K):
        Cv_expect[i] = (E2_expect[i] - (E_expect[i]*E_expect[i])) / ( kB * Temp_k[i] * Temp_k[i])
        dCv_expect[i] = 2*dE_expect[i]**2 / (kB *Temp_k[i]*Temp_k[i]) # from propagation of error
    #print "Temperature dA <E> +/- d<E> <E^2> +/- d<E^2> Cv +/- dCv"
    #print "-------------------------------------------------------------------------------"
    #for k in range(K):
    # print "%8.3f %8.3f %9.3f +/- %5.3f %9.1f +/- %5.1f %7.4f +/- %6.4f" % (Temp_k[k],mbar.f_k[k],E_expect[k],dE_expect[k],E2_expect[k],dE2_expect[k],Cv_expect[k], dCv_expect[k])
    #numpy.savetxt('/home/edz3fz/Qsurf_int.txt',Q)
    #numpy.savetxt('/home/edz3fz/dQsurf_int.txt',dQ)
    #numpy.savetxt('/home/edz3fz/dQsol.txt',dQ)
    #numpy.savetxt('/home/edz3fz/Qtemp.tt',Temp_k)
    # Imported here rather than at module level, so matplotlib is only needed
    # once the analysis has finished.
    import matplotlib.pyplot as plt
    #ncavg = numpy.average(Q_fromfile, axis=1)
    plt.figure(1)
    #plt.plot(T, ncavg, 'ko')
    plt.plot(Temp_k,Q,'k')
    plt.errorbar(Temp_k, Q, yerr=dQ)
    plt.xlabel('Temperature (K)')
    plt.ylabel('Q fraction native contacts')
    #plt.title('Heat Capacity from Go like model MC simulation of 1BSQ')
    plt.savefig(options.direc+'/foldingcurve.png')
    # Each saved array stacks [temperatures, values, uncertainties].
    numpy.save(options.direc+'/foldingcurve',numpy.array([Temp_k, Q, dQ]))
    numpy.save(options.direc+'/heatcap',numpy.array([Temp_k, Cv_expect, dCv_expect]))
    if options.show:
        plt.show()
# get state and reduced potentials line = lines[t] elements = line.split() state = int(elements[1]) - 1 # state in range(K) current_reduced_potential = float(elements[2]) for k in range(K): u_tk[t, k] = float(elements[3 + k]) # store state_t[t] = state u_t[t] = current_reduced_potential - g_k[state] print "u_t = " # outfile = open('%s.out' % datafile_directory,'w') # for t in range(T): # outfile.write("%16d %16.8f\n" % (t, u_t[t])) # outfile.close() g = timeseries.statisticalInefficiency(u_t, u_t) print "g = %16.8f" % g # compute correlation function print "Computing correlation function..." C_t = timeseries.normalizedFluctuationCorrelationFunction( u_t, u_t, int(3 * g)) # outfile = open('corrfun-%s.dat' % datafile_directory, 'w') # for t in range(len(C_t)): # outfile.write('%8d %16.8f\n' % (t, C_t[t])) # outfile.close() # Test MRS's hypothesis. u_t_singlestate = zeros([T], float64) for state in range(K): # construct timeseries Nstate = 0
#======================================================================== #------------------------------------------------------------------------ # Read Data From File #------------------------------------------------------------------------ print("") print("Preparing data:") T_from_file = read_simulation_temps(simulation,NumTemps) E_from_file = read_total_energies(simulation,TE_COL_NUM) K = len(T_from_file) N_k = numpy.zeros(K,numpy.int32) g = numpy.zeros(K,numpy.float64) for k in range(K): # subsample the energies g[k] = timeseries.statisticalInefficiency(E_from_file[k]) indices = numpy.array(timeseries.subsampleCorrelatedData(E_from_file[k],g=g[k])) # indices of uncorrelated samples N_k[k] = len(indices) # number of uncorrelated samples E_from_file[k,0:N_k[k]] = E_from_file[k,indices] #------------------------------------------------------------------------ # Insert Intermediate T's and corresponding blank U's and E's #------------------------------------------------------------------------ Temp_k = T_from_file minT = T_from_file[0] maxT = T_from_file[len(T_from_file) - 1] #beta = 1/(k*BT) #T = 1/(kB*beta) if dtype == 'temperature': minv = minT maxv = maxT
# get state and reduced potentials line = lines[t] elements = line.split() state = int(elements[1]) - 1 # state in range(K) current_reduced_potential = float(elements[2]) for k in range(K): u_tk[t,k] = float(elements[3 + k]) # store state_t[t] = state u_t[t] = current_reduced_potential - g_k[state] print "u_t = " # outfile = open('%s.out' % datafile_directory,'w') # for t in range(T): # outfile.write("%16d %16.8f\n" % (t, u_t[t])) # outfile.close() g = timeseries.statisticalInefficiency(u_t, u_t) print "g = %16.8f" % g # compute correlation function print "Computing correlation function..." C_t = timeseries.normalizedFluctuationCorrelationFunction(u_t, u_t, int(3 * g)) # outfile = open('corrfun-%s.dat' % datafile_directory, 'w') # for t in range(len(C_t)): # outfile.write('%8d %16.8f\n' % (t, C_t[t])) # outfile.close() # Test MRS's hypothesis. u_t_singlestate = zeros([T], float64) for state in range(K): # construct timeseries Nstate = 0 for t in range(T):
def EXP(w_F, compute_uncertainty=True, is_timeseries=False):
    """One-sided (unidirectional) exponential averaging (EXP) free energy estimate.

    Parameters
    ----------
    w_F : np.ndarray, float
        w_F[t] is the forward work value from snapshot t, t = 0...(T-1).
        The number of samples T is taken from the size of the array.
    compute_uncertainty : bool, optional, default=True
        If False, the statistical uncertainty is not computed.
    is_timeseries : bool, default=False
        If True, correlation in the data is corrected for by estimating the
        statistical inefficiency.  Use this when passing correlated timeseries
        data that has not been subsampled to produce uncorrelated samples.

    Returns
    -------
    result_vals : dictionary
        Possible keys in the result_vals dictionary:

        'Delta_f' : float
            Free energy difference (always present).
        'dDelta_f' : float
            Estimated standard deviation of the free energy difference
            (present only when compute_uncertainty is True).

    Notes
    -----
    If you are providing correlated timeseries data, be sure to set
    'is_timeseries' to True.

    Examples
    --------

    Compute the free energy difference given a sample of forward work values.

    >>> from pymbar import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> results = EXP(w_F)
    >>> print('Forward free energy difference is %.3f +- %.3f kT' % (results['Delta_f'], results['dDelta_f']))
    Forward free energy difference is 1.088 +- 0.076 kT
    >>> results = EXP(w_R)
    >>> print('Reverse free energy difference is %.3f +- %.3f kT' % (results['Delta_f'], results['dDelta_f']))
    Reverse free energy difference is -1.073 +- 0.082 kT

    """
    result_vals = dict()
    n_work = float(np.size(w_F))

    # DeltaF = - log < exp(-w_F) >, with the average formed in log space.
    DeltaF = -(logsumexp(-w_F) - np.log(n_work))
    result_vals['Delta_f'] = DeltaF

    if compute_uncertainty:
        # Boltzmann factors shifted by the largest exponent; the common scale
        # factor cancels in the ratio dx / <x> used below.
        max_arg = np.max(-w_F)
        x = np.exp(-w_F - max_arg)
        Ex = x.mean()

        # Statistical inefficiency: 1 for uncorrelated samples, estimated from
        # the x timeseries otherwise.
        g = 1.0
        if is_timeseries:
            import timeseries
            g = timeseries.statisticalInefficiency(x, x)

        # Standard error of <x> from the effective sample count T / g, then
        # first-order propagation through the logarithm.
        n_effective = n_work / g
        dx = np.std(x) / np.sqrt(n_effective)
        result_vals['dDelta_f'] = (dx / Ex)

    return result_vals
def EXPGauss(w_F, compute_uncertainty=True, is_timeseries=False):
    """Gaussian approximation to one-sided (unidirectional) exponential averaging.

    Parameters
    ----------
    w_F : np.ndarray, float
        w_F[t] is the forward work value from snapshot t, t = 0...(T-1).
        The number of samples T is taken from the size of the array.
    compute_uncertainty : bool, optional, default=True
        If False, the statistical uncertainty is not computed.
    is_timeseries : bool, default=False
        If True, correlation in the data is corrected for by estimating the
        statistical inefficiency.  Use this when passing correlated timeseries
        data that has not been subsampled to produce uncorrelated samples.

    Returns
    -------
    result_vals : dictionary
        Possible keys in the result_vals dictionary:

        'Delta_f' : float
            Free energy difference between the two states (always present).
        'dDelta_f' : float
            Estimated standard deviation of the free energy difference between
            the two states (present only when compute_uncertainty is True).

    Notes
    -----
    If you are providing correlated timeseries data, be sure to set
    'is_timeseries' to True.

    Examples
    --------
    Compute the free energy difference given a sample of forward work values.

    >>> from pymbar import testsystems
    >>> [w_F, w_R] = testsystems.gaussian_work_example(mu_F=None, DeltaF=1.0, seed=0)
    >>> results = EXPGauss(w_F)
    >>> print('Forward Gaussian approximated free energy difference is %.3f +- %.3f kT' % (results['Delta_f'], results['dDelta_f']))
    Forward Gaussian approximated free energy difference is 1.049 +- 0.089 kT
    >>> results = EXPGauss(w_R)
    >>> print('Reverse Gaussian approximated free energy difference is %.3f +- %.3f kT' % (results['Delta_f'], results['dDelta_f']))
    Reverse Gaussian approximated free energy difference is -1.073 +- 0.080 kT

    """
    n_work = float(np.size(w_F))
    work_var = np.var(w_F)

    # Gaussian approximation: dG = <U> - 0.5*var(U)
    DeltaF = np.average(w_F) - 0.5 * work_var
    result_vals = {'Delta_f': DeltaF}

    if not compute_uncertainty:
        return result_vals

    # Effective number of uncorrelated samples: all of them unless the data is
    # flagged as a correlated timeseries, in which case T is divided by g.
    n_effective = n_work
    if is_timeseries:
        import timeseries
        g = timeseries.statisticalInefficiency(w_F, w_F)
        n_effective = n_work / g

    # Squared error: a var / T_eff term plus a 0.5 * var^2 / (T_eff - 1) term.
    dx2 = work_var / n_effective + 0.5 * work_var * work_var / (n_effective - 1)
    result_vals['dDelta_f'] = np.sqrt(dx2)
    return result_vals
infile = open(filename, 'r') lines = infile.readlines() infile.close() # Parse data. n = 0 for line in lines: if line[0] != '#' and line[0] != '@': tokens = line.split() u_kn[k,n] = beta_k[k] * (float(tokens[2]) - float(tokens[1])) # reduced potential energy without umbrella restraint n += 1 # Compute correlation times for potential energy and chi # timeseries. If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi if (DifferentTemperatures): g_k[k] = timeseries.statisticalInefficiency(u_kn[k,:], u_kn[k,:]) print "Correlation time for set %5d is %10.3f" % (k,g_k[k]) indices = timeseries.subsampleCorrelatedData(u_kn[k,:]) else: g_k[k] = timeseries.statisticalInefficiency(numpy.cos(chi_kn[k,:]/(180.0/numpy.pi)),numpy.cos(chi_kn[k,:]/(180.0/numpy.pi))) print "Correlation time for set %5d is %10.3f" % (k,g_k[k]) indices = timeseries.subsampleCorrelatedData(numpy.cos(chi_kn[k,:]/(180.0/numpy.pi))) # Subsample data. N_k[k] = len(indices) u_kn[k,0:N_k[k]] = u_kn[k,indices] chi_kn[k,0:N_k[k]] = chi_kn[k,indices] # Set zero of u_kn -- this is arbitrary. u_kn -= u_kn.min() # Construct torsion bins
def analyze_data(store_filename, phipsi_outfile=None):
    """
    Analyze output from parallel tempering simulations.

    Opens the NetCDF store, discards the first `ndiscard` iterations, and
    prints correlation-time diagnostics: state-index mixing, end-to-end time,
    the statistical inefficiency of the summed reduced potential, and the
    integrated autocorrelation times of cos/sin of the phi and psi torsions
    (with block-based error estimates).  A LaTeX table row summarising the
    times is printed at the end.

    ARGUMENTS
      store_filename (string) - path of the NetCDF file to analyze

    OPTIONAL ARGUMENTS
      phipsi_outfile (string) - if not None, subsampled (phi, psi) data for states 1..nstates-1
        is written to this file (default: None)

    RETURNS
      results (dict) - allocated at the top of the function; no code in this function
        stores anything in it, so it is returned empty.

    NOTE
      The LaTeX line is formatted with `% vars()`, so the local variable names
      used in that format string must not be renamed.
      The NetCDF file handle is not closed inside this function.

    """
    temperature = 300.0 * units.kelvin # temperature
    ndiscard = 100 # number of samples to discard to equilibration
    # Allocate storage for results.
    results = dict()
    # Compute kappa
    nbins = 10
    kB = units.BOLTZMANN_CONSTANT_kB * units.AVOGADRO_CONSTANT_NA # Boltzmann constant
    kT = (kB * temperature) # thermal energy
    beta = 1.0 / kT # inverse temperature
    delta = 360.0 / float(nbins) * units.degrees # bin spacing
    sigma = delta/3.0 # standard deviation
    kappa = (sigma / units.radians)**(-2) # kappa parameter (unitless)
    # Open NetCDF file.
    ncfile = netcdf.Dataset(store_filename, 'r', version=2)
    # Get dimensions.
    [niterations, nstates, natoms, ndim] = ncfile.variables['positions'][:,:,:,:].shape
    print "%d iterations, %d states, %d atoms" % (niterations, nstates, natoms)
    # Discard initial configurations to equilibration.
    print "First %d iterations will be discarded to equilibration." % ndiscard
    # From here on niterations counts only the retained iterations.
    niterations -= ndiscard
    # Print summary statistics about mixing in state space.
    [tau2, dtau2] = show_mixing_statistics_with_error(ncfile)
    # Compute correlation time of state index.
    # NOTE(review): this reads all iterations, including the first ndiscard --
    # confirm that is intended.
    states = ncfile.variables['states'][:,:].copy()
    A_kn = [ states[:,k].copy() for k in range(nstates) ]
    g_states = timeseries.statisticalInefficiencyMultiple(A_kn)
    tau_states = (g_states-1.0)/2.0
    # Compute statistical error.
    nblocks = 10
    # NOTE(review): blocksize is used as a slice bound below, so this relies on
    # integer division (Python 2 semantics) -- confirm.
    blocksize = int(niterations) / int(nblocks)
    g_states_i = numpy.zeros([nblocks], numpy.float64)
    tau_states_i = numpy.zeros([nblocks], numpy.float64)
    for block_index in range(nblocks):
        # Extract block
        # NOTE(review): blocks start at iteration 0 rather than ndiscard, while
        # blocksize was derived from the reduced niterations -- confirm.
        states = ncfile.variables['states'][(blocksize*block_index):(blocksize*(block_index+1)),:].copy()
        A_kn = [ states[:,k].copy() for k in range(nstates) ]
        g_states_i[block_index] = timeseries.statisticalInefficiencyMultiple(A_kn)
        tau_states_i[block_index] = (g_states_i[block_index]-1.0)/2.0
    # Standard error over the block estimates.
    dg_states = g_states_i.std() / numpy.sqrt(float(nblocks))
    dtau_states = tau_states_i.std() / numpy.sqrt(float(nblocks))
    # Print.
    print "g_states = %.3f+-%.3f iterations" % (g_states, dg_states)
    print "tau_states = %.3f+-%.3f iterations" % (tau_states, dtau_states)
    del states, A_kn
    # Compute end-to-end time.
    states = ncfile.variables['states'][:,:].copy()
    [tau_end, dtau_end] = average_end_to_end_time(states)
    # Compute statistical inefficiency for reduced potential
    energies = ncfile.variables['energies'][ndiscard:,:,:].copy()
    states = ncfile.variables['states'][ndiscard:,:].copy()
    # u_n[iteration] is the sum over replicas of each replica's energy
    # evaluated at the state it currently occupies.
    u_n = numpy.zeros([niterations], numpy.float64)
    for iteration in range(niterations):
        u_n[iteration] = 0.0
        for replica in range(nstates):
            state = states[iteration,replica]
            u_n[iteration] += energies[iteration,replica,state]
    del energies, states
    g_u = timeseries.statisticalInefficiency(u_n)
    print "g_u = %8.1f iterations" % g_u
    # Compute x and y umbrellas.
    print "Computing torsions..."
    positions = ncfile.variables['positions'][ndiscard:,:,:,:]
    coordinates = units.Quantity(numpy.zeros([natoms,ndim], numpy.float32), units.angstroms)
    # phi_it / psi_it are indexed [replica, iteration].
    phi_it = units.Quantity(numpy.zeros([nstates,niterations], numpy.float32), units.radians)
    psi_it = units.Quantity(numpy.zeros([nstates,niterations], numpy.float32), units.radians)
    for iteration in range(niterations):
        for replica in range(nstates):
            coordinates[:,:] = units.Quantity(positions[iteration,replica,:,:].copy(), units.angstroms)
            # Torsions are defined by fixed atom indices
            # (presumably the alanine dipeptide phi/psi atoms -- confirm).
            phi_it[replica,iteration] = compute_torsion(coordinates, 4, 6, 8, 14)
            psi_it[replica,iteration] = compute_torsion(coordinates, 6, 8, 14, 16)
    # Run MBAR.
    print "Grouping torsions by state..."
    # phi_state_it / psi_state_it are indexed [state, iteration]; row 0 is left
    # at zero because the loops below start at state 1.
    phi_state_it = numpy.zeros([nstates,niterations], numpy.float32)
    psi_state_it = numpy.zeros([nstates,niterations], numpy.float32)
    states = ncfile.variables['states'][ndiscard:,:].copy()
    for iteration in range(niterations):
        # argsort of the per-replica state indices gives, for each state, the
        # replica occupying it (assuming the row is a permutation of the
        # states -- confirm).
        replicas = numpy.argsort(states[iteration,:])
        for state in range(1,nstates):
            replica = replicas[state]
            phi_state_it[state,iteration] = phi_it[replica,iteration] / units.radians
            psi_state_it[state,iteration] = psi_it[replica,iteration] / units.radians
    print "Evaluating reduced potential energies..."
    N_k = numpy.ones([nstates], numpy.int32) * niterations
    u_kln = numpy.zeros([nstates, nstates, niterations], numpy.float32)
    for l in range(1,nstates):
        # Restraint centre of state l on the nbins x nbins (phi, psi) grid.
        phi0 = ((numpy.floor((l-1)/nbins) + 0.5) * delta - 180.0 * units.degrees) / units.radians
        psi0 = ((numpy.remainder((l-1), nbins) + 0.5) * delta - 180.0 * units.degrees) / units.radians
        u_kln[:,l,:] = - kappa * numpy.cos(phi_state_it[:,:] - phi0) - kappa * numpy.cos(psi_state_it[:,:] - psi0)
    # print "Running MBAR..."
    # #mbar = pymbar.MBAR(u_kln, N_k, verbose=True, method='self-consistent-iteration')
    # mbar = pymbar.MBAR(u_kln[1:,1:,:], N_k[1:], verbose=True, method='adaptive', relative_tolerance=1.0e-2) # only use biased samples
    # f_k = mbar.f_k
    # mbar = pymbar.MBAR(u_kln[1:,1:,:], N_k[1:], verbose=True, method='Newton-Raphson', initial_f_k=f_k) # only use biased samples
    # #mbar = pymbar.MBAR(u_kln, N_k, verbose=True, method='Newton-Raphson', initialize='BAR')
    # print "Getting free energy differences..."
    # [df_ij, ddf_ij] = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')
    # print df_ij
    # print ddf_ij
    # print "ln(Z_ij / Z_55):"
    # reference_bin = 4*nbins+4
    # for psi_index in range(nbins):
    # print " [,%2d]" % (psi_index+1),
    # print ""
    # for phi_index in range(nbins):
    # print "[%2d,]" % (phi_index+1),
    # for psi_index in range(nbins):
    # print "%8.3f" % (-df_ij[reference_bin, phi_index*nbins+psi_index]),
    # print ""
    # print ""
    # print "dln(Z_ij / Z_55):"
    # reference_bin = 4*nbins+4
    # for psi_index in range(nbins):
    # print " [,%2d]" % (psi_index+1),
    # print ""
    # for phi_index in range(nbins):
    # print "[%2d,]" % (phi_index+1),
    # for psi_index in range(nbins):
    # print "%8.3f" % (ddf_ij[reference_bin, phi_index*nbins+psi_index]),
    # print ""
    # print ""
    # Compute statistical inefficiencies of various functions of the timeseries data.
    print "Computing statistical infficiencies of cos(phi), sin(phi), cos(psi), sin(psi)..."
    # One timeseries per replica, skipping replica 0.
    cosphi_kn = [ numpy.cos(phi_it[replica,:] / units.radians).copy() for replica in range(1,nstates) ]
    sinphi_kn = [ numpy.sin(phi_it[replica,:] / units.radians).copy() for replica in range(1,nstates) ]
    cospsi_kn = [ numpy.cos(psi_it[replica,:] / units.radians).copy() for replica in range(1,nstates) ]
    sinpsi_kn = [ numpy.sin(psi_it[replica,:] / units.radians).copy() for replica in range(1,nstates) ]
    g_cosphi = timeseries.statisticalInefficiencyMultiple(cosphi_kn)
    g_sinphi = timeseries.statisticalInefficiencyMultiple(sinphi_kn)
    g_cospsi = timeseries.statisticalInefficiencyMultiple(cospsi_kn)
    g_sinpsi = timeseries.statisticalInefficiencyMultiple(sinpsi_kn)
    # Convert statistical inefficiency g to correlation time via tau = (g - 1) / 2.
    tau_cosphi = (g_cosphi-1.0)/2.0
    tau_sinphi = (g_sinphi-1.0)/2.0
    tau_cospsi = (g_cospsi-1.0)/2.0
    tau_sinpsi = (g_sinpsi-1.0)/2.0
    # Compute relaxation times in each torsion.
    print "Relaxation times for transitions among phi or psi bins alone:"
    # The divisor is slightly wider than delta
    # (presumably so +180 degrees maps to bin nbins-1 instead of nbins -- confirm).
    phibin_it = ((phi_it + 180.0 * units.degrees) / (delta + 0.1*units.degrees)).astype(numpy.int16)
    tau_phi = compute_relaxation_time(phibin_it, nbins)
    psibin_it = ((psi_it + 180.0 * units.degrees) / (delta + 0.1*units.degrees)).astype(numpy.int16)
    tau_psi = compute_relaxation_time(psibin_it, nbins)
    print "tau_phi = %8.1f iteration" % tau_phi
    print "tau_psi = %8.1f iteration" % tau_psi
    # Compute statistical error.
    nblocks = 10
    blocksize = int(niterations) / int(nblocks)
    g_cosphi_i = numpy.zeros([nblocks], numpy.float64)
    g_sinphi_i = numpy.zeros([nblocks], numpy.float64)
    g_cospsi_i = numpy.zeros([nblocks], numpy.float64)
    g_sinpsi_i = numpy.zeros([nblocks], numpy.float64)
    tau_cosphi_i = numpy.zeros([nblocks], numpy.float64)
    tau_sinphi_i = numpy.zeros([nblocks], numpy.float64)
    tau_cospsi_i = numpy.zeros([nblocks], numpy.float64)
    tau_sinpsi_i = numpy.zeros([nblocks], numpy.float64)
    for block_index in range(nblocks):
        # Extract block
        slice_indices = range(blocksize*block_index,blocksize*(block_index+1))
        cosphi_kn = [ numpy.cos(phi_it[replica,slice_indices] / units.radians).copy() for replica in range(1,nstates) ]
        sinphi_kn = [ numpy.sin(phi_it[replica,slice_indices] / units.radians).copy() for replica in range(1,nstates) ]
        cospsi_kn = [ numpy.cos(psi_it[replica,slice_indices] / units.radians).copy() for replica in range(1,nstates) ]
        sinpsi_kn = [ numpy.sin(psi_it[replica,slice_indices] / units.radians).copy() for replica in range(1,nstates) ]
        g_cosphi_i[block_index] = timeseries.statisticalInefficiencyMultiple(cosphi_kn)
        g_sinphi_i[block_index] = timeseries.statisticalInefficiencyMultiple(sinphi_kn)
        g_cospsi_i[block_index] = timeseries.statisticalInefficiencyMultiple(cospsi_kn)
        g_sinpsi_i[block_index] = timeseries.statisticalInefficiencyMultiple(sinpsi_kn)
        tau_cosphi_i[block_index] = (g_cosphi_i[block_index]-1.0)/2.0
        tau_sinphi_i[block_index] = (g_sinphi_i[block_index]-1.0)/2.0
        tau_cospsi_i[block_index] = (g_cospsi_i[block_index]-1.0)/2.0
        tau_sinpsi_i[block_index] = (g_sinpsi_i[block_index]-1.0)/2.0
    # Standard error over the block estimates.
    dtau_cosphi = tau_cosphi_i.std() / numpy.sqrt(float(nblocks))
    dtau_sinphi = tau_sinphi_i.std() / numpy.sqrt(float(nblocks))
    dtau_cospsi = tau_cospsi_i.std() / numpy.sqrt(float(nblocks))
    dtau_sinpsi = tau_sinpsi_i.std() / numpy.sqrt(float(nblocks))
    del cosphi_kn, sinphi_kn, cospsi_kn, sinpsi_kn
    print "Integrated autocorrelation times"
    print "tau_cosphi = %8.1f+-%.1f iterations" % (tau_cosphi, dtau_cosphi)
    print "tau_sinphi = %8.1f+-%.1f iterations" % (tau_sinphi, dtau_sinphi)
    print "tau_cospsi = %8.1f+-%.1f iterations" % (tau_cospsi, dtau_cospsi)
    print "tau_sinpsi = %8.1f+-%.1f iterations" % (tau_sinpsi, dtau_sinpsi)
    # Print LaTeX line.
    # The placeholders are filled from the local namespace via vars().
    print ""
    print "%(store_filename)s & %(tau2).2f $\pm$ %(dtau2).2f & %(tau_states).2f $\pm$ %(dtau_states).2f & %(tau_end).2f $\pm$ %(dtau_end).2f & %(tau_cosphi).2f $\pm$ %(dtau_cosphi).2f & %(tau_sinphi).2f $\pm$ %(dtau_sinphi).2f & %(tau_cospsi).2f $\pm$ %(dtau_cospsi).2f & %(tau_sinpsi).2f $\pm$ %(dtau_sinpsi).2f \\\\" % vars()
    print ""
    if phipsi_outfile is not None:
        # Write uncorrelated (phi,psi) data
        # NOTE(review): the file is closed only on the normal path; an
        # exception while writing leaves it open.
        outfile = open(phipsi_outfile, 'w')
        outfile.write('# alanine dipeptide 2d umbrella sampling data\n')
        # Write umbrella restraints
        nbins = 10 # number of bins per torsion
        outfile.write('# %d x %d grid of restraints\n' % (nbins, nbins))
        outfile.write('# Each state was sampled from p_i(x) = Z_i^{-1} q(x) q_i(x) where q_i(x) = exp[kappa*cos(phi(x)-phi_i) + kappa*cos(psi(x)-psi_i)]\n')
        outfile.write('# phi(x) and psi(x) are periodic torsion angles on domain [-180, +180) degrees.\n')
        outfile.write('# kappa = %f\n' % kappa)
        outfile.write('# phi_i = [-180 + (floor(i / nbins) + 0.5) * delta] degrees\n')
        outfile.write('# psi_i = [-180 + ( (i % nbins) + 0.5) * delta] degrees\n')
        outfile.write('# where i = 0...%d, nbins = %d, and delta = %f degrees\n' % (nbins*nbins-1, nbins, delta / units.degrees))
        outfile.write('# Data has been subsampled to generate approximately uncorrelated samples.\n')
        outfile.write('#\n')
        # write data header
        outfile.write('# ')
        for replica in range(nstates):
            outfile.write('state %06d ' % replica)
        outfile.write('\n')
        # write data
        # Rows are chosen by subsampling the summed reduced potential u_n with
        # its statistical inefficiency g_u.
        indices = timeseries.subsampleCorrelatedData(u_n, g=g_u) # indices of uncorrelated iterations
        states = ncfile.variables['states'][ndiscard:,:].copy()
        for iteration in indices:
            outfile.write(' ')
            replicas = numpy.argsort(states[iteration,:])
            # NOTE(review): the header above lists all nstates columns, but
            # data is written only for states 1..nstates-1 -- confirm.
            for state in range(1,nstates):
                replica = replicas[state]
                outfile.write('%+6.1f %+6.1f ' % (phi_it[replica,iteration] / units.degrees, psi_it[replica,iteration] / units.degrees))
            outfile.write('\n')
        outfile.close()
    return results
# dtau_end = tau_end_i.std() / numpy.sqrt(float(nblocks)) # Print. print "tau_end = %.3f+-%.3f iterations" % (tau_end, dtau_end) del states # Compute statistical inefficiency for reduced potential energies = ncfile.variables['energies'][:,:,:].copy() states = ncfile.variables['states'][:,:].copy() u_n = numpy.zeros([niterations], numpy.float64) for iteration in range(niterations): u_n[iteration] = 0.0 for replica in range(nstates): state = states[iteration,replica] u_n[iteration] += energies[iteration,replica,state] del energies, states g_u = timeseries.statisticalInefficiency(u_n) tau_u = (g_u-1.0)/2.0 print "g_u = %8.1f iterations" % g_u print "tau_u = %8.1f iterations" % tau_u # DEBUG for lactalbumin #continue # Compute torsions. print "Computing torsions..." positions = ncfile.variables['positions'][:,:,:,:] coordinates = units.Quantity(numpy.zeros([natoms,ndim], numpy.float32), units.angstroms) phi_it = units.Quantity(numpy.zeros([nstates,niterations], numpy.float32), units.radians) psi_it = units.Quantity(numpy.zeros([nstates,niterations], numpy.float32), units.radians) for iteration in range(niterations): for replica in range(nstates):
def _subsample_kln(self, u_kln):
    """
    Discard equilibration data and subsample u_kln to uncorrelated samples.

    ARGUMENTS
      u_kln (numpy.array) - 3-dimensional array indexed as u_kln[k,l,n]; the first two
        axes are indexed by state and the last axis is sliced by iteration.
        (Presumably reduced potential energies - confirm against caller.)

    RETURNS
      None.  Results are stored on the instance:
        self.nequil           - start of equilibrated data (one value, or one per state
                                when self.subsample_method == 'per-state')
        self.g_t              - statistical inefficiency (one value, or one per state)
        self.N_k              - number of retained samples for each state
        self.u_kln            - equilibrated, subsampled copy of u_kln
        self.u_n              - truncated at self.nequil (non per-state mode only)
        self.retained_indices - indices of the retained samples
        self.retained_iters   - number of retained samples (the N_k array in per-state mode)

    NOTES
      If self.save_equil_data is set, equilibration data is read from (and, when it had
      to be recomputed, written to) an .npz file in self.source_directory with the keys
      'nequil', 'g_t' and 'Neff_max'.  Any failure to load that file triggers a recompute.
      When recomputing in non per-state mode with a caller-supplied self.nequil, the
      detected start is stored in self.nequil_timeseries and self.nequil is left untouched.
      When recomputing in per-state mode, self.nequil is set to zero for every state.
    """
    per_state = (self.subsample_method == 'per-state')

    # Neff_max is only computed on some of the paths below.  Bind it up front so the
    # verbose report cannot raise NameError when self.nequil was supplied by the caller
    # and no equilibration cache is in use.
    Neff_max = None

    if self.save_equil_data:
        # Cache file holding previously determined equilibration data.
        equil_filename = os.path.join(self.source_directory, self.save_prefix + self.phase + '_equil_data_%s.npz' % self.subsample_method)
        try:
            equil_data = numpy.load(equil_filename)
            if self.nequil is None:
                self.nequil = equil_data['nequil']
            elif type(self.nequil) is int and per_state:
                print("WARRNING: Per-state subsampling requested with only single value for equilibration...")
                try:
                    self.nequil = equil_data['nequil']
                    print("Loading equilibration from file with %i states read" % self.nstates)
                except Exception:
                    print("Assuming equal equilibration per state of %i" % self.nequil)
                    self.nequil = numpy.array([self.nequil] * self.nstates)
            self.g_t = equil_data['g_t']
            Neff_max = equil_data['Neff_max']
            # Cached data covering fewer states than we have is unusable; raise so the
            # handler below recomputes everything.
            if per_state and (len(self.g_t) < self.nstates or len(self.nequil) < self.nstates):
                equil_loaded = False
                raise IndexError
            else:
                equil_loaded = True
        except Exception:
            # Cache missing, unreadable or incomplete: recompute.  Only Exception is
            # caught so that KeyboardInterrupt / SystemExit still propagate.
            if per_state:
                self.nequil = numpy.zeros([self.nstates], dtype=numpy.int32)
                self.g_t = numpy.zeros([self.nstates])
                Neff_max = numpy.zeros([self.nstates])
                for k in range(self.nstates):
                    if self.verbose:
                        print("Computing timeseries for state %i/%i" % (k, self.nstates-1))
                    # No equilibration detection here: the whole series is kept and
                    # only its statistical inefficiency is estimated.
                    self.nequil[k] = 0
                    self.g_t[k] = timeseries.statisticalInefficiency(u_kln[k,k,:])
                    Neff_max[k] = (u_kln[k,k,:].size + 1) / self.g_t[k]
            else:
                if self.nequil is None:
                    [self.nequil, self.g_t, Neff_max] = self._detect_equilibration(self.u_n)
                else:
                    # Keep the caller-supplied nequil; record the detected value separately.
                    [self.nequil_timeseries, self.g_t, Neff_max] = self._detect_equilibration(self.u_n)
            equil_loaded = False
        if not equil_loaded:
            numpy.savez(equil_filename, nequil=self.nequil, g_t=self.g_t, Neff_max=Neff_max)
    elif self.nequil is None:
        if per_state:
            self.nequil = numpy.zeros([self.nstates], dtype=numpy.int32)
            self.g_t = numpy.zeros([self.nstates])
            Neff_max = numpy.zeros([self.nstates])
            for k in range(self.nstates):
                [self.nequil[k], self.g_t[k], Neff_max[k]] = self._detect_equilibration(u_kln[k,k,:])
                if self.verbose:
                    print("State %i equilibrated with %i samples" % (k, int(Neff_max[k])))
        else:
            [self.nequil, self.g_t, Neff_max] = self._detect_equilibration(self.u_n)

    if self.verbose:
        print([self.nequil, Neff_max])

    # 1) Discard equilibration data
    # 2) Subsample data to obtain uncorrelated samples
    self.N_k = numpy.zeros(self.nstates, numpy.int32)
    if per_state:
        # Discard samples.  Each state may keep a different number of iterations, so
        # the array is sized for the longest one and shorter ones stay zero-padded.
        nsamples_equil = self.niterations - self.nequil
        max_equil_samples = nsamples_equil.max()
        self.u_kln = numpy.zeros([self.nstates, self.nstates, max_equil_samples])
        for k in range(self.nstates):
            self.u_kln[k,:,:nsamples_equil[k]] = u_kln[k,:,self.nequil[k]:]
        # Subsample
        transfer_retained_indices = numpy.zeros([self.nstates, max_equil_samples], dtype=numpy.int32)
        for k in range(self.nstates):
            state_indices = timeseries.subsampleCorrelatedData(self.u_kln[k,k,:], g=self.g_t[k])
            self.N_k[k] = len(state_indices)
            transfer_retained_indices[k,:self.N_k[k]] = state_indices
        max_retained = self.N_k.max()
        transfer_kln = numpy.zeros([self.nstates, self.nstates, max_retained])
        self.retained_indices = numpy.zeros([self.nstates, max_retained], dtype=numpy.int32)
        for k in range(self.nstates):
            n_k = self.N_k[k]
            # Memory reduction: keep only as many index columns as were actually used.
            self.retained_indices[k,:n_k] = transfer_retained_indices[k,:n_k]
            # An integer index and an index array separated by a slice make NumPy put
            # the indexed axis first, so the result is (n_k, nstates); transpose it back.
            transfer_kln[k,:,:n_k] = self.u_kln[k,:,self.retained_indices[k,:n_k]].T
        # Cut down on memory, once function is done, transfer_kln should be released
        self.u_kln = transfer_kln
        self.retained_iters = self.N_k
    else:
        # Discard samples
        self.u_kln = u_kln[:,:,self.nequil:]
        self.u_n = self.u_n[self.nequil:]
        # Subsample
        indices = timeseries.subsampleCorrelatedData(self.u_n, g=self.g_t) # indices of uncorrelated samples
        self.u_kln = self.u_kln[:,:,indices]
        self.N_k[:] = len(indices)
        self.retained_indices = indices
        self.retained_iters = len(indices)
    return