def subsampletimeseries(timeser, xyzn, N_k):
    """
    Return a subsampled timeseries based on statistical inefficiency calculations.

    Parameters
    ----------
    timeser: the timeseries to be subsampled
    xyzn: the coordinates associated with each frame of the timeseries to be subsampled
    N_k: original number of samples in each timeseries

    Returns
    -------
    ts_sub: the subsampled timeseries
    N_k_sub: new number of samples per timeseries
    xyz_sub: the subsampled configuration series
    ind: the indices of the retained samples, per timeseries
    """
    # Make a copy of the timeseries and coordinates
    ts = timeser
    xyz = xyzn

    # Initialize array of statistical inefficiencies
    g = np.zeros(len(ts), np.float64)

    for i, t in enumerate(ts):
        if np.count_nonzero(t) == 0:
            # An all-zero series has no correlation structure to estimate
            g[i] = 1.0
            print("WARNING: all-zero timeseries encountered; using g = 1")
        else:
            g[i] = timeseries.statisticalInefficiency(t)

    ind = [timeseries.subsampleCorrelatedData(t, g=b) for t, b in zip(ts, g)]
    N_k_sub = np.array([len(i) for i in ind])

    if (N_k_sub == N_k).all():
        ts_sub = ts
        xyz_sub = xyz
        print("No sub-sampling occurred")
    else:
        print("Sub-sampling...")
        ts_sub = np.array([t[i] for t, i in zip(ts, ind)])
        # Collect the subsampled coordinates for every timeseries, not just the last one
        xyz_sub = [[j[ii] for ii in ind[i]] for i, j in enumerate(xyz)]

    return ts_sub, N_k_sub, xyz_sub, ind
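# A minimal usage sketch for subsampletimeseries above, assuming `numpy as np` and
# `from pymbar import timeseries` are imported at module level as the function requires.
# The AR(1) series below is a synthetic stand-in for correlated simulation data.
rng = np.random.RandomState(0)
raw = np.zeros((1, 1000))
for t in range(1, 1000):
    raw[0, t] = 0.9 * raw[0, t - 1] + rng.randn()  # strongly correlated in time
coords = [rng.randn(1000, 3)]                      # one (x, y, z) frame per sample
N_k = np.array([1000])
ts_sub, N_k_sub, xyz_sub, ind = subsampletimeseries(raw, coords, N_k)
print(N_k_sub)  # expect far fewer than 1000 retained samples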
def subsample_data_along_axis(data, subsample_rate, axis):
    """
    Generate a decorrelated version of the input data, subsampled at the given
    rate along a single axis.

    Parameters
    ----------
    data : array-like of arbitrary dimensionality
    subsample_rate : float or int
        Rate at which to draw samples. A sample is considered decorrelated
        after every ceil(subsample_rate) indices along the specified axis.
    axis : int
        Axis along which to apply the subsampling.

    Returns
    -------
    subsampled_data : ndarray with the same number of dimensions as data
        Data subsampled along the given axis.
    """
    # TODO: find a name for the function that clarifies that decorrelation
    # TODO: is determined exclusively by subsample_rate?
    cast_data = np.asarray(data)
    data_shape = cast_data.shape
    # Since we already have g, we can pass any array of the right length to the subsample function
    indices = timeseries.subsampleCorrelatedData(np.zeros(data_shape[axis]), g=subsample_rate)
    subsampled_data = np.take(cast_data, indices, axis=axis)
    return subsampled_data
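# Sketch: subsampling a 2-D array along its frame axis with a known statistical
# inefficiency. With g = 5, roughly one frame in five is retained.
import numpy as np
from pymbar import timeseries

example_data = np.random.randn(4, 1000)  # 4 observables, 1000 frames each
thinned = subsample_data_along_axis(example_data, subsample_rate=5.0, axis=1)
print(thinned.shape)  # approximately (4, 200)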
def compute_timeseries(reduced_potentials):
    """
    Use pymbar timeseries to compute the uncorrelated samples in an array of reduced potentials.
    Returns the uncorrelated sample indices.

    Arguments
    ---------
    reduced_potentials : np.array of floats
        reduced potentials from which a timeseries is to be extracted

    Returns
    -------
    t0 : int
        production region index
    g : float
        statistical inefficiency
    Neff_max : int
        effective number of samples in production region
    A_t : np.array
        the uncorrelated reduced potentials of the production region
    full_uncorrelated_indices : list of ints
        uncorrelated indices
    """
    from pymbar import timeseries
    t0, g, Neff_max = timeseries.detectEquilibration(reduced_potentials)  # compute indices of uncorrelated timeseries
    A_t_equil = reduced_potentials[t0:]
    uncorrelated_indices = timeseries.subsampleCorrelatedData(A_t_equil, g=g)
    A_t = A_t_equil[uncorrelated_indices]
    full_uncorrelated_indices = [i + t0 for i in uncorrelated_indices]

    return [t0, g, Neff_max, A_t, full_uncorrelated_indices]
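# Sketch: compute_timeseries applied to a synthetic reduced-potential trace with
# an initial relaxation transient; all numbers here are made up for illustration.
import numpy as np

rng = np.random.RandomState(1)
u = np.concatenate([np.linspace(5.0, 0.0, 200), np.zeros(800)]) + 0.5 * rng.randn(1000)
t0, g, Neff_max, A_t, idx = compute_timeseries(u)
print(t0, g, Neff_max)  # t0 should land near the end of the transient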
def prepWindow(filename, tstart=0, tstop=None):
    """
    Read window .traj file, compute correlation times, subsample data.

    Parameters
    ----------
    filename: string
        name of the file to process. For a *.traj file, assumes all lines
        are data (e.g. no comment lines).
    tstart: integer
        nanosecond start time
    tstop: integer
        nanosecond stop time

    Returns
    -------
    zsublen: int, number of subsampled entries for this particular window
    z_sub: numpy list containing SUBSAMPLED data for this window from tstart to tstop
    """
    # Parse data.
    n, z_sub = parseWindow(filename, tstart, tstop)

    # Compute correlation time for the z (actual spring center position) timeseries.
    g = timeseries.statisticalInefficiency(z_sub)
    print("Correlation time for %s is %10.3f" % (re.split('\W+', filename)[1], g))
    indices = timeseries.subsampleCorrelatedData(z_sub, g)

    # Subsample data.
    zsublen = len(indices)
    z_sub = z_sub[indices]

    return zsublen, z_sub
def calc_df(u_kln):
    """
    Compute free energy differences with MBAR.

    u_kln should be (nstates) x (nstates) x (nframes) and already normalized
    by kT, where element [k, l, n] is the reduced energy of the configuration
    from frame n of a trajectory conducted with state k, recalculated using
    the parameters of state l.
    """
    dims = u_kln.shape
    if dims[0] != dims[1]:
        raise ValueError(
            "dimensions {} of u_kln should be square in the first two indices".format(dims))
    nstates = dims[0]

    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T

    # Compute free energy differences and statistical uncertainties
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

    return DeltaF_ij, dDeltaF_ij
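# Sketch: exercising calc_df on a toy two-state system whose samples are drawn
# exactly from each state's Boltzmann distribution (harmonic wells of equal
# stiffness), so DeltaF_ij[0, 1] should come out near zero. Assumes `numpy as np`,
# `from pymbar import MBAR, timeseries`, and a pymbar version whose
# getFreeEnergyDifferences returns three values, matching the call above.
rng = np.random.RandomState(2)
mu = [0.0, 1.0]
nframes = 2000
u_kln_demo = np.zeros((2, 2, nframes))
for k in range(2):
    x = rng.randn(nframes) + mu[k]  # samples from state k
    for l in range(2):
        u_kln_demo[k, l, :] = 0.5 * (x - mu[l]) ** 2  # reduced energy evaluated at state l
DeltaF_ij, dDeltaF_ij = calc_df(u_kln_demo)
print(DeltaF_ij[0, 1], "+/-", dDeltaF_ij[0, 1])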
def gamma_rt(cos, wat, r):
    """
    Calculate the preferential interaction coefficient (gamma) of a protein with water and a cosolvent.

    ***ALL DISTANCES ARE IN NANOMETERS***

    Input: cos, wat, r
    - cos : (T frames) X (N cosolvent molecules) array, the minimum distance of each cosolvent molecule to the protein Van der Waals surface for each frame.
    - wat : (T frames) X (M water molecules) array, the minimum distance of each water molecule to the protein Van der Waals surface for each frame.
    - r : float, distance dividing the local and bulk domains of the solvent.

    Returns: gamma, sample
    - gamma : (T frames) array, gamma for the given r, for each inputted frame.
    - sample : list, the N_effective independent frames of gamma to be used for calculation of the time average of gamma. Obtained using the method of Chodera (2016).

    References:
    - BM Baynes and BL Trout. Proteins in mixed solvents: a molecular-level perspective. J. Phys. Chem. B. 107, 14058-14067 (2003).
    - D Shukla, C Shinde, and BL Trout. Molecular computations of preferential interaction coefficients of proteins. J. Phys. Chem. B. 113, 12546-12554 (2009).
    - JD Chodera. J. Chem. Theor. Comput. 12, 1799 (2016).
    """
    n_i_x = np.sum(cos > r, axis=1).astype(float)    # cosolvent molecules in the bulk domain
    n_ii_x = np.sum(cos < r, axis=1).astype(float)   # cosolvent molecules in the local domain
    n_i_w = np.sum(wat > r, axis=1).astype(float)    # water molecules in the bulk domain
    n_ii_w = np.sum(wat < r, axis=1).astype(float)   # water molecules in the local domain

    gamma = n_ii_x - n_ii_w * (n_i_x / n_i_w)
    sample = subsampleCorrelatedData(gamma)

    return gamma, sample
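# Sketch: gamma_rt on synthetic minimum-distance arrays (uniform random distances
# in nm), assuming `subsampleCorrelatedData` was imported from pymbar.timeseries
# as the bare name used above. Real inputs would come from trajectory analysis.
import numpy as np

rng = np.random.RandomState(3)
cos_d = rng.uniform(0.0, 3.0, size=(500, 40))    # 500 frames, 40 cosolvent molecules
wat_d = rng.uniform(0.0, 3.0, size=(500, 4000))  # 500 frames, 4000 water molecules
gamma, sample = gamma_rt(cos_d, wat_d, r=0.8)
print(np.mean(gamma[sample]))  # time average over the independent frames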
def calcTension(energy_data, verbose=False): dE1 = energy_data[:, 1] - energy_data[:, 0] dE2 = energy_data[:, 2] - energy_data[:, 0] BdE1 = dE1 / kTkJmol BdE2 = dE2 / kTkJmol nstates = 2 nframes = len(dE1) u_kln = np.zeros([nstates, nstates, nframes], np.float64) u_kln[0, 1, :] = BdE1 u_kln[1, 0, :] = BdE2 N_k = np.zeros([nstates], np.int32) # number of uncorrelated samples for k in range(nstates): [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :]) indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g) N_k[k] = len(indices) u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T if verbose: print("...found {} uncorrelated samples out of {} total samples...". format(N_k, nframes)) if verbose: print("=== Computing free energy differences ===") mbar = MBAR(u_kln, N_k) [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences() tension = DeltaF_ij[ 0, 1] / da * 1e18 * kT #(in J/m^2). note da already has a factor of two for the two areas! tensionError = dDeltaF_ij[0, 1] / da * 1e18 * kT if verbose: print('tension (pymbar): {} +/- {}N/m'.format(tension, tensionError)) return tension, tensionError
def subsample(x, y_mat, num_cols=None):
    """
    Parameters
    ----------
    x : numpy array
        1-dimensional array with x-data, such as timestep.
    y_mat : can take various forms:
        - list of numpy arrays, such as grouping 1-column data into smaller data series
        - 1D numpy array, such as subsampling 1-column data
        - multidimensional numpy array, if data has many columns
    num_cols : int (opt.)
        Number of data series for the input y_mat. Use this value to loop over
        the input data, since it can be formatted as a 1- or N-dimensional list
        or numpy array. If num_cols is not specified, the value will be
        extracted from the input data using the find_num_cols function.

    Returns
    -------
    x_mat : list
        multi-dimensional array of the same shape as z_mat
    z_mat : list
        multi-dimensional array in which z_mat[i][j] is the jth value in the ith data series.
    """
    from pymbar import timeseries
    x_mat = []
    z_mat = []  # subsampled y_mat

    if num_cols is None:
        num_cols = find_num_cols(y_mat)

    for i in range(num_cols):
        # list of np arrays
        if type(y_mat) is list and len(y_mat[0]) > 1:
            y = y_mat[i]
        # 1D np array
        elif type(y_mat) is np.ndarray and len(y_mat.shape) == 1:
            y = y_mat
        # multidimensional np array
        else:
            y = y_mat[:, i]

        # compute correlation times
        g = timeseries.statisticalInefficiency(y)
        indices = timeseries.subsampleCorrelatedData(y, g)

        # subsample data
        y_sub = y[indices]
        x_sub = x[indices]
        z_mat.append(y_sub)
        x_mat.append(x_sub)

        print("\nLength of original timeseries data: %d" % len(y))
        print("\nLength of subsampled timeseries data: %d" % len(y_sub))

    return x_mat, z_mat
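# Sketch: subsample() on two correlated columns; num_cols is passed explicitly so
# the find_num_cols helper is not needed for this demonstration.
import numpy as np

rng = np.random.RandomState(4)
x = np.arange(1000, dtype=float)
y = np.zeros((1000, 2))
for t in range(1, 1000):
    y[t] = 0.8 * y[t - 1] + rng.randn(2)  # AR(1) noise in each column
x_mat, z_mat = subsample(x, y, num_cols=2)
print(len(z_mat[0]), len(z_mat[1]))  # uncorrelated sample count per column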
def get_decorrelated_samples(replica_positions, replica_energies, temperature_list): """ Given a set of replica exchange trajectories, energies, and associated temperatures, this function returns decorrelated samples, as obtained from pymbar with timeseries.subsampleCorrelatedData. :param replica_positions: Positions array for the replica exchange data for which we will write PDB files :type replica_positions: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ( np.array( [n_replicas,cgmodel.num_beads,3] ), simtk.unit ) :param replica_energies: List of dimension num_replicas X simulation_steps, which gives the energies for all replicas at all simulation steps :type replica_energies: List( List( float * simtk.unit.energy for simulation_steps ) for num_replicas ) :param temperature_list: List of temperatures for the simulation data. :type temperature_list: List( float * simtk.unit.temperature ) :returns: - configurations ( List( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ (n_decorrelated_samples,cgmodel.num_beads,3), simtk.unit ) ) - A list of decorrelated samples - energies ( List( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ) ) - The energies for the decorrelated samples (configurations) """ all_poses = [] all_energies = [] for replica_index in range(len(replica_positions)): energies = replica_energies[replica_index][replica_index] [t0, g, Neff_max] = timeseries.detectEquilibration(energies) energies_equil = energies[t0:] poses_equil = replica_positions[replica_index][t0:] indices = timeseries.subsampleCorrelatedData(energies_equil) for index in indices: all_energies.append(energies_equil[index]) all_poses.append(poses_equil[index]) all_energies = np.array([float(energy) for energy in all_energies]) return (all_poses, all_energies)
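# Sketch: get_decorrelated_samples on synthetic replica data (2 replicas, 100
# frames, 5 beads). Units are omitted here for brevity; in real use the positions
# and energies carry simtk.unit quantities as documented above. Assumes `numpy as np`
# and `from pymbar import timeseries` at module level.
import numpy as np

rng = np.random.RandomState(5)
replica_positions = rng.randn(2, 100, 5, 3)
replica_energies = rng.randn(2, 2, 100)  # [replica][replica] holds each replica's energy trace
temperature_list = [300.0, 320.0]
poses, energies = get_decorrelated_samples(replica_positions, replica_energies, temperature_list)
print(len(poses), energies.shape)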
def avg_density(dcd_file, lastframe_file, outdata):
    trj = md.load(dcd_file, top=lastframe_file)
    volume = trj.unitcell_lengths.prod(1)
    mass = sum([a.element.mass for a in trj.top.atoms]) / 6.0221413e23
    density_nounit = mass / volume
    density = density_nounit * u.gram / u.nanometer**3
    A_t = np.array(density_nounit)
    indices = ts.subsampleCorrelatedData(A_t)
    ind_density = density[indices]
    avg_ind_density = ind_density.mean().in_units_of(u.gram / u.liter)
    std_ind_density = ind_density.std().in_units_of(u.gram / u.liter)
    N = len(indices)
    stderr_ind_density = std_ind_density / (N**0.5)
    temps = []
    fid = open(outdata, 'r')
    next(fid)  # skip the header line
    for line in fid:
        dtemp = float(line.split(',')[1])
        temps.append(dtemp)
    fid.close()
    temps = np.array(temps)
    avg_temp = temps.mean()
    density_file = 'density_' + dcd_file[:-4] + '_indstd.dat'
    f = open(density_file, 'w')
    f.write("Average density of the system:\n")
    f.write(str(avg_ind_density))
    f.write("\nStandard Deviation of density:\n")
    f.write(str(std_ind_density))
    f.write("\nStandard Error of the density:\n")
    f.write(str(stderr_ind_density))
    f.write("\nAverage Temperature of the system:\n")
    f.write(str(avg_temp))
    f.close()
def subsample_gradients(self):
    r''' Method to subsample gradients and get a better estimate. '''
    if self.percentage == 100 and not self.subsample:
        warnings.warn(
            "You are not subsampling your data according to the statistical inefficiency nor are "
            "you discarding initial data. Please set percentage to another value than 100!")
    percentage_removal = (self._N_k * (1 - self.percentage / 100.0)).astype('int32')
    self._subsampled_N_k_gradients = self._N_k - percentage_removal
    N_max = int(numpy.max(self._subsampled_N_k_gradients))
    self._subsampled_grad_kn = numpy.zeros(shape=(self._N_k.shape[0], N_max))
    for p in range(percentage_removal.shape[0]):
        start = percentage_removal[p]
        finish = percentage_removal[p] + N_max
        self._subsampled_grad_kn[p, :] = self._gradients_kn[p, start:finish]
    if N_max <= 50:
        warnings.warn(
            "You have reduced your data to less than 50 samples, the results from these might not "
            "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")
    # If subsampling is by percentage only, we are done here; otherwise we now subsample according to the timeseries analysis.
    if self.subsample:
        print("#Subsampling gradients according to statistical inefficiency")
        # First we compute the statistical inefficiency
        self._gradients_kn = self._subsampled_grad_kn.copy()
        self._N_k = self._subsampled_N_k_gradients.copy()
        g_k = numpy.zeros(shape=(self._gradients_kn.shape[0]))
        self._subsampled_N_k_gradients = numpy.zeros(shape=(self._gradients_kn.shape[0]))
        for i in range(g_k.shape[0]):
            g_k[i] = timeseries.statisticalInefficiency(self._gradients_kn[i, :])
        g = int(numpy.max(g_k))
        # Now we need to figure out which indices in the data to keep for subsampling
        indices_k = []
        for i in range(g_k.shape[0]):
            indices_k.append(timeseries.subsampleCorrelatedData(self._gradients_kn[i, :], g=g))
            self._subsampled_N_k_gradients[i] = len(indices_k[i])
        N_max = int(numpy.max(self._subsampled_N_k_gradients))
        if N_max <= 50:
            warnings.warn(
                "You have reduced your data to less than 50 samples, the results from these might not "
                "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")
        self._subsampled_grad_kn = numpy.zeros([self._gradients_kn.shape[0], N_max], numpy.float64)
        for k in range(self._gradients_kn.shape[0]):
            # pad with zeros up to N_max; the per-state counts live in _subsampled_N_k_gradients
            self._subsampled_grad_kn[k, 0:len(indices_k[k])] = self._gradients_kn[k, indices_k[k]]
def subsample(enthalpies):
    """
    Subsamples the enthalpies using John Chodera's code.
    This is probably better than the simple cutoff we normally use.
    Returns the indices of the uncorrelated samples, relative to the start of
    the equilibrated region (the input array is not modified).
    """
    # Use automatic equilibration detection and pymbar.timeseries to subsample
    [t0, g, Neff_max] = timeseries.detectEquilibration(enthalpies)
    enthalpies = enthalpies[t0:]
    return timeseries.subsampleCorrelatedData(enthalpies, g=g)
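# Sketch: the subsample() helper above on a synthetic enthalpy trace with an
# equilibration transient. The returned indices are relative to the start of the
# equilibrated region, not to the start of the input array.
import numpy as np

rng = np.random.RandomState(6)
enthalpies = np.concatenate([np.linspace(-50.0, -100.0, 300), np.full(700, -100.0)]) + rng.randn(1000)
idx = subsample(enthalpies)
print(len(idx))  # number of uncorrelated, equilibrated samples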
def calc_statistics(_data):
    t0, g, Neff = timeseries.detectEquilibration(_data)
    data_equil = _data[t0:]
    indices_subsampled = timeseries.subsampleCorrelatedData(data_equil, g=g)
    sub_data = data_equil[indices_subsampled]

    avg = sub_data.mean()
    std = sub_data.std()
    err = std / np.sqrt(len(indices_subsampled))

    summary = [avg, std, err, t0, g, Neff]
    return summary
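# Sketch: calc_statistics on a noisy synthetic observable; the summary packs
# [avg, std, err, t0, g, Neff] in that order.
import numpy as np

rng = np.random.RandomState(7)
obs = 10.0 + rng.randn(2000)
avg, std, err, t0, g, Neff = calc_statistics(obs)
print("%.3f +/- %.3f (g = %.1f)" % (avg, err, g))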
def _construct_decorrelation_mask(self, sim_collection, rep, skip): enes = sim_collection.reps_energies[rep] ops = sim_collection.reps_order_params[rep] steps = enes.steps rpots = utility.calc_reduced_potentials(enes, ops, sim_collection.conditions) start_i, g, Neff = timeseries.detectEquilibration(rpots, nskip=skip) template = '{:<8} {:<8} {:<3} {:<4.1f} {:<.1f}' print(template.format(sim_collection.conditions.fileformat, steps, start_i, g, Neff)) indices = (timeseries.subsampleCorrelatedData(rpots[start_i:], g=skip*g)) return [i + start_i for i in indices]
def decorrelate(traj, facs=None, verbose=False, name=None): traj = np.array(traj) if traj.ndim == 1: idx = timeseries.subsampleCorrelatedData(traj) n0 = traj.size n1 = len(idx) res = traj[idx] elif facs is not None: # The cleanest way to decorrelate multi-dimensional trajectories would probably # be a sort of "parallel-decorrelation", taking frames in a way that both trajectories # are independently decorrelated. pymbar does not offer this functionality, so for # now, here's a work-around: We'll decorrelate such that # traj_sum = facs[0]*traj[0, :] + facs[1]*traj[1, :] + ... # is decorrelated. # Use case: # traj_sum = 1.0 * U + P * V traj_sum = np.zeros(traj.shape[1]) for n, f in enumerate(facs): traj_sum += f * traj[n] idx = timeseries.subsampleCorrelatedData(traj_sum) n0 = traj.shape[1] n1 = len(idx) res = traj[:, idx] else: raise NotImplementedError('trajectory.decorrelate() is not implemented for ' 'trajectories with more than 1 dimension.') if verbose: n = n0 - n1 if not name: name = 'Trajectory' if n == 0: print('{:s} decorrelation: No frames discarded for decorrelation.'.format(name)) elif n == 1: print('{:s} decorrelation: 1 frame ({:.1%} of ' 'trajectory) discarded for decorrelation.'.format(name, 1/n0)) else: print('{:s} decorrelation: {:d} frames ({:.1%} of ' 'trajectory) discarded for decorrelation.'.format(name, n, n/n0)) return res
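# Sketch: decorrelating U and V jointly via the facs work-around documented in
# the function above, so that U + P*V is the decorrelated quantity (synthetic
# data; P = 0.06 is an arbitrary illustration value).
import numpy as np

rng = np.random.RandomState(8)
U = np.zeros(1000)
V = np.zeros(1000)
for t in range(1, 1000):
    U[t] = 0.9 * U[t - 1] + rng.randn()
    V[t] = 0.9 * V[t - 1] + rng.randn()
res = decorrelate([U, V], facs=[1.0, 0.06], verbose=True, name='U,V')
print(res.shape)  # (2, number of retained frames)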
def compute_timeseries(reduced_potentials: np.array) -> list:
    """
    Use pymbar timeseries to compute the uncorrelated samples in an array of reduced potentials.
    Returns [t0, g, Neff_max, A_t, full_uncorrelated_indices], where A_t holds the
    uncorrelated reduced potentials and full_uncorrelated_indices the uncorrelated sample indices.
    """
    from pymbar import timeseries
    t0, g, Neff_max = timeseries.detectEquilibration(reduced_potentials)  # compute indices of uncorrelated timeseries
    A_t_equil = reduced_potentials[t0:]
    uncorrelated_indices = timeseries.subsampleCorrelatedData(A_t_equil, g=g)
    A_t = A_t_equil[uncorrelated_indices]
    full_uncorrelated_indices = [i + t0 for i in uncorrelated_indices]

    return [t0, g, Neff_max, A_t, full_uncorrelated_indices]
def gather_dg(self, u_kln, nstates): # Subsample data to extract uncorrelated equilibrium timeseries N_k = np.zeros([nstates], np.int32) # number of uncorrelated samples for k in range(nstates): [_, g, __] = timeseries.detectEquilibration(u_kln[k, k, :]) indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g) N_k[k] = len(indices) u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T # Compute free energy differences and statistical uncertainties mbar = MBAR(u_kln, N_k) [DeltaF_ij, dDeltaF_ij, _] = mbar.getFreeEnergyDifferences() print("Number of uncorrelated samples per state: {}".format(N_k)) return DeltaF_ij, dDeltaF_ij
def get_stats(data):
    """
    later, can generalize, to use one column for decorrelating and getting reference indices
    """
    [t0, g, Neff] = timeseries.detectEquilibration(data)
    data_equil = data[t0:]
    indices = timeseries.subsampleCorrelatedData(data_equil, g=g)
    sub_data = data_equil[indices]

    avg = sub_data.mean()
    std = sub_data.std()
    err = std / np.sqrt(len(indices))

    return avg, std, err, t0, g, Neff, sub_data
def getNkandUkln():
    #    u_kln = u_klt
    #    N_k = [maxn]*K
    #    return (N_k, u_kln)
    """Identifies uncorrelated samples and updates the arrays of the reduced potential energy and dhdlt retaining data entries of these samples only."""
    u_kln = np.zeros([K, K, maxn], np.float64)  # u_kln[k,m,n] is the reduced potential energy of uncorrelated sample index n from state k evaluated at state m
    N_k = np.zeros(K, int)   # N_k[k] is the number of uncorrelated samples from state k
    g = np.zeros(K, float)   # autocorrelation times for the data
    print("Number of correlated and uncorrelated samples:\n\n%8s %10s %12s %12s" % ('Lambda', 'N', 'N_k', 'N/N_k'))
    for k in range(K):
        if k == 0:
            g[k] = timeseries.statisticalInefficiency(u_klt[k, k+1, :])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k, k+1, :]))  # indices of uncorrelated samples
        else:
            g[k] = timeseries.statisticalInefficiency(u_klt[k, k-1, :])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k, k-1, :]))
        N = len(indices)  # number of uncorrelated samples
        N_k[k] = N        # Store the number of uncorrelated samples from state k.
        for l in range(K):
            u_kln[k, l, 0:N] = u_klt[k, l, indices]
        print("%6.2f %12s %12s %12.2f" % (l_list[k], maxn, N_k[k], g[k]))
    print('')
    return (N_k, u_kln)
def subsample_energies(self):
    r''' This subsamples u_kln according to percentage, i.e. it removes initial
    equilibration data, and can then additionally subsample according to the
    timeseries analysis.
    '''
    # removing percent
    if self.percentage == 100 and not self.subsample:
        warnings.warn("You are not subsampling your data according to the statistical inefficiency nor are "
                      "you discarding initial data. Please set percentage to another value than 100!")

    percentage_removal = (self._N_k*(1-self.percentage/100.0)).astype('int32')
    self._subsampled_N_k_energies = self._N_k-percentage_removal
    N_max = int(numpy.max(self._subsampled_N_k_energies))
    self._subsampled_u_kln = numpy.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max))
    self._subsampled_energies_kn = numpy.zeros(shape=(self._N_k.shape[0], N_max))
    for k in range(0, self._N_k.shape[0]):
        self._subsampled_u_kln[k] = self._u_kln[k,:,percentage_removal[k]:percentage_removal[k]+N_max]
        self._subsampled_energies_kn[k] = self._energies_kn[k,percentage_removal[k]:percentage_removal[k]+N_max]
    if N_max <= 50:
        warnings.warn("You have reduced your data to less than 50 samples, the results from these might not "
                      "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")

    # Now we do some additional subsampling according to the timeseries analysis
    if self.subsample:
        print("#Subsampling energies according to statistical inefficiency for pymbar")
        self._u_kln = self._subsampled_u_kln.copy()
        self._N_k = self._subsampled_N_k_energies.copy()
        self._energies_kn = self._subsampled_energies_kn.copy()

        # first we compute the statistical inefficiency; the percentage-based
        # truncation has already been applied above, so use the full copies here
        g_k = numpy.zeros(shape=(self._energies_kn.shape[0]))
        for i in range(g_k.shape[0]):
            g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i, :])
        g = numpy.max(g_k)

        # now we need to figure out what the indices in the data are for subsampling
        indices_k = []
        self._subsampled_N_k_energies = numpy.zeros(shape=(self._energies_kn.shape[0]))
        for i in range(g_k.shape[0]):
            indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i, :], g=g))
            self._subsampled_N_k_energies[i] = len(indices_k[i])
        #self._subsampled_N_k_energies = (numpy.ceil(self._N_k / g)).astype(int)

        N_max = int(numpy.max(self._subsampled_N_k_energies))
        if N_max <= 50:
            warnings.warn("You have reduced your data to less than 50 samples, the results from these might not "
                          "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")
        # size by the energy data, not the gradients, so this method stands alone
        self._subsampled_u_kln = numpy.zeros([self._energies_kn.shape[0], self._energies_kn.shape[0], N_max], numpy.float64)
        for k in range(self._energies_kn.shape[0]):
            # pad with zeros up to N_max; the per-state counts live in _subsampled_N_k_energies
            self._subsampled_u_kln[k, :, 0:len(indices_k[k])] = self._u_kln[k, :, indices_k[k]].transpose()
def read_concentration(files, discard=10, fast=False):
    """
    Calculate the mean concentration and standard error from a number of
    simulations, where each simulation has a fixed chemical potential.
    Timeseries analysis is used to determine equilibrium properties.

    Parameters
    ----------
    files: list of str
        the path to each results file that will be analysed.
    discard: int
        the initial amount of data to throw away
    fast: bool
        whether to perform the fast variant of the time series analysis
    """
    concentration = np.zeros(len(files))
    standard_error = np.zeros(len(files))
    delta_mu = np.zeros(len(files))
    lower = np.zeros(len(files))
    upper = np.zeros(len(files))
    for i in range(len(files)):
        ncfile = Dataset(files[i], 'r')
        volume = ncfile.groups['Sample state data']['volume'][:]
        #ncations = ncfile.groups['Sample state data']['species counts'][:, 1]
        nsalt = np.min(ncfile.groups['Sample state data']['species counts'][:, 1:2], axis=1)
        delta_mu[i] = ncfile.groups['Control parameters']['delta_chem'][0]
        ncfile.close()
        # Get the concentration in Molarity
        c = 1.0 * nsalt / volume * 1.66054
        # Estimate the mean and standard error with timeseries analysis
        t_equil, stat_ineff, n_eff = timeseries.detectEquilibration(c[discard:], fast=fast)
        #mu, sigma, num_batches, conf_width = misc_tools.batch_estimate_2(c[(discard + t_equil):], stat_ineff)
        #print("{0} batches for {1}".format(num_batches, files[i]))
        c_equil = c[(discard + t_equil):]
        concentration[i] = np.mean(c_equil)
        independent_inds = timeseries.subsampleCorrelatedData(c_equil, g=stat_ineff, conservative=True)
        mu_samps = misc_tools.bootstrap_estimates(c_equil[independent_inds])
        lower[i] = np.percentile(mu_samps, 2.5)
        upper[i] = np.percentile(mu_samps, 97.5)
        standard_error[i] = mu_samps.std()
    return concentration, standard_error, delta_mu, lower, upper
def decorrelate(traj, verbose=False, name=None): traj = np.array(traj) if traj.ndim == 1: idx = timeseries.subsampleCorrelatedData(traj) n0 = traj.size n1 = len(idx) res = traj[idx] elif traj.ndim == 2: # pymbar doesn't offer to decorrelate two samples, so let's do it ourselves # and just use the decorrelation of the sample more strongly correlated # # calculate (maximal) inefficiency g1 = timeseries.statisticalInefficiency(traj[0]) g2 = timeseries.statisticalInefficiency(traj[1]) g = np.max([g1, g2]) # calculate index n0 = traj.shape[1] idx = np.unique( np.array(np.round(np.arange(0, int(n0 / g + .5)) * g), dtype=int)) idx = idx[idx < n0] n1 = len(idx) res = traj[:, idx] else: raise NotImplementedError( 'trajectory.decorrelate() is not implemented for ' 'trajectories with more than 1 dimension.') if verbose: n = n0 - n1 if not name: name = 'Trajectory' if n == 0: print('{:s} decorrelation: No frames discarded for decorrelation.'. format(name)) elif n == 1: print('{:s} decorrelation: 1 frame ({:.1%} of ' 'trajectory) discarded for decorrelation.'.format( name, 1 / n0)) else: print('{:s} decorrelation: {:d} frames ({:.1%} of ' 'trajectory) discarded for decorrelation.'.format( name, n, n / n0)) return res
def equil_sample(
    data, threshold_fraction=0.0, threshold_neff=1, conservative=True
):
    """Returns a statistically independent subset of an array of data.

    Parameters
    ----------
    data : numpy.typing.Arraylike
        1-D time dependent data to check for equilibration.
    threshold_fraction : float, optional, default=0.0
        Fraction of data expected to be equilibrated.
    threshold_neff : int, optional, default=1
        Minimum number of effectively uncorrelated samples to consider the
        data 'equilibrated'.
    conservative : bool, default=True
        if set to True, uniformly-spaced indices are chosen with interval
        ceil(g), where g is the statistical inefficiency. Otherwise,
        indices are chosen non-uniformly with interval of approximately g
        in order to end up with approximately T/g total indices

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, int, int)
    """
    is_equil, prod_start, ineff, Neff = is_equilibrated(
        data, threshold_fraction, threshold_neff
    )
    if is_equil:
        uncorr_indices = timeseries.subsampleCorrelatedData(
            data[prod_start:], g=ineff, conservative=conservative
        )
        uncorr_sample = data[prod_start:][uncorr_indices]
        return (uncorr_sample, uncorr_indices, prod_start, Neff)

    else:
        raise ValueError(
            "Property does not have requisite threshold of production data "
            "expected. More production data is needed, or the threshold needs "
            "to be lowered. See is_equilibrated for more information."
        )
def subsample_energies(self):
    import warnings  # needed so the warnings below are actually emitted

    if self.subsample_method != 'timeseries':
        print("We are only eliminating samples from the beginning of the data and are still working with highly"
              " correlated data!")
        if self.percentage == 100:
            warnings.warn("You are not subsampling your data according to the statistical inefficiency nor are "
                          "you discarding initial data. Please set percentage to another value than 100!",
                          RuntimeWarning)

        # cast to int so the values below can be used as array indices
        percentage_removal = (self._N_k*(1-self.percentage/100.0)).astype(int)
        self._subsampled_N_k_energies = self._N_k-percentage_removal
        N_max = int(np.max(self._subsampled_N_k_energies))
        self._subsampled_u_kln = np.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max))
        for i in range(percentage_removal.shape[0]):
            for j in range(percentage_removal.shape[0]):
                # slice the target to the per-state sample count so states with unequal N_k still fit
                self._subsampled_u_kln[i, j, 0:self._subsampled_N_k_energies[j]] = self._u_kln[i, j, percentage_removal[j]:]
        if N_max <= 100:
            warnings.warn("You have reduced your data to less than 100 samples, the results from these might not "
                          "be trustworthy. ", RuntimeWarning)
    else:
        print("We are doing a timeseries analysis using the timeseries analysis module in pymbar and will subsample"
              " according to that.")
        # first we compute the statistical inefficiency
        g_k = np.zeros(shape=(self._energies_kn.shape[0]))
        for i in range(g_k.shape[0]):
            g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i, :])
        g = np.max(g_k)
        # now we need to figure out what the indices in the data are for subsampling
        indices_k = []
        self._subsampled_N_k_energies = np.zeros(shape=(self._energies_kn.shape[0]))
        for i in range(g_k.shape[0]):
            indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i, :], g=g))
            self._subsampled_N_k_energies[i] = len(indices_k[i])
        #self._subsampled_N_k_energies = (np.ceil(self._N_k / g)).astype(int)
        N_max = int(np.max(self._subsampled_N_k_energies))
        if N_max <= 100:
            warnings.warn("You have reduced your data to less than 100 samples, the results from these might not "
                          "be trustworthy. ", RuntimeWarning)
        self._subsampled_u_kln = np.zeros([self._energies_kn.shape[0], self._energies_kn.shape[0], N_max], np.float64)
        for k in range(self._energies_kn.shape[0]):
            # pad with zeros up to N_max; the per-state counts live in _subsampled_N_k_energies
            self._subsampled_u_kln[k, :, 0:len(indices_k[k])] = self._u_kln[k, :, indices_k[k]].transpose()
def individual_analysis_procedure(temperature):
    ###
    #
    # This subroutine analyzes a timeseries for 'temperature',
    # and generates a set of decorrelated sample energies and distances,
    # which are used in later sampling to generate a free energy surface.
    #
    ###
    # Skip the analysis only when we are reusing existing data and the output already exists.
    uncorrelated_distances_file = str(output_dir + str(temperature) + "/uncorrelated_distances.dat")
    if not (search_for_existing_data and os.path.exists(uncorrelated_distances_file)):
        output_obj = open(str(output_dir + str(temperature) + "/sim_data.dat"), 'r')
        # E_total_all_temp temporarily stores the total energies from NaCl simulation output
        E_total_all_temp = np.array([l.split(',')[3] for l in output_obj.readlines()])
        output_obj.close()
        # Read in the distances
        distances = util.get_distances(str(output_dir + str(temperature) + "/coordinates.pdb"), simulation_steps)
        # E_total_all stores total energies from NaCl simulation output, after re-typing
        E_total_all = np.array(np.delete(E_total_all_temp, 0, 0), dtype=float)
        # Identify the equilibration time (t0) and statistical inefficiency (g)
        [t0, g, Neff_max] = timeseries.detectEquilibration(E_total_all, nskip=nskip)
        # Using the index for the equilibration time (t0), truncate the time-series data before this index
        E_total_equil = E_total_all[t0:]
        # Determine indices of uncorrelated samples
        uncorrelated_energy_indices = timeseries.subsampleCorrelatedData(E_total_equil, g=g)
        # Write uncorrelated total energies to file
        np.savetxt(str(output_dir + str(temperature) + '/uncorrelated_total_energies.dat'),
                   E_total_equil[uncorrelated_energy_indices])
        # Write uncorrelated Na-Cl distances to file
        np.savetxt(str(output_dir + str(temperature) + '/uncorrelated_distances.dat'),
                   distances[uncorrelated_energy_indices])
    return
def gather_dg(self, u_kln, nstates): u_kln = np.vstack(u_kln) # Subsample data to extract uncorrelated equilibrium timeseries N_k = np.zeros([nstates], np.int32) # number of uncorrelated samples for k in range(nstates): [_, g, __] = timeseries.detectEquilibration(u_kln[k, k, :]) indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g) N_k[k] = len(indices) u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T # Compute free energy differences and statistical uncertainties mbar = MBAR(u_kln, N_k) [DeltaF_ij, dDeltaF_ij, _] = mbar.getFreeEnergyDifferences() logger.debug( "Number of uncorrelated samples per state: {}".format(N_k)) logger.debug("Relative free energy change for {0} = {1} +- {2}".format( self.name, DeltaF_ij[0, nstates - 1] * self.kTtokcal, dDeltaF_ij[0, nstates - 1] * self.kTtokcal)) return DeltaF_ij[0, nstates - 1] * self.kTtokcal, dDeltaF_ij[0, nstates - 1] * self.kTtokcal
def subsampling(self, integratedACF=True): """ Performs inline subsampling based on the statistical inefficiency ``g`` of the specified attribute `acfun` of :class:`sample`, aiming at obtaining a sample of :term:`IID` configurations. Subsampling is done via jumps of varying sizes around ``g``, so that the sample size decays by a factor of approximately ``1/g``. Parameters ---------- integratedACF : bool, optional, default=True If true, the integrated :term:`ACF` method :cite:`Chodera_2007` will be used for computing the statistical inefficiency. Otherwise, the :term:`OBM` method will be used instead. Returns ------- :class:`sample` Although the subsampling is done inline, the new sample is returned for chaining purposes. """ n = len(self.dataset) if mics.verbose: info("\n=== Subsampling via %s ===" % ("integrated ACF" if integratedACF else "OBM")) info("Original sample size:", n) if integratedACF: y = multimap([self.acfun.lambdify()], self.dataset) g = timeseries.statisticalInefficiency(y[0]) else: g = n / self.neff new = timeseries.subsampleCorrelatedData(self.dataset.index, g) self.dataset = self.dataset.reindex(new) self.neff = len(new) if mics.verbose: info("Statistical inefficiency:", g) info("New sample size:", self.neff) return self
def equilibrate_and_subsample(self, u_kln_replica, u_kln, u_n, ndiscard=0, nuse=None):
    """Equilibrate, truncate, and subsample uncorrelated samples.

    Parameters
    ----------
    ndiscard : int, optional, default=0
        number of iterations to discard to equilibration
    nuse : int, optional, default=None
        maximum number of iterations to use (after discarding)

    Returns
    -------

    """
    logger.info("Discarding initial data as equilibration (ndiscard = %d)" % ndiscard)
    u_kln_replica = u_kln_replica[:, :, ndiscard:]
    u_kln = u_kln[:, :, ndiscard:]
    u_n = u_n[ndiscard:]

    if nuse is not None:
        logger.info("Truncating to the specified number of conformations to use (nuse = %d)" % nuse)
        u_kln_replica = u_kln_replica[:, :, 0:nuse]
        u_kln = u_kln[:, :, 0:nuse]
        u_n = u_n[0:nuse]

    logger.info("Subsample data to obtain uncorrelated samples")
    N_k = np.zeros(self.n_states, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n)  # indices of uncorrelated samples
    N = len(indices)  # number of uncorrelated samples
    N_k[:] = N
    u_kln[:, :, 0:N] = u_kln[:, :, indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")
    return u_kln_replica, u_kln, u_n, N_k, N
def estimate_free_energies(ncfile, ndiscard=0, nuse=None, g=1.0, replicas=None):
    """Estimate free energies of all alchemical states.

    ARGUMENTS
      ncfile (NetCDF) - input YANK netcdf file

    OPTIONAL ARGUMENTS
      ndiscard (int) - number of iterations to discard to equilibration (default: 0)
      nuse (int) - maximum number of iterations to use (after discarding) (default: None)
      g (float) - statistical inefficiency to use for subsampling (default: 1.0)
      replicas (list of int) - if specified, only use these replicas for estimating the free energies (default: None)

    TODO: Automatically determine 'ndiscard'.
    """
    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    energies = ncfile.variables['energies']
    u_kln_replica = numpy.zeros([nstates, nstates, niterations], numpy.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]

    # Extract states.
    states_kn_replica = numpy.zeros([nstates, niterations], numpy.int32)
    for n in range(niterations):
        states_kn_replica[:,n] = ncfile.variables['states'][n,:]

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    states_kn_replica = states_kn_replica[:,ndiscard:]

    # If specified, truncate to number of specified conformations to use.
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        states_kn_replica = states_kn_replica[:,0:nuse]

    # Subsample data to obtain uncorrelated samples
    A_n = u_kln_replica[0,0,:]
    indices = timeseries.subsampleCorrelatedData(A_n, g=g)  # indices of uncorrelated samples
    N = len(indices)  # number of uncorrelated samples
    u_kln_replica[:,:,0:N] = u_kln_replica[:,:,indices]
    states_kn_replica[:,0:N] = states_kn_replica[:,indices]

    # Deconvolute replicas to obtain energies by state.
    u_kln = numpy.zeros([nstates, nstates, N], numpy.float64)
    if replicas is None:
        # Use all replicas.
        N_k = N * numpy.ones(nstates, numpy.int32)
        for n in range(N):
            state_indices = states_kn_replica[:,n]
            u_kln[state_indices,:,n] = u_kln_replica[:,:,n]
    else:
        # Use only specified replicas.
        N_k = numpy.zeros(nstates, numpy.int32)
        for n in range(N):
            state_indices = ncfile.variables['states'][n,:]
            for replica in replicas:
                state_index = states_kn_replica[replica,n]
                u_kln[state_index,:,N_k[state_index]] = u_kln_replica[replica,:,n]
                N_k[state_index] += 1

    #===================================================================================================
    # Estimate free energy difference with MBAR.
    #===================================================================================================

    # Initialize MBAR (computing free energy estimates, which may take a while)
    mbar = MBAR(u_kln, N_k, verbose=False, maximum_iterations=50000)  # use slow self-consistent-iteration (the default)

    # Get matrix of dimensionless free energy differences and uncertainty estimate.
    (Deltaf_ij, dDeltaf_ij) = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')

    # Return free energy differences and an estimate of the covariance.
    return (Deltaf_ij, dDeltaf_ij)
if len(fep_columns) > 0:
    for i in range(len(fep_columns)):
        reduced_fep_data.append(numpy.zeros([K, N_samples], numpy.float64))

for k in range(K):
    # Extract timeseries.
    A_t = biasing_variable_kt[0][k, :]
    # Compute statistical inefficiency.
    try:
        g = timeseries.statisticalInefficiency(A_t)
    except Exception as e:
        print(str(e))
        print(A_t)
    # Subsample data.
    if subsample_trajectories:
        indices = timeseries.subsampleCorrelatedData(A_t, g=g)
    else:
        indices = timeseries.subsampleCorrelatedData(A_t, g=1)
    N = len(indices)  # number of uncorrelated samples
    print("k = %5d : g = %.1f, N = %d" % (k, g, N))
    for i in range(nbiases):
        biasing_variable_kn[i][k, 0:N] = biasing_variable_kt[i][k, indices]
    for i in range(nperturbations+1):
        U_kn[i][k, 0:N] = U_kt[i][k, indices]
    if not cluster_binning:
        pmf_variable_kn_1[k, 0:N] = pmf_variable_kt_1[k, indices]
        if ndim == 2:
            pmf_variable_kn_2[k, 0:N] = pmf_variable_kt_2[k, indices]
    if cluster_binning:
        cluster_bin_kn[k, 0:N] = cluster_bin_kt[k, indices]

if len(expectation_columns) > 0:
    infile.close()

    # Parse data.
    n = 0
    for line in lines:
        if line[0] != '#' and line[0] != '@':
            tokens = line.split()
            u_kn[k,n] = beta_k[k] * (float(tokens[2]) - float(tokens[1]))  # reduced potential energy without umbrella restraint
            n += 1

    # Compute correlation times for the potential energy and chi timeseries.
    # If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
    if (DifferentTemperatures):
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k,:], u_kn[k,0:N_k[k]])
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(u_kn[k,0:N_k[k]])
    else:
        chi_radians = chi_kn[k,0:N_k[k]]/(180.0/numpy.pi)
        g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
        g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
        print("g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin))
        g_k[k] = max(g_cos, g_sin)
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])

    # Subsample data.
    N_k[k] = len(indices)
    u_kn[k,0:N_k[k]] = u_kn[k,indices]
    chi_kn[k,0:N_k[k]] = chi_kn[k,indices]

N_max = numpy.max(N_k)  # shorten the array size
u_kln = numpy.zeros([K,K,N_max], numpy.float64)  # u_kln[k,l,n] is the reduced potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
def estimate_free_energies(ncfile, ndiscard=0, nuse=None):
    """Estimate free energies of all alchemical states.

    ARGUMENTS
      ncfile (NetCDF) - input YANK netcdf file

    OPTIONAL ARGUMENTS
      ndiscard (int) - number of iterations to discard to equilibration
      nuse (int) - maximum number of iterations to use (after discarding)

    TODO: Automatically determine 'ndiscard'.
    """
    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))
    #logger.info(u_n)

    # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n)  # indices of uncorrelated samples
    #print(u_n)  # DEBUG
    #indices = range(0, u_n.size)  # DEBUG - assume samples are uncorrelated
    N = len(indices)  # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    #===================================================================================================
    # Estimate free energy difference with MBAR.
    #===================================================================================================

    # Initialize MBAR (computing free energy estimates, which may take a while)
    logger.info("Computing free energy differences...")
    mbar = MBAR(u_kln, N_k, verbose=False, method='self-consistent-iteration', maximum_iterations=50000)  # use slow self-consistent-iteration (the default)
    #mbar = MBAR(u_kln, N_k, verbose=True, method='Newton-Raphson')  # use faster Newton-Raphson solver

    # Get matrix of dimensionless free energy differences and uncertainty estimate.
    logger.info("Computing covariance matrix...")
    (Deltaf_ij, dDeltaf_ij) = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')

    # Matrix of free energy differences
    logger.info("Deltaf_ij:")
    for i in range(nstates):
        for j in range(nstates):
            print("%8.3f" % Deltaf_ij[i, j], end=' ')
        print("")

    # Matrix of uncertainties in free energy difference (expected standard deviations of the estimator about the true free energy)
    logger.info("dDeltaf_ij:")
    for i in range(nstates):
        for j in range(nstates):
            print("%8.3f" % dDeltaf_ij[i, j], end=' ')
        print("")

    # Return free energy differences and an estimate of the covariance.
    return (Deltaf_ij, dDeltaf_ij)
g_k = zeros([K], float64)
for k in range(K):
    # Compute statistical inefficiency for extension timeseries
    g = timeseries.statisticalInefficiency(x_kt[k,0:T_k[k]], x_kt[k,0:T_k[k]])
    # store statistical inefficiency
    g_k[k] = g
    print("timeseries %d : g = %.1f, %.0f uncorrelated samples (of %d total samples)" % (k+1, g, floor(T_k[k] / g), T_k[k]))
    N_max = max(N_max, ceil(T_k[k] / g) + 1)

# Subsample trajectory position data.
x_kn = zeros([K, N_max], float64)
bin_kn = zeros([K, N_max], int32)
N_k = zeros([K], int32)
for k in range(K):
    # Determine indices of uncorrelated samples of the position timeseries.
    indices = timeseries.subsampleCorrelatedData(x_kt[k,0:T_k[k]])
    # Store subsampled positions.
    N_k[k] = len(indices)
    x_kn[k,0:N_k[k]] = x_kt[k,indices]
    bin_kn[k,0:N_k[k]] = bin_kt[k,indices]

# Set arbitrary zeros for external biasing potential.
x0_k = zeros([K], float64)  # x position corresponding to zero of potential
for k in range(K):
    x0_k[k] = x_kn[k,0:N_k[k]].mean()
print("x0_k = ")
print(x0_k)

# Compute bias energies in units of kT.
u_kln = zeros([K,K,N_max], float64)  # u_kln[k,l,n] is the reduced (dimensionless) relative potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
for k in range(K):
def overlap_check(reference_system, positions, platform_name=None, precision=None, nsteps=50, nsamples=200,
                  factory_args=None, cached_trajectory_filename=None):
    """
    Test overlap between reference system and alchemical system by running a short simulation.

    Parameters
    ----------
    reference_system : simtk.openmm.System
        The reference System object to compare with
    positions : simtk.unit.Quantity with units compatible with nanometers
        The positions to assess energetics for.
    platform_name : str, optional, default=None
        The name of the platform to use for benchmarking.
    nsteps : int, optional, default=50
        Number of molecular dynamics steps between samples.
    nsamples : int, optional, default=200
        Number of samples to collect.
    factory_args : dict(), optional, default=None
        Arguments passed to AbsoluteAlchemicalFactory.
    cached_trajectory_filename : str, optional, default=None
        If specified, attempt to cache (or reuse) trajectory.
    """
    # Create a fully-interacting alchemical state.
    factory = AbsoluteAlchemicalFactory(reference_system, **factory_args)
    alchemical_state = AlchemicalState()
    alchemical_system = factory.createPerturbedSystem(alchemical_state)

    temperature = 300.0 * unit.kelvin
    collision_rate = 5.0 / unit.picoseconds
    timestep = 2.0 * unit.femtoseconds
    kT = (kB * temperature)

    # Select platform.
    platform = None
    if platform_name:
        platform = openmm.Platform.getPlatformByName(platform_name)

    # Create integrators.
    reference_integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    alchemical_integrator = openmm.VerletIntegrator(timestep)

    # Create contexts.
    if platform:
        reference_context = openmm.Context(reference_system, reference_integrator, platform)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator, platform)
    else:
        reference_context = openmm.Context(reference_system, reference_integrator)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator)

    ncfile = None
    if cached_trajectory_filename:
        cache_mode = 'write'

        # Try reading from cache
        from netCDF4 import Dataset
        if os.path.exists(cached_trajectory_filename):
            try:
                ncfile = Dataset(cached_trajectory_filename, 'r')
                if (ncfile.variables['positions'].shape == (nsamples, reference_system.getNumParticles(), 3)):
                    # Read the cache if everything matches
                    cache_mode = 'read'
            except:
                pass

        if cache_mode == 'write':
            # If anything went wrong, create a new cache.
            try:
                (pathname, filename) = os.path.split(cached_trajectory_filename)
                if not os.path.exists(pathname):
                    os.makedirs(pathname)
                ncfile = Dataset(cached_trajectory_filename, 'w', format='NETCDF4')
                ncfile.createDimension('samples', 0)
                ncfile.createDimension('atoms', reference_system.getNumParticles())
                ncfile.createDimension('spatial', 3)
                ncfile.createVariable('positions', 'f4', ('samples', 'atoms', 'spatial'))
            except Exception as e:
                logger.info(str(e))
                logger.info('Could not create a trajectory cache (%s).' % cached_trajectory_filename)
                ncfile = None

    # Collect simulation data.
    reference_context.setPositions(positions)
    du_n = np.zeros([nsamples], np.float64)  # du_n[n] is the energy difference (alchemical - reference) of sample n, in units of kT
    print()
    import click
    with click.progressbar(range(nsamples)) as bar:
        for sample in bar:
            if cached_trajectory_filename and (cache_mode == 'read'):
                # Load cached frames.
                positions = unit.Quantity(ncfile.variables['positions'][sample,:,:], unit.nanometers)
                reference_context.setPositions(positions)
            else:
                # Run dynamics.
                reference_integrator.step(nsteps)

            # Get reference energies.
reference_state = reference_context.getState(getEnergy=True, getPositions=True) reference_potential = reference_state.getPotentialEnergy() if np.isnan(reference_potential/kT): raise Exception("Reference potential is NaN") # Get alchemical energies. alchemical_context.setPositions(reference_state.getPositions(asNumpy=True)) alchemical_state = alchemical_context.getState(getEnergy=True) alchemical_potential = alchemical_state.getPotentialEnergy() if np.isnan(alchemical_potential/kT): raise Exception("Alchemical potential is NaN") du_n[sample] = (alchemical_potential - reference_potential) / kT if cached_trajectory_filename and (cache_mode == 'write') and (ncfile is not None): ncfile.variables['positions'][sample,:,:] = reference_state.getPositions(asNumpy=True) / unit.nanometers # Clean up. del reference_context, alchemical_context if cached_trajectory_filename and (ncfile is not None): ncfile.close() # Discard data to equilibration and subsample. from pymbar import timeseries [t0, g, Neff] = timeseries.detectEquilibration(du_n) indices = timeseries.subsampleCorrelatedData(du_n, g=g) du_n = du_n[indices] # Compute statistics. from pymbar import EXP [DeltaF, dDeltaF] = EXP(du_n) # Raise an exception if the error is larger than 3kT. MAX_DEVIATION = 3.0 # kT if (dDeltaF > MAX_DEVIATION): report = "DeltaF = %12.3f +- %12.3f kT (%5d samples, g = %6.1f)" % (DeltaF, dDeltaF, Neff, g) raise Exception(report) return
#------------------------------------------------------------------------
# Read Data From File
#------------------------------------------------------------------------
print("")
print("Preparing data:")
T_from_file = read_simulation_temps(simulation, NumTemps)
E_from_file = read_total_energies(simulation, TE_COL_NUM)
K = len(T_from_file)
N_k = numpy.zeros(K, numpy.int32)
g = numpy.zeros(K, numpy.float64)

for k in range(K):  # subsample the energies
    g[k] = timeseries.statisticalInefficiency(E_from_file[k])
    indices = numpy.array(timeseries.subsampleCorrelatedData(E_from_file[k], g=g[k]))  # indices of uncorrelated samples
    N_k[k] = len(indices)  # number of uncorrelated samples
    E_from_file[k, 0:N_k[k]] = E_from_file[k, indices]

#------------------------------------------------------------------------
# Insert Intermediate T's and corresponding blank U's and E's
#------------------------------------------------------------------------
Temp_k = T_from_file
minT = T_from_file[0]
maxT = T_from_file[len(T_from_file) - 1]
# beta = 1/(kB*T)  <=>  T = 1/(kB*beta)
if dertype == 'temperature':
    minv = minT
    maxv = maxT
elif dertype == 'beta':
    # actually going in the opposite direction as beta for logistical reasons
def main():
    usage = """
        usage: %prog [options] <metadata file>
    """
    parser = optparse.OptionParser(usage)
    parser.add_option("-o", "--outfile", dest="output_file", default='mbar_pmf.out', help="Output file for PMF [default: %default]")
    parser.add_option("-t", "--temperature", dest="temperature", default=300., type="float", help="Initial temperature in K [default: %default K]")
    parser.add_option("-b", "--bins", dest="bins", default=50, type="int", help="Number of bins for 1D PMF [default: %default]")
    parser.add_option("-d", "--double", dest="double_k", default=False, action='store_true', help="Double the k values [default: %default]")
    parser.add_option("-c", "--kcal", dest="kcal_k", default=False, action='store_true', help="Convert k values from kcal to kJ [default: %default]")
    parser.add_option("-s", "--skip-subsampling", dest="skip_subsampling", default=False, action='store_true', help="Skip data subsampling [default: %default]")
    parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true", help="Verbose output from PyMBAR [default: %default]")
    (options, args) = parser.parse_args()

    if len(args) < 1:
        parser.error('No metadata file passed')
    elif not os.path.exists(args[0]):
        parser.error('Metadata file not found')

    metadata = []  # stores metadata per umbrella
    N_max = 0  # the max number of snapshots per umbrella
    different_temperatures = False  # flag to know if we are reading in energies for the snapshots

    # open the wham metadata file
    print("Opening metadata file %s" % args[0])
    f = open(args[0], 'r')
    metadata_lines = f.readlines()
    f.close()

    # first get all the metadata and count the max number of snapshots per umbrella
    for line in metadata_lines:
        # skip comments
        if line.startswith('#'):
            continue
        # split lines based on spaces, but convert tabs to spaces first
        clean_split = list(filter(None, line.strip().expandtabs().split(' ')))
        if not os.path.exists(clean_split[0]):
            print("Data file %s doesn't exist, skipping this replica" % clean_split[0])
            continue
        else:
            # get the number of snapshots for the replica
            nsnapshots = file_len(clean_split[0])
            # /path/to/timeseries/file loc_win_min spring [correl time] [temperature]
            k = float(clean_split[2])
            if options.double_k:
                k = k*2.0
            if options.kcal_k:
                k = k*4.184
            current_meta = {'path': clean_split[0], 'coord': float(clean_split[1]), 'k': k, 'n': nsnapshots}
            # K_k[k] = float(tokens[1]) * (numpy.pi/180)**2  # spring constant (read in kJ/mol/rad**2, converted to kJ/mol/deg**2)
            if len(clean_split) >= 4:
                # TODO: temperature the 4th or 5th value???
                # temperature might be the 4th value...
                current_meta['t'] = float(clean_split[3])
                different_temperatures = True
            metadata.append(current_meta)

    N_max = numpy.max([w['n'] for w in metadata])
    print("Max number of snapshots %d" % N_max)

    # now allocate the memory for the arrays
    K = len(metadata)
    T_k = numpy.ones(K, float)*options.temperature  # initial temperatures are all equal
    beta_k = 1.0/(kB*T_k)  # beta factor for the different temperatures
    data = numpy.zeros([K, N_max], numpy.float64)  # the snapshot data
    u_kn = numpy.zeros([K, N_max], numpy.float64)  # u_kn[k,n] is the reduced potential energy without umbrella restraints of snapshot n of umbrella simulation k
    u_kln = numpy.zeros([K, K, N_max], numpy.float64)  # u_kln[k,l,n] is the reduced potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
    g_k = numpy.zeros([K], numpy.float32)  # correlation time
    data_min = []  # will set the min and max data values later
    data_max = []

    # Now loop through each datafile and extract the data
    for i, w in enumerate(metadata):
        print("Reading %s..." % w['path'])
        f = open(w['path'], 'r')
        lines = f.readlines()
        f.close()

        clean_split_lines = [list(filter(None, line.strip().expandtabs().split(' '))) for line in lines if not line.startswith('#')]

        if different_temperatures:
            raise Exception('Different temperatures aren\'t supported yet')
            # if different temperatures are specified in the metadata file,
            # then we need the energies to compute the PMF, found in the third column
            # for j,l in enumerate(clean_split_lines):
            #     data[i,j] = float(l[1])  # second column is the coordinate
            #     # third column will be the system's potential energy
            #     potential_energy = float(l[2])
            #     dchi = w['coord']-float(l[1])
            #     restraint_potential = k_multiplier*w['k']*(dchi**2)
            #     # TODO: given the coordinate and the restraining potential, calculate the umbrella restraint
            #     u_kn[i,j] = beta_k[i] * (potential_energy-restraint_potential)  # reduced potential energy without umbrella restraint
            #
            # # Compute correlation times for potential energy and timeseries.
            # # If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
            # g_k[i] = timeseries.statisticalInefficiency(u_kn[i,:], u_kn[i,:])
            # indices = timeseries.subsampleCorrelatedData(u_kn[i,:])
        else:
            # no temperature column
            for j, l in enumerate(clean_split_lines):
                data[i, j] = float(l[1])

            dataset = numpy.cos(data[i, :w['n']])
            g_k[i] = timeseries.statisticalInefficiency(dataset, dataset)
            if not options.skip_subsampling:
                indices = timeseries.subsampleCorrelatedData(dataset)

        if options.skip_subsampling:
            data_max.append(numpy.max(data[i]))
            data_min.append(numpy.min(data[i]))
            w['n'] = len(data[i])
            u_kn[i, 0:w['n']] = u_kn[i]
            data[i, 0:w['n']] = data[i]
        else:
            # get min and max for data, used for binning ranges
            data_max.append(numpy.max(data[i, indices]))
            data_min.append(numpy.min(data[i, indices]))
            # Subsample the data
            w['n'] = len(indices)
            u_kn[i, 0:w['n']] = u_kn[i, indices]
            data[i, 0:w['n']] = data[i, indices]

        print("Correlation time for set %5d is %10.3f" % (i, g_k[i]))

    print("Finished reading data files")

    # Set zero of u_kn -- this is arbitrary.
    u_kn -= u_kn.min()

    # Construct torsion bins
    print("Binning data...")
    data_min = numpy.min(data_min)
    data_max = numpy.max(data_max)
    delta = (data_max - data_min) / float(options.bins)

    print("Min coord: %f" % data_min)
    print("Max coord: %f" % data_max)
    print("Delta for binning %f" % delta)

    # compute bin centers
    bin_center_i = numpy.zeros([options.bins], numpy.float64)
    for i in range(options.bins):
        bin_center_i[i] = data_min + delta/2 + delta * i

    # Bin data
    bin_kn = numpy.zeros([K, N_max], numpy.int32)-1
    # for each window
    for k in range(K):
        # for 0 to the number of snapshots in the window k
        for n in range(metadata[k]['n']):
            # Compute bin assignment.
            bin_kn[k, n] = int((data[k, n] - data_min) / delta)
            for l in range(K):
                # Compute minimum-image torsion deviation from umbrella center l
                dchi = data[k, n] - metadata[l]['coord']
                # Compute energy of snapshot n from simulation k in umbrella potential l
                u_kln[k, l, n] = u_kn[k, n] + beta_k[k]*metadata[l]['k']*(dchi**2)

    for i in range(options.bins):
        if numpy.sum(bin_kn == i) == 0:
            for j in range(options.bins):
                print("Bin: %d" % j)
                print(numpy.sum(bin_kn == j))
            raise Exception("At least one bin has no samples. Adjust bin sizes or eliminate empty bins to ensure at least one sample per bin.")

    # Initialize MBAR.
    print("Running MBAR...")
    N_k = numpy.array([w['n'] for w in metadata], numpy.int32)
    mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose, initialize='BAR')
    #mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose)
    #mbar = pymbar.MBAR(u_kln, N_k, verbose=True, method='Newton-Raphson')

    # Compute PMF in unbiased potential (in units of kT).
    (f_i, df_i) = mbar.computePMF(u_kn, bin_kn, options.bins)

    # Write out PMF and save to file
    print("Saving PMF to file: %s" % options.output_file)
    f = open(options.output_file, 'w')
    print("PMF (in units of kT)")
    print("%8s %8s %8s" % ('bin', 'f', 'df'))
    f.write("#Coor Free +/-\n")
    for i in range(options.bins):
        print("%8.1f %8.3f %8.3f" % (bin_center_i[i], f_i[i], df_i[i]))
        f.write("%8.1f %8.3f %8.3f\n" % (bin_center_i[i], f_i[i], df_i[i]))
    f.close()
#thisDistDat = np.loadtxt('%s/prod_pullx.xvg' % adir)
thisEDat = np.loadtxt('%s/prod_out.txt' % adir)
thisDistDat = np.loadtxt('%s/prod_restraint.txt' % adir)

#allRef[i] = thisDistDat[0, 2]
allRef[i] = thisDistDat[0, 1]

# Need to adjust so that energies have the same frequency as the distances.
# Should also have the restraint energies in the file so they can be subtracted.
#thisEnergy = thisEDat[::2,2] - thisEDat[::2,1]
#thisDist = thisDistDat[:,1]
thisEnergy = thisEDat[:, 2] - thisDistDat[:, 3]
thisDist = thisDistDat[:, 2]

# Only take uncorrelated samples...
uncorrinds = timeseries.subsampleCorrelatedData(thisEnergy)
#uncorrinds = np.arange(len(thisEnergy))
numSamples[i] = len(uncorrinds)
print "For %s, have %i independent samples." % (adir, len(uncorrinds))

allU = np.hstack((allU, thisEnergy[uncorrinds]))
allDist = np.hstack((allDist, thisDist[uncorrinds]))

# Plot histogram with uncorrelated indices
thisHist, thisBins = np.histogram(thisDist[uncorrinds], bins='auto', density=False)
binCents = 0.5 * (thisBins[:-1] + thisBins[1:])
histAx.plot(binCents, thisHist)
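# The "only take uncorrelated samples" step above is the core pattern used throughout
# these scripts. A self-contained sketch on synthetic data; decorrelate_window is an
# illustrative name, not from the script above.
import numpy as np
from pymbar import timeseries

def decorrelate_window(series):
    """Estimate g, subsample, and return (n_kept, subsampled_series, g)."""
    g = timeseries.statisticalInefficiency(series)
    indices = timeseries.subsampleCorrelatedData(series, g=g)
    return len(indices), series[indices], g

# Boxcar-smoothed white noise is strongly correlated (g >> 1), so only a small
# fraction of the frames survives subsampling.
rng = np.random.RandomState(0)
series = np.convolve(rng.randn(5000), np.ones(50) / 50.0, mode='valid')
n_sub, series_sub, g = decorrelate_window(series)
print("g = %.1f, kept %d of %d samples" % (g, n_sub, len(series)))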
def overlap_check(reference_system, positions, receptor_atoms, ligand_atoms, platform_name=None,
                  annihilate_electrostatics=True, annihilate_sterics=False, precision=None,
                  nsteps=50, nsamples=200):
    """
    Test overlap between reference system and alchemical system by running a short simulation.

    Parameters
    ----------
    reference_system : simtk.openmm.System
       The reference System object to compare with
    positions : simtk.unit.Quantity with units compatible with nanometers
       The positions to assess energetics for.
    receptor_atoms : list of int
       The list of receptor atoms.
    ligand_atoms : list of int
       The list of ligand atoms to alchemically modify.
    platform_name : str, optional, default=None
       The name of the platform to use for benchmarking.
    annihilate_electrostatics : bool, optional, default=True
       If True, electrostatics will be annihilated; if False, decoupled.
    annihilate_sterics : bool, optional, default=False
       If True, sterics will be annihilated; if False, decoupled.
    precision : str, optional, default=None
       Platform precision model to use, if any.
    nsteps : int, optional, default=50
       Number of molecular dynamics steps between samples.
    nsamples : int, optional, default=200
       Number of samples to collect.

    """

    # Create a fully-interacting alchemical state.
    factory = AbsoluteAlchemicalFactory(reference_system, ligand_atoms=ligand_atoms)
    alchemical_state = AlchemicalState()
    alchemical_system = factory.createPerturbedSystem(alchemical_state)

    temperature = 300.0 * units.kelvin
    collision_rate = 5.0 / units.picoseconds
    timestep = 2.0 * units.femtoseconds
    kT = (kB * temperature)

    # Select platform.
    platform = None
    if platform_name:
        platform = openmm.Platform.getPlatformByName(platform_name)

    # Create integrators.
    reference_integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    alchemical_integrator = openmm.VerletIntegrator(timestep)

    # Create contexts.
    if platform:
        reference_context = openmm.Context(reference_system, reference_integrator, platform)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator, platform)
    else:
        reference_context = openmm.Context(reference_system, reference_integrator)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator)

    # Collect simulation data.
    reference_context.setPositions(positions)
    du_n = np.zeros([nsamples], np.float64)  # du_n[n] is the reduced potential energy difference of sample n
    for sample in range(nsamples):
        # Run dynamics.
        reference_integrator.step(nsteps)

        # Get reference energies.
        reference_state = reference_context.getState(getEnergy=True, getPositions=True)
        reference_potential = reference_state.getPotentialEnergy()

        # Get alchemical energies.
        alchemical_context.setPositions(reference_state.getPositions())
        alchemical_state = alchemical_context.getState(getEnergy=True)
        alchemical_potential = alchemical_state.getPotentialEnergy()

        du_n[sample] = (alchemical_potential - reference_potential) / kT

    # Clean up.
    del reference_context, alchemical_context

    # Discard data to equilibration and subsample.
    from pymbar import timeseries
    [t0, g, Neff] = timeseries.detectEquilibration(du_n)
    indices = timeseries.subsampleCorrelatedData(du_n, g=g)
    du_n = du_n[indices]

    # Compute statistics.
    from pymbar import EXP
    [DeltaF, dDeltaF] = EXP(du_n)

    # Raise an exception if the error is larger than 3 kT.
    MAX_DEVIATION = 3.0  # kT
    if (dDeltaF > MAX_DEVIATION):
        report = "DeltaF = %12.3f +- %12.3f kT (%5d samples, g = %6.1f)" % (DeltaF, dDeltaF, Neff, g)
        raise Exception(report)

    return
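# The equilibration-detection / subsampling / EXP sequence at the end of overlap_check
# can be factored into a small helper. This is a sketch using the same pymbar calls as
# above (detectEquilibration, subsampleCorrelatedData, EXP); the helper name is
# illustrative, not part of the original code.
import numpy as np
from pymbar import EXP, timeseries

def exp_free_energy(du_n):
    """Discard the equilibration transient, decorrelate, and apply EXP.

    du_n : 1D array of reduced potential energy differences (in kT).
    Returns (DeltaF, dDeltaF, Neff), with the free energies in kT.
    """
    t0, g, Neff = timeseries.detectEquilibration(du_n)
    du_equil = du_n[t0:]
    indices = timeseries.subsampleCorrelatedData(du_equil, g=g)
    DeltaF, dDeltaF = EXP(du_equil[indices])
    return DeltaF, dDeltaF, Neff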
def extract_ncfile_energies(ncfile, ndiscard=0, nuse=None, g=None):
    """
    Extract and decorrelate energies from the ncfile to gather common data for other functions

    Parameters
    ----------
    ncfile : NetCDF
       Input YANK netcdf file
    ndiscard : int, optional, default=0
       Number of iterations to discard to equilibration
    nuse : int, optional, default=None
       Maximum number of iterations to use (after discarding)
    g : int, optional, default=None
       Statistical inefficiency to use if desired; if None, it will be computed.

    TODO
    ----
    * Automatically determine 'ndiscard'.

    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to the number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n, g=g)  # indices of uncorrelated samples
    #print(u_n)  # DEBUG
    #indices = range(0,u_n.size)  # DEBUG - assume samples are uncorrelated
    N = len(indices)  # number of uncorrelated samples
    N_k[:] = N
    u_kln = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    # Check for the expanded cutoff states, and subsample as needed
    try:
        u_ln_full_raw = ncfile.variables['fully_interacting_expanded_cutoff_energies'][:].T  # stored as [n,l]; we need [l,n]
        u_ln_non_raw = ncfile.variables['noninteracting_expanded_cutoff_energies'][:].T
        fully_interacting_u_ln = np.zeros(u_ln_full_raw.shape)
        noninteracting_u_ln = np.zeros(u_ln_non_raw.shape)

        # Deconvolute the fully interacting state
        for iteration in range(niterations):
            state_indices = ncfile.variables['states'][iteration,:]
            fully_interacting_u_ln[state_indices,iteration] = u_ln_full_raw[:,iteration]
            noninteracting_u_ln[state_indices,iteration] = u_ln_non_raw[:,iteration]

        # Discard non-equilibrated samples
        fully_interacting_u_ln = fully_interacting_u_ln[:,ndiscard:]
        fully_interacting_u_ln = fully_interacting_u_ln[:,indices]
        noninteracting_u_ln = noninteracting_u_ln[:,ndiscard:]
        noninteracting_u_ln = noninteracting_u_ln[:,indices]

        # Augment u_kln to accept the new states
        u_kln_new = np.zeros([nstates + 2, nstates + 2, N], np.float64)
        N_k_new = np.zeros(nstates + 2, np.int32)

        # Insert energies
        u_kln_new[1:-1,0,:] = fully_interacting_u_ln
        u_kln_new[1:-1,-1,:] = noninteracting_u_ln

        # Fill in other energies
        u_kln_new[1:-1,1:-1,:] = u_kln
        N_k_new[1:-1] = N_k

        # Notify users
        logger.info("Found expanded cutoff states in the energies!")
        logger.info("Free energies will be reported relative to them instead!")

        # Reset values as the last step, in case something went wrong above, so we don't overwrite u_kln by accident
        u_kln = u_kln_new
        N_k = N_k_new
    except:
        pass

    return u_kln, N_k, u_n
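# Sketch of the state-augmentation pattern above in isolation: given an existing
# u_kln of shape (K, K, N) plus per-sample energies re-evaluated at two extra
# endpoint states, build the (K+2, K+2, N) matrix with the new states as the first
# and last columns. Purely illustrative numpy; the function name is hypothetical
# and no file I/O is involved.
import numpy as np

def augment_with_endpoints(u_kln, N_k, u_ln_first, u_ln_last):
    K, _, N = u_kln.shape
    u_kln_new = np.zeros([K + 2, K + 2, N], np.float64)
    N_k_new = np.zeros(K + 2, np.int32)
    u_kln_new[1:-1, 0, :] = u_ln_first   # energies re-evaluated at the new first state
    u_kln_new[1:-1, -1, :] = u_ln_last   # energies re-evaluated at the new last state
    u_kln_new[1:-1, 1:-1, :] = u_kln     # original K x K block
    N_k_new[1:-1] = N_k                  # no samples were drawn from the new states
    return u_kln_new, N_k_new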
out.write("Molecule, Log D +/-, HPD95%[low, high]\n") debug.write("Molecule mean - median = difference") used_samples.write("Molecule, equilibration, N samples") # curdir = os.getcwd() # os.makedirs("plots", ) # os.chdir("plots") for mol in sorted(list(x.logd.keys())): print("Processing {}".format(mol)) # sns.plt.figure() trace = numpy.asarray(mc.trace("LogD_{}".format(mol))[:]) # Burn in and thinning estimated using pymbar burnin = detectEquilibration(trace)[0] trace= trace[burnin:] uncorrelated_indices = subsampleCorrelatedData(trace) trace=trace[uncorrelated_indices] median = pymc.utils.quantiles(trace)[50] mean = numpy.mean(trace) lower, upper = pymc.utils.hpd(trace, 0.05) lower_s = to_precision(lower,2) # string of number with 2 sig digits upper_s = to_precision(upper,2) logd = ufloat(mean, numpy.std(trace)) # Formats the mean and error by the correct amount of significant digits out.write("{0}, {1:.1u}, [{2}, {3}]\n".format(mol, logd, lower_s, upper_s )) debug.write("{}: {} - {} = {}".format(mol, mean, median, mean-median)) used_samples.write("{}, {}, {}".format(mol, burnin, len(uncorrelated_indices))) # pymc.Matplot.plot(trace, "LogD_{}".format(mol)) # sns.plt.figure()
def SimulateAlchemy(path, niter, nsteps_per_iter, nlambda):
    """Calculate the binding free energy of a ligand named 'UNL' using alchemy.

    One step corresponds to two femtoseconds.
    """
    prmtop = app.AmberPrmtopFile(f'{path}/com.prmtop')
    inpcrd = app.AmberInpcrdFile(f'{path}/com.inpcrd')
    system = prmtop.createSystem(implicitSolvent=app.GBn2,
                                 nonbondedMethod=app.CutoffNonPeriodic,
                                 nonbondedCutoff=1.0 * unit.nanometers,
                                 constraints=app.HBonds,
                                 rigidWater=True,
                                 ewaldErrorTolerance=0.0005)

    # Detect ligand indices
    ligand_ind = []
    for atm in prmtop.topology.atoms():
        # OpenEye makes the ligand name 'UNL'
        if atm.residue.name == 'UNL':
            ligand_ind.append(atm.index)
    ligand_ind = set(ligand_ind)
    AddAlchemyForces(system, ligand_ind)

    integrator = mm.LangevinIntegrator(300 * unit.kelvin, 1.0 / unit.picoseconds, 2.0 * unit.femtoseconds)
    integrator.setConstraintTolerance(0.00001)

    # TODO: The issues here are the same as in the mmgbsa.py script.
    # TODO: This should just recognize whatever the computer is capable of, not force CUDA.
    # TODO: I am not sure if mixed precision is necessary. Just need to be consistent.
    platform = mm.Platform.getPlatformByName('CUDA')
    properties = {'CudaPrecision': 'mixed'}

    simulation = app.Simulation(prmtop.topology, system, integrator, platform)
    simulation.context.setPositions(inpcrd.positions)
    simulation.minimizeEnergy()

    ### Now simulate the system
    import numpy as np
    from pymbar import MBAR, timeseries

    lambdas = np.linspace(1.0, 0.0, nlambda)
    # Save the potential energies for MBAR
    u_kln = np.zeros([nlambda, nlambda, niter])
    kT = unit.AVOGADRO_CONSTANT_NA * unit.BOLTZMANN_CONSTANT_kB * integrator.getTemperature()
    # TODO: This runs in series. Someone comfortable with MPI should help parallelize this.
    for k in range(nlambda):
        for i in range(niter):
            print('state %5d iteration %5d / %5d' % (k, i, niter))
            simulation.context.setParameter('lambda', lambdas[k])
            integrator.step(nsteps_per_iter)
            for l in range(nlambda):
                simulation.context.setParameter('lambda', lambdas[l])
                u_kln[k, l, i] = simulation.context.getState(getEnergy=True).getPotentialEnergy() / kT

    # Subsample to retain only uncorrelated samples
    N_k = np.zeros([nlambda], np.int32)  # number of uncorrelated samples
    for k in range(nlambda):
        [t0, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        # TODO: maybe should use 't0:' instead of ':' in the third index
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T

    # Calculate the energy difference
    # TODO: I've never worked with pymbar beyond the timeseries functions. I'm not sure how the
    # error in DeltaF is calculated, and I don't know what Theta is right now.
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()
    return DeltaF_ij[0][-1], dDeltaF_ij[0][-1]
# Estimate the statistical inefficiency of the simulation by analyzing the timeseries of interest.
# We use the cos and sin of the phi and psi timeseries because they are periodic angles;
# the maximum of the resulting statistical inefficiencies is used below as a conservative correlation time.
print "Computing statistical inefficiencies..."
g_cosphi = timeseries.statisticalInefficiencyMultiple(numpy.cos(phi_kt_replica * numpy.pi / 180.0))
print "g_cos(phi) = %.1f" % g_cosphi
g_sinphi = timeseries.statisticalInefficiencyMultiple(numpy.sin(phi_kt_replica * numpy.pi / 180.0))
print "g_sin(phi) = %.1f" % g_sinphi
g_cospsi = timeseries.statisticalInefficiencyMultiple(numpy.cos(psi_kt_replica * numpy.pi / 180.0))
print "g_cos(psi) = %.1f" % g_cospsi
g_sinpsi = timeseries.statisticalInefficiencyMultiple(numpy.sin(psi_kt_replica * numpy.pi / 180.0))
print "g_sin(psi) = %.1f" % g_sinpsi

# Subsample data with the maximum of all correlation times.
print "Subsampling data..."
g = numpy.max(numpy.array([g_cosphi, g_sinphi, g_cospsi, g_sinpsi]))
indices = timeseries.subsampleCorrelatedData(U_kt[k,:], g = g)
print "Using g = %.1f to obtain %d uncorrelated samples per temperature" % (g, len(indices))
N_max = int(numpy.ceil(T / g))  # max number of samples per temperature
U_kn = numpy.zeros([K, N_max], numpy.float64)
phi_kn = numpy.zeros([K, N_max], numpy.float64)
psi_kn = numpy.zeros([K, N_max], numpy.float64)
N_k = N_max * numpy.ones([K], numpy.int32)
for k in range(K):
    U_kn[k,:] = U_kt[k,indices]
    phi_kn[k,:] = phi_kt[k,indices]
    psi_kn[k,:] = psi_kt[k,indices]
print "%d uncorrelated samples per temperature" % N_max

#===================================================================================================
# Generate a list of indices of all configurations in kn-indexing
#===================================================================================================
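# A single-timeseries variant of the periodic-angle trick above, as a small helper:
# because the angle wraps at +/-180 degrees, the statistical inefficiency is estimated
# on the cos/sin transforms and the maximum is taken as a conservative correlation
# time. The function name is illustrative; angles are assumed to be in degrees.
import numpy
from pymbar import timeseries

def periodic_statistical_inefficiency(angles_deg):
    radians = angles_deg * numpy.pi / 180.0
    g_cos = timeseries.statisticalInefficiency(numpy.cos(radians))
    g_sin = timeseries.statisticalInefficiency(numpy.sin(radians))
    return max(g_cos, g_sin)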
getEnergy=True).getPotentialEnergy() / (kT)
            print(potential_energy)
            u_kln[k, l, iteration] = potential_energy
            outline += ",%.4f" % potential_energy
        simfile.write(outline)
simfile.close()

print("**************************************************")
print("Estimation of free energy with MBAR ...")
# try an on-the-fly MBAR estimation

# Subsample data to extract uncorrelated equilibrium timeseries
N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
for k in range(nstates):
    [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
    indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
    N_k[k] = len(indices)
    u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T

# Compute free energy differences and statistical uncertainties
mbar = MBAR(u_kln, N_k, verbose=True, method="adaptive", relative_tolerance=1e-10)  #, initialize="BAR")
[DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')

print('DeltaF_ij (kcal/mol):')
print(DeltaF_ij[0, nstates - 1] * 298.0 * 0.001987204)  # kT -> kcal/mol at T = 298 K, kB = 0.001987204 kcal/(mol K)
mbar_fe = DeltaF_ij[0, nstates - 1] * 298.0 * 0.001987204
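# The hard-coded factor above converts reduced free energies (kT) to kcal/mol:
# Delta G [kcal/mol] = Delta f [kT] * kB * T, with kB in kcal/(mol K). A
# named-constant version of the same arithmetic is less error prone; this is a
# sketch, with the temperature assumed to match the 298 K used above.
KB_KCAL_PER_MOL_K = 0.001987204  # Boltzmann constant in kcal/(mol K)
TEMPERATURE = 298.0              # K; assumed to match the simulation above

def kt_to_kcal_per_mol(delta_f_in_kT, temperature=TEMPERATURE):
    return delta_f_in_kT * KB_KCAL_PER_MOL_K * temperature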
# Subsample trajectories based on Hconf
hconf = zeros([K, N_max], float64)
volume = zeros([K, N_max], float64)
uconf = zeros([K, N_max], float64)
N_k = zeros([K], int32)
if correlated_data == 1:
    hconf = zeros([K, T_max], float64)
    N_ksam = zeros([K], int32)
    indices2 = zeros([T_max], int32)
    for k in range(1, T_max):
        indices2[k] = k
    for k in range(K):
        # Compute correlation times
        if not hconf_original[k,0] == 0:
            indices = timeseries.subsampleCorrelatedData(hconf_original[k,0:T_k[k]])
            # Store subsampled positions
            if len(indices) >= 1000:
                N_ksam[k] = len(indices)
                hconf[k,0:N_ksam[k]] = hconf_original[k,indices]
                volume[k,0:N_ksam[k]] = volume_original[k,indices]
                uconf[k,0:N_ksam[k]] = uconf_original[k,indices]
            else:
                N_ksam[k] = len(indices2)
                hconf[k,0:N_ksam[k]] = hconf_original[k,indices2]
                volume[k,0:N_ksam[k]] = volume_original[k,indices2]
                uconf[k,0:N_ksam[k]] = uconf_original[k,indices2]
            print('\n')
        else:
            N_ksam[k] = len(indices2)
            hconf[k,0:N_ksam[k]] = hconf_original[k,indices2]
        if line[0] != '#' and line[0] != '@':
            tokens = line.split()
            print(tokens)
            u_kn[k, n] = beta_k[k] * (float(tokens[2]) - float(tokens[1]))  # reduced potential energy without umbrella restraint
            n += 1

    # Compute correlation times for potential energy and chi timeseries.
    # If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
    if (DifferentTemperatures):
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k, 0:N_k[k]], u_kn[k, 0:N_k[k]])
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]])
    else:
        chi_radians = chi_kn[k, 0:N_k[k]] / (180.0 / numpy.pi)
        g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
        g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
        print("g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin))
        g_k[k] = max(g_cos, g_sin)
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])

    # Subsample data.
    N_k[k] = len(indices)
    u_kn[k, 0:N_k[k]] = u_kn[k, indices]
    chi_kn[k, 0:N_k[k]] = chi_kn[k, indices]

N_max = numpy.max(N_k)  # shorten the array size
u_kln = numpy.zeros([K, K, N_max], numpy.float64)  # u_kln[k,l,n]: energy of snapshot n from simulation k evaluated at umbrella l
kT = unit.AVOGADRO_CONSTANT_NA * unit.BOLTZMANN_CONSTANT_kB * integrator.getTemperature()
for k in range(nstates):
    for iteration in range(niterations):
        print('state %5d iteration %5d / %5d' % (k, iteration, niterations))
        # Set alchemical state
        context.setParameter('lambda', lambdas[k])
        # Run some dynamics
        integrator.step(nsteps)
        # Compute energies at all alchemical states
        for l in range(nstates):
            context.setParameter('lambda', lambdas[l])
            u_kln[k,l,iteration] = context.getState(getEnergy=True).getPotentialEnergy() / kT

# Estimate free energy of Lennard-Jones particle insertion
from pymbar import MBAR, timeseries

# Subsample data to extract uncorrelated equilibrium timeseries
N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
for k in range(nstates):
    [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k,k,:])
    indices = timeseries.subsampleCorrelatedData(u_kln[k,k,:], g=g)
    N_k[k] = len(indices)
    u_kln[k,:,0:N_k[k]] = u_kln[k,:,indices].T

# Compute free energy differences and statistical uncertainties
mbar = MBAR(u_kln, N_k)
[DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

print('DeltaF_ij (kT):')
print(DeltaF_ij)
print('dDeltaF_ij (kT):')
print(dDeltaF_ij)
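# Why the .T in 'u_kln[k,:,0:N_k[k]] = u_kln[k,:,indices].T' above: mixing a slice
# with an integer-array index makes numpy move the array axis to the front, so
# u_kln[k, :, indices] has shape (len(indices), nstates) rather than
# (nstates, len(indices)). A minimal, self-contained demonstration:
import numpy as np

u = np.arange(24, dtype=np.float64).reshape(2, 3, 4)  # (K, L, N)
indices = [0, 2]
picked = u[0, :, indices]
print(picked.shape)    # (2, 3): the array axis comes first
print(picked.T.shape)  # (3, 2): what the left-hand slice u[0, :, 0:2] expects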
def generate_simulation_data(database, parameters, cid):
    """
    Regenerate simulation data for given parameters.

    ARGUMENTS

    database (dict) - database of molecules
    parameters (dict) - dictionary of GBSA parameters keyed on GBSA atom types

    """

    platform = openmm.Platform.getPlatformByName("Reference")

    from pymbar import timeseries

    entry = database[cid]
    molecule = entry["molecule"]
    iupac_name = entry["iupac"]

    # Retrieve vacuum system.
    vacuum_system = copy.deepcopy(entry["system"])

    # Retrieve OpenMM System.
    solvent_system = copy.deepcopy(entry["system"])

    # Get nonbonded force.
    forces = {
        solvent_system.getForce(index).__class__.__name__: solvent_system.getForce(index)
        for index in range(solvent_system.getNumForces())
    }
    nonbonded_force = forces["NonbondedForce"]

    # Add GBSA term
    gbsa_force = openmm.GBSAOBCForce()
    gbsa_force.setNonbondedMethod(openmm.GBSAOBCForce.NoCutoff)  # set no cutoff
    gbsa_force.setSoluteDielectric(1)
    gbsa_force.setSolventDielectric(78)

    # Build indexable list of atoms.
    atoms = [atom for atom in molecule.GetAtoms()]
    natoms = len(atoms)

    # Assign GBSA parameters.
    for (atom_index, atom) in enumerate(atoms):
        [charge, sigma, epsilon] = nonbonded_force.getParticleParameters(atom_index)
        atomtype = atom.GetStringData("gbsa_type")  # GBSA atomtype
        radius = parameters["%s_%s" % (atomtype, "radius")] * units.angstroms
        scalingFactor = parameters["%s_%s" % (atomtype, "scalingFactor")]
        gbsa_force.addParticle(charge, radius, scalingFactor)

    # Add the force to the system.
    solvent_system.addForce(gbsa_force)

    # Create context for solvent system.
    timestep = 2.0 * units.femtosecond
    collision_rate = 20.0 / units.picoseconds
    temperature = entry["temperature"]
    integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    context = openmm.Context(solvent_system, integrator, platform)

    # Set the coordinates.
    positions = entry["positions"]
    context.setPositions(positions)

    # Minimize.
    openmm.LocalEnergyMinimizer.minimize(context)

    # Simulate, saving periodic snapshots of configurations.
    kT = kB * temperature
    beta = 1.0 / kT

    initial_time = time.time()
    nsteps_per_iteration = 2500
    niterations = 200
    x_n = np.zeros([niterations, natoms, 3], np.float32)  # positions, in nm
    u_n = np.zeros([niterations], np.float64)  # reduced potential energies, in kT
    for iteration in range(niterations):
        integrator.step(nsteps_per_iteration)
        state = context.getState(getEnergy=True, getPositions=True)
        x_n[iteration, :, :] = state.getPositions(asNumpy=True) / units.nanometers
        u_n[iteration] = beta * state.getPotentialEnergy()

    if np.any(np.isnan(u_n)):
        raise Exception("Encountered NaN for molecule %s | %s" % (cid, iupac_name))

    final_time = time.time()
    elapsed_time = final_time - initial_time

    # Clean up.
    del context, integrator

    # Discard initial transient to equilibration.
    [t0, g, Neff_max] = timeseries.detectEquilibration(u_n)
    x_n = x_n[t0:, :, :]
    u_n = u_n[t0:]

    # Subsample to remove correlation.
    indices = timeseries.subsampleCorrelatedData(u_n, g=g)
    x_n = x_n[indices, :, :]
    u_n = u_n[indices]

    # Store data.
    entry["x_n"] = x_n
    entry["u_n"] = u_n

    print "%48s | %64s | simulation %12.3f s | %5d samples discarded | %5d independent samples remain" % (cid, iupac_name, elapsed_time, t0, len(indices))

    return [cid, entry]
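# Sketch of the pattern above (trim the equilibration transient, then apply one set
# of uncorrelated indices to every per-frame array) as a helper, so positions and
# energies cannot drift out of sync. The function name is illustrative, not from the
# code above; both inputs are assumed to be indexed by frame along the first axis.
import numpy as np
from pymbar import timeseries

def decorrelate_frames(u_n, x_n):
    """Equilibrate/decorrelate on u_n; slice u_n and x_n with the same indices."""
    t0, g, Neff_max = timeseries.detectEquilibration(u_n)
    u_equil = u_n[t0:]
    x_equil = x_n[t0:]
    indices = timeseries.subsampleCorrelatedData(u_equil, g=g)
    return u_equil[indices], x_equil[indices], t0, g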
def DoBAR(fwds, revs, label, verbose):
    """
    BAR to combine fwd and rev data of dGs.
    Here, don't multiply dGs_R by -1 since BAR calls for the reverse work value.

    Parameters
    ----------
    fwds: dictionary of forward work values for each window
    revs: dictionary of reverse work values for each window
    label: string label of what it is (only for printing output)
    verbose: bool; if True, print correlation times and per-window BAR results

    Returns
    -------
    dgs: 1D list of accumulated energy values. E.g., if each step was 2, then dgs would be [0, 2, 4, ...]
    gsdlist: 1D list of accompanying stdevs to the dgs list
    """

    fwd_ss = {}  # subsampled version of fwds
    rev_ss = {}  # subsampled version of revs
    dg_bar = np.zeros([len(fwds)], np.float64)   # allocate storage: dG steps
    gsd_bar = np.zeros([len(fwds)], np.float64)  # allocate storage: dG stdev steps
    dgs = np.zeros([len(fwds)], np.float64)      # allocate storage: dG accumulated
    gsdlist = np.zeros([len(fwds)], np.float64)  # allocate storage: dG stdev accumulated
    #corr_time = np.zeros([len(fwds)], np.float64)
    corr_time = {}

    for key, value in fwds.items():  # this notation changes in python3: http://tinyurl.com/j3uq3me
        # compute correlation time
        g = timeseries.statisticalInefficiency(value)
        corr_time[key] = [g]
        # compute indices of UNcorrelated timeseries, then extract those samples
        indices = timeseries.subsampleCorrelatedData(value, g)
        fwd_ss[key] = value[indices]

    for key, value in revs.items():  # this notation changes in python3: http://tinyurl.com/j3uq3me
        # compute correlation time
        g = timeseries.statisticalInefficiency(value)
        corr_time[key].append(g)
        # compute indices of UNcorrelated timeseries, then extract those samples
        indices = timeseries.subsampleCorrelatedData(value, g)
        rev_ss[key] = value[indices]

    bar = {}
    # then apply the BAR estimator to get dG for each step
    for kF, kR in zip(sorted(fwd_ss.keys()), sorted(list(rev_ss.keys()), reverse=True)):
        dg_bar[kF], gsd_bar[kF] = BAR(fwd_ss[kF], rev_ss[kR])
        bar[kF] = [np.sum(dg_bar), dg_bar[kF], gsd_bar[kF]]

    # calculate the net dG standard deviation = sqrt[ sum(s_i^2) ]
    gsd = (np.sum(np.power(gsd_bar, 2)))**0.5

    net = 0.
    netsd = 0.
    for i, g in enumerate(dg_bar):
        # accumulate net dGs into running sums (plot this)
        dgs[i] = dg_bar[i] + net
        net = dgs[i]
        # combine the stdevs: s = sqrt(s1^2 + s2^2 + ...)
        gsdlist[i] = ((gsd_bar[i])**2. + (netsd)**2.)**0.5
        netsd = gsdlist[i]

    if verbose:
        print('\n\n#####---Correlation Times for dG_{}--#####'.format(label))
        print('Window'.rjust(3), 'F'.rjust(5), 'R'.rjust(9))
        for k, v in corr_time.items():
            print("{:3d} {:10.3f} {:10.3f}".format(k, v[0], v[1]))

        print("\n\n#####---BAR estimator for dG_{}---#####".format(label))
        print('Window'.rjust(3), 'dG'.rjust(5), 'ddG'.rjust(11), "Uncert.".rjust(11))
        print("---------------------------------------------------------")
        for k, v in bar.items():
            row = '{:3d} {:10.4f} {:10.4f} +- {:3.4f}'.format(k, v[0], v[1], v[2])
            print(row)

    print(("\nNet dG_{} energy difference = {:.4f} +- {:.4f} kcal/mol".format(label, np.sum(dg_bar), gsd)))
    return dgs, gsdlist
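# The accumulation loop at the end of DoBAR has a compact numpy equivalent: the
# running free energies are a cumulative sum of the per-window dGs, and the running
# uncertainty is the square root of the cumulative sum of the variances. A sketch,
# assuming dg_bar and gsd_bar as computed in DoBAR; the function name is illustrative.
import numpy as np

def accumulate_bar(dg_bar, gsd_bar):
    dgs = np.cumsum(dg_bar)
    gsdlist = np.sqrt(np.cumsum(np.power(gsd_bar, 2)))
    return dgs, gsdlist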
#######################################################################
#        Subsample {U,A}_kn_correlated to be uncorrelated             #
#######################################################################
print "Subsampling to achieve uncorrelated data"
if stat_inefficiency is None:
    print "(1 of 2) Calculating statistical inefficiency (i = ",
    stdout.flush()
    for d in range(N_CVs):
        statnew = timeseries.statisticalInefficiencyMultiple(A_ikn_correlated[d])
        stat_inefficiency = max([stat_inefficiency, statnew])
    print stat_inefficiency, ")"
else:
    print "(1 of 2) Using given statistical inefficiency (i =", str(stat_inefficiency) + ")"

indices = timeseries.subsampleCorrelatedData(U_kn_correlated[0,:], g = stat_inefficiency)
N_uncorrelated_samples = len(indices)
print "(2 of 2) Subsampling to achieve", N_uncorrelated_samples, "samples per replica"

U_kn = zeros([N_replicas+N_output_temps, N_uncorrelated_samples], float32)
A_ikn = zeros([N_CVs, N_replicas+N_output_temps, N_uncorrelated_samples], float32)
for k in range(N_replicas):
    U_kn[k] = U_kn_correlated[k][indices]
    for d in range(N_CVs):
        A_ikn[d][k] = A_ikn_correlated[d][k][indices]
print ""
#######################################################################
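# Sketch of the shared-index subsampling used above: one statistical inefficiency
# (the maximum over all collective variables) yields a single index set, which is
# then applied to every replica so the subsampled arrays stay aligned. Names are
# illustrative; U_kn_correlated is assumed to be a 2D [replica, frame] array and
# A_ikn_correlated a 3D [CV, replica, frame] array, and the extra output-temperature
# rows allocated in the original script are omitted here.
from pymbar import timeseries

def shared_subsample(U_kn_correlated, A_ikn_correlated):
    g = max(timeseries.statisticalInefficiencyMultiple(A_kn)
            for A_kn in A_ikn_correlated)
    indices = timeseries.subsampleCorrelatedData(U_kn_correlated[0, :], g=g)
    U_kn = U_kn_correlated[:, indices]
    A_ikn = A_ikn_correlated[:, :, indices]
    return U_kn, A_ikn, g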
def estimate_free_energies(ncfile, ndiscard=0, nuse=None, g=None):
    """
    Estimate free energies of all alchemical states.

    Parameters
    ----------
    ncfile : NetCDF
       Input YANK netcdf file
    ndiscard : int, optional, default=0
       Number of iterations to discard to equilibration
    nuse : int, optional, default=None
       Maximum number of iterations to use (after discarding)
    g : int, optional, default=None
       Statistical inefficiency to use if desired; if None, it will be computed.

    TODO
    ----
    * Automatically determine 'ndiscard'.

    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))
    #logger.info(u_n)  # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to the number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n, g=g)  # indices of uncorrelated samples
    #print u_n  # DEBUG
    #indices = range(0,u_n.size)  # DEBUG - assume samples are uncorrelated
    N = len(indices)  # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    #===================================================================================================
    # Estimate free energy difference with MBAR.
    #===================================================================================================

    # Initialize MBAR (computing free energy estimates, which may take a while)
    logger.info("Computing free energy differences...")
    mbar = MBAR(u_kln, N_k)

    # Get matrix of dimensionless free energy differences and uncertainty estimate.
logger.info("Computing covariance matrix...") try: # pymbar 2 (Deltaf_ij, dDeltaf_ij) = mbar.getFreeEnergyDifferences() except ValueError: # pymbar 3 (Deltaf_ij, dDeltaf_ij, theta_ij) = mbar.getFreeEnergyDifferences() # # Matrix of free energy differences logger.info("Deltaf_ij:") for i in range(nstates): str_row = "" for j in range(nstates): str_row += "%8.3f" % Deltaf_ij[i, j] logger.info(str_row) # print Deltaf_ij # # Matrix of uncertainties in free energy difference (expectations standard deviations of the estimator about the true free energy) logger.info("dDeltaf_ij:") for i in range(nstates): str_row = "" for j in range(nstates): str_row += "%8.3f" % dDeltaf_ij[i, j] logger.info(str_row) # Return free energy differences and an estimate of the covariance. return (Deltaf_ij, dDeltaf_ij)
def estimate_enthalpies(ncfile, ndiscard=0, nuse=None, g=None):
    """
    Estimate enthalpies of all alchemical states.

    Parameters
    ----------
    ncfile : NetCDF
       Input YANK netcdf file
    ndiscard : int, optional, default=0
       Number of iterations to discard to equilibration
    nuse : int, optional, default=None
       Number of iterations to use (after discarding)
    g : int, optional, default=None
       Statistical inefficiency to use if desired; if None, it will be computed.

    TODO
    ----
    * Automatically determine 'ndiscard'.
    * Combine some functions with estimate_free_energies.

    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))
    #print u_n  # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to the number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n, g=g)  # indices of uncorrelated samples
    #print u_n  # DEBUG
    #indices = range(0,u_n.size)  # DEBUG - assume samples are uncorrelated
    N = len(indices)  # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    # Compute average enthalpies over the uncorrelated samples only; entries beyond
    # index N along the last axis are stale pre-subsampling data.
    H_k = np.zeros([nstates], np.float64)  # H_k[k] is the estimated enthalpy of state k
    dH_k = np.zeros([nstates], np.float64)
    for k in range(nstates):
        H_k[k] = u_kln[k,k,0:N].mean()
        dH_k[k] = u_kln[k,k,0:N].std() / np.sqrt(N)

    return (H_k, dH_k)
def estimate_enthalpies(ncfile, ndiscard = 0, nuse = None):
    """Estimate enthalpies of all alchemical states.

    ARGUMENTS
       ncfile (NetCDF) - input YANK netcdf file

    OPTIONAL ARGUMENTS
       ndiscard (int) - number of iterations to discard to equilibration
       nuse (int) - number of iterations to use (after discarding)

    TODO: Automatically determine 'ndiscard'.
    TODO: Combine some functions with estimate_free_energies.

    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    print "Reading energies..."
    energies = ncfile.variables['energies']
    u_kln_replica = numpy.zeros([nstates, nstates, niterations], numpy.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    print "Done."

    # Deconvolute replicas
    print "Deconvoluting replicas..."
    u_kln = numpy.zeros([nstates, nstates, niterations], numpy.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    print "Done."

    # Compute total negative log probability over all iterations.
    u_n = numpy.zeros([niterations], numpy.float64)
    for iteration in range(niterations):
        u_n[iteration] = numpy.sum(numpy.diagonal(u_kln[:,:,iteration]))
    #print u_n  # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to the number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = numpy.zeros(nstates, numpy.int32)
    indices = timeseries.subsampleCorrelatedData(u_n)  # indices of uncorrelated samples
    #print u_n  # DEBUG
    #indices = range(0,u_n.size)  # DEBUG - assume samples are uncorrelated
    N = len(indices)  # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    print "number of uncorrelated samples:"
    print N_k
    print ""

    # Compute average enthalpies over the uncorrelated samples only; entries beyond
    # index N along the last axis are stale pre-subsampling data.
    H_k = numpy.zeros([nstates], numpy.float64)  # H_k[k] is the estimated enthalpy of state k
    dH_k = numpy.zeros([nstates], numpy.float64)
    for k in range(nstates):
        H_k[k] = u_kln[k,k,0:N].mean()
        dH_k[k] = u_kln[k,k,0:N].std() / numpy.sqrt(N)

    return (H_k, dH_k)
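# Why subsampling matters before taking std()/sqrt(N) as in the enthalpy estimates
# above: for correlated data, the naive standard error understates the true
# uncertainty by roughly sqrt(g). A self-contained demonstration on synthetic
# correlated data (boxcar-smoothed white noise); the g-corrected form divides by the
# effective sample size N/g instead of N.
import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(1)
x = np.convolve(rng.randn(20000), np.ones(20) / 20.0, mode='valid')
g = timeseries.statisticalInefficiency(x)
naive_sem = x.std() / np.sqrt(len(x))
corrected_sem = x.std() / np.sqrt(len(x) / g)
print("g = %.1f, naive SEM = %.4f, g-corrected SEM = %.4f" % (g, naive_sem, corrected_sem))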