Example #1
def subsampletimeseries(timeser, xyzn, N_k):
    """
    Return a subsampled timeseries based on statistical inefficiency calculations.
    Parameters
    ----------
    timeser: the timeseries to be subsampled
    xyzn: the coordinates associated with each frame of the timeseries to be subsampled
    N_k: original # of samples in each timeseries

    Returns
    -------
    ts_sub: the subsampled timeseries
    N_k_sub: new number of samples per timeseries
    xyz_sub: the subsampled configuration series
    ind: the uncorrelated frame indices for each timeseries
    """
    # Local references to the timeseries and coordinates (no copy is made)
    ts = timeser
    xyz = xyzn

    # initialize array of statistical inefficiencies
    g = np.zeros(len(ts), np.float64)

    for i, t in enumerate(ts):
        if np.count_nonzero(t) == 0:
            g[i] = 1.0
            print("WARNING: timeseries %d contains only zeros; using g = 1" % i)
        else:
            g[i] = timeseries.statisticalInefficiency(t)

    # Compute the uncorrelated-sample indices once and reuse them
    ind = [timeseries.subsampleCorrelatedData(t, g=b) for t, b in zip(ts, g)]
    N_k_sub = np.array([len(i) for i in ind])

    if (N_k_sub == N_k).all():
        ts_sub = ts
        xyz_sub = xyz
        print "No sub-sampling occurred"
    else:
        print "Sub-sampling..."
        ts_sub = np.array([
            t[timeseries.subsampleCorrelatedData(t, g=b)]
            for t, b in zip(ts, g)
        ])
        #for c in xyz:
        #    xyz_sub = [c[timeseries.subsampleCorrelatedData(t,g=b)] for t,b in zip(ts,g)]
        for i, j in enumerate(xyz):
            xyz_sub = [j[ii] for ii in ind[i]]

    return ts_sub, N_k_sub, xyz_sub, ind
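
A minimal usage sketch for the function above (an assumed example, not from the source), with two synthetic correlated timeseries and matching coordinate arrays:

import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(0)
ts_list = [np.cumsum(rng.normal(size=100)) for _ in range(2)]   # correlated series
xyz_list = [rng.normal(size=(100, 5, 3)) for _ in range(2)]     # one frame per sample
N_k = np.array([100, 100])

ts_sub, N_k_sub, xyz_sub, ind = subsampletimeseries(ts_list, xyz_list, N_k)
print("Uncorrelated samples per series:", N_k_sub)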
Example #2
def subsample_data_along_axis(data, subsample_rate, axis):
    """
    Generate a decorrelated version of a given input data and subsample_rate along a single axis.

    Parameters
    ----------
    data : array-like of any dimensionality
    subsample_rate : float or int
        Rate at which to draw samples. A sample is considered decorrelated after
        every ceil(subsample_rate) indices along the specified axis of data.
    axis : int
        axis along which to apply the subsampling

    Returns
    -------
    subsampled_data : ndarray of same number of dimensions as data
        Data will be subsampled along the given axis

    """
    # TODO: find a name for the function that clarifies that decorrelation
    # TODO:             is determined exclusively by subsample_rate?
    cast_data = np.asarray(data)
    data_shape = cast_data.shape
    # Since we already have g, we can just pass any appropriate shape to the subsample function
    indices = timeseries.subsampleCorrelatedData(np.zeros(data_shape[axis]),
                                                 g=subsample_rate)
    subsampled_data = np.take(cast_data, indices, axis=axis)
    return subsampled_data
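
A short usage sketch (assumed, not from the source): thinning a 2D array along its time axis at a fixed rate. With g = 2.5, roughly len/g of the frames are kept.

import numpy as np
from pymbar import timeseries

data = np.arange(40).reshape(10, 4)   # 10 "frames" of 4 observables each
thinned = subsample_data_along_axis(data, subsample_rate=2.5, axis=0)
print(thinned.shape)                  # about 10/2.5 = 4 frames retained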
Example #3
def compute_timeseries(reduced_potentials):
    """
    Use pymbar timeseries to compute the uncorrelated samples in an array of reduced potentials.  Returns the uncorrelated sample indices.

    Arguments
    ---------
    reduced_potentials : np.array of floats
        reduced potentials from which a timeseries is to be extracted

    Returns
    -------
    t0 : int
        production region index
    g : float
        statistical inefficiency
    Neff_max : int
        effective number of samples in production region
    A_t : np.array of floats
        uncorrelated reduced potentials from the production region
    full_uncorrelated_indices : list of ints
        uncorrelated indices

    """
    from pymbar import timeseries
    t0, g, Neff_max = timeseries.detectEquilibration(
        reduced_potentials)  #computing indices of uncorrelated timeseries
    A_t_equil = reduced_potentials[t0:]
    uncorrelated_indices = timeseries.subsampleCorrelatedData(A_t_equil, g=g)
    A_t = A_t_equil[uncorrelated_indices]
    full_uncorrelated_indices = [i + t0 for i in uncorrelated_indices]

    return [t0, g, Neff_max, A_t, full_uncorrelated_indices]
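
A brief usage sketch (assumed): synthetic reduced potentials with an initial relaxation, passed through the function above.

import numpy as np

rng = np.random.RandomState(1)
u_n = np.concatenate([np.linspace(5.0, 0.0, 200), rng.normal(size=800)])

t0, g, Neff_max, A_t, idx = compute_timeseries(u_n)
print("production starts at frame", t0, "with", len(idx), "uncorrelated samples")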
Example #4
def prepWindow(filename, tstart=0, tstop=None):
    """
    Read window .traj file, compute correlation times, subsample data.

    Parameters
    ----------
    filename: string name of the file to process.
       For *.traj file, assumes all lines are data (e.g. no comment lines).
    tstart: integer nanosecond start time
    tstop: integer nanosecond stop time

    Returns
    -------
    counts: int, number of entries for this particular window
    winZ: numpy array containing SUBSAMPLED data for this window from tstart to tstop

    """
    # Parse data.
    n, z_sub = parseWindow(filename, tstart, tstop)

    # Compute correlation times for z (actual spring center position) timeseries.
    g = timeseries.statisticalInefficiency(z_sub)
    print "Correlation time for %s is %10.3f" % (re.split('\W+',
                                                          filename)[1], g)
    indices = timeseries.subsampleCorrelatedData(z_sub, g)

    # Subsample data.
    zsublen = len(indices)
    z_sub = z_sub[indices]
    return zsublen, z_sub
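
prepWindow relies on a parseWindow helper that is not shown. A hypothetical stand-in consistent with the docstring (whitespace-separated .traj data; the time and position columns are assumptions, not from the source):

import numpy as np

def parseWindow(filename, tstart=0, tstop=None):
    # Hypothetical helper: assumes time (ns) in column 0 and the spring
    # center position in column 1; the real column layout may differ.
    data = np.loadtxt(filename)
    t, z = data[:, 0], data[:, 1]
    mask = t >= tstart if tstop is None else (t >= tstart) & (t <= tstop)
    z_window = z[mask]
    return len(z_window), z_window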
Example #5
def calc_df(u_kln):
    """
    u_kln should be (nstates) x (nstates) x (nframes)
    note that u_kln should be normalized by kT already
    where each element is 
        a config from frame `n` of a trajectory conducted with state `k`
        with energy recalculated using parameters of state `l`
    """
    dims = u_kln.shape
    if dims[0] != dims[1]:
        raise ValueError(
            "dimensions {} of u_kln should be square in the first two indices".
            format(dims))
    nstates = dims[0]

    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    # Compute free energy differences and statistical uncertainties
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

    # save data?

    return DeltaF_ij, dDeltaF_ij
Example #6
def gamma_rt(cos,wat,r):
        """
        Calculate the preferential interaction coefficient (gamma) of a protein with water and a cosolvent.
        
        ***ALL DISTANCES ARE IN NANOMETERS***
        
        Input: cos, wat, r
          - cos : (T frames) X (N cosolvent molecules) array, the minimum distance of each cosolvent molecule to the protein Van der Waals surface for each frame.
          - wat : (T frames) X (M water molecules) array, the minimum distance of each water molecule to the protein Van der Waals surface for each frame. 
          - r : float, distance dividing the local and bulk domains of the solvent.
        
        Returns: gamma, sample
          - gamma : (T frames) array, gamma for the given r, for each inputted frame.
          - sample : list, the N_effective independent frames of gamma to be used for calculation of the time average of gamma. Obtained using the method of Chodera (2016).
          
        References:
          - BM Baynes and BL Trout. Proteins in mixed solvents: a molecular-level perspective. J. Phys. Chem. B. 107, 14058-14067 (2003).
          - D Shukla, C Shinde, and BL Trout. Molecular computations of preferential interaction coefficients of proteins. J. Phys. Chem. B. 113, 12546-12554 (2009).
          - JD Chodera. J. Chem. Theor. Comput. 12, 1799 (2016).
        """
        n_i_x = np.sum(cos > r,axis=1).astype(float)
        n_ii_x = np.sum(cos < r,axis=1).astype(float)
        n_i_w = np.sum(wat > r,axis=1).astype(float)
        n_ii_w = np.sum(wat < r,axis=1).astype(float)
        gamma = n_ii_x - n_ii_w * (n_i_x/n_i_w)
        sample = subsampleCorrelatedData(gamma)
        return gamma, sample
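
A usage sketch (assumed): random distance arrays stand in for real minimum-distance data, and the returned frame indices are used for the time average. Note that gamma_rt calls subsampleCorrelatedData unqualified, so it must be imported into the namespace.

import numpy as np
from pymbar.timeseries import subsampleCorrelatedData

rng = np.random.RandomState(2)
cos_dists = rng.uniform(0.0, 3.0, size=(500, 50))     # 500 frames, 50 cosolvents
wat_dists = rng.uniform(0.0, 3.0, size=(500, 2000))   # 500 frames, 2000 waters

gamma, sample = gamma_rt(cos_dists, wat_dists, r=0.8)
gamma_avg = gamma[sample].mean()   # time average over independent frames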
Example #8
def calcTension(energy_data, verbose=False):
    dE1 = energy_data[:, 1] - energy_data[:, 0]
    dE2 = energy_data[:, 2] - energy_data[:, 0]
    BdE1 = dE1 / kTkJmol
    BdE2 = dE2 / kTkJmol

    nstates = 2
    nframes = len(dE1)
    u_kln = np.zeros([nstates, nstates, nframes], np.float64)
    u_kln[0, 1, :] = BdE1
    u_kln[1, 0, :] = BdE2

    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    if verbose:
        print("...found {} uncorrelated samples out of {} total samples...".
              format(N_k, nframes))

    if verbose: print("=== Computing free energy differences ===")
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

    tension = DeltaF_ij[
        0,
        1] / da * 1e18 * kT  #(in J/m^2). note da already has a factor of two for the two areas!
    tensionError = dDeltaF_ij[0, 1] / da * 1e18 * kT
    if verbose:
        print('tension (pymbar): {} +/- {}N/m'.format(tension, tensionError))

    return tension, tensionError
Example #9
def subsample(x, y_mat, num_cols=None):
    """
    Parameters
    ----------
    x : numpy array
        1-dimensional array with x-data, such as timestep.
    y_mat : can take various forms:
        - list of numpy arrays, such as grouping 1-column data into smaller data series
        - 1D numpy array, such as subsampling 1-column data
        - multidimensional numpy array, if data has many columns
    num_cols : int (opt.)
        Number of data series for the input y_mat. Use this value to loop
        over the input data, since it can be formatted as 1- or N-dimensional
        list or numpy array. If num_cols not specified, the value will be
        extracted from input data using find_num_cols function.

    Returns
    -------
    x_mat : list
        list of arrays with the same shape as z_mat
    z_mat : list
        list of arrays in which z_mat[i][j] is the jth value in the ith data series.

    """
    from pymbar import timeseries

    x_mat = []
    z_mat = [] # subsampled y_mat

    if num_cols is None:
        num_cols = find_num_cols(y_mat)

    for i in range(num_cols):

        # list of np arrays
        if type(y_mat) is list and len(y_mat[0]) > 1:
            y = y_mat[i]

        # 1D np array
        elif type(y_mat) is np.ndarray and len(y_mat.shape) == 1:
            y = y_mat

        # multidimensional np array
        else:
            y = y_mat[:,i]

        # compute correlation times
        g = timeseries.statisticalInefficiency(y)
        indices = timeseries.subsampleCorrelatedData(y, g)

        # subsample data
        y_sub = y[indices]
        x_sub = x[indices]
        z_mat.append(y_sub)
        x_mat.append(x_sub)

        print("\nLength of original timeseries data: %d" % len(y) )
        print("\nLength of subsampled timeseries data: %d" % len(y_sub) )

    return x_mat, z_mat
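
A usage sketch (assumed): a single correlated 1D series, with num_cols passed explicitly because the find_num_cols helper is not shown.

import numpy as np

x = np.arange(1000)
y = np.cumsum(np.random.RandomState(3).normal(size=1000))  # correlated series

x_mat, z_mat = subsample(x, y, num_cols=1)
print(len(z_mat[0]), "frames kept out of", len(y))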
Example #10
def get_decorrelated_samples(replica_positions, replica_energies,
                             temperature_list):
    """
        Given a set of replica exchange trajectories, energies, and associated temperatures, this function returns decorrelated samples, as obtained from pymbar with timeseries.subsampleCorrelatedData.

        :param replica_positions: Positions array for the replica exchange data for which we will write PDB files
        :type replica_positions: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ( np.array( [n_replicas,cgmodel.num_beads,3] ), simtk.unit )

        :param replica_energies: List of dimension num_replicas X simulation_steps, which gives the energies for all replicas at all simulation steps 
        :type replica_energies: List( List( float * simtk.unit.energy for simulation_steps ) for num_replicas )

        :param temperature_list: List of temperatures for the simulation data.
        :type temperature_list: List( float * simtk.unit.temperature )

        :returns:
           - configurations ( List( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ (n_decorrelated_samples,cgmodel.num_beads,3), simtk.unit ) ) - A list of decorrelated samples
           - energies ( List( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ) ) - The energies for the decorrelated samples (configurations)

        """
    all_poses = []
    all_energies = []

    for replica_index in range(len(replica_positions)):
        energies = replica_energies[replica_index][replica_index]
        [t0, g, Neff_max] = timeseries.detectEquilibration(energies)
        energies_equil = energies[t0:]
        poses_equil = replica_positions[replica_index][t0:]
        indices = timeseries.subsampleCorrelatedData(energies_equil)
        for index in indices:
            all_energies.append(energies_equil[index])
            all_poses.append(poses_equil[index])

    all_energies = np.array([float(energy) for energy in all_energies])

    return (all_poses, all_energies)
Example #11
def avg_density(dcd_file,lastframe_file, outdata):
    trj = md.load(dcd_file, top=lastframe_file)
    volume = trj.unitcell_lengths.prod(1)
    mass = sum([a.element.mass for a in trj.top.atoms]) / 6.0221413e23
    density_nounit = mass / volume
    density = density_nounit * u.gram / u.nanometer**3
    A_t = np.array(density_nounit)
    indices = ts.subsampleCorrelatedData(A_t)
    ind_density = density[indices]
    avg_ind_density = ind_density.mean().in_units_of(u.gram / u.liter)
    std_ind_density = ind_density.std().in_units_of(u.gram / u.liter)
    N = len(indices)
    stderr_ind_density = std_ind_density / (N**0.5)
    temps = []
    fid = open(outdata, 'r')
    next(fid)  # skip the header line
    for line in fid:
        dtemp = float(line.split(',')[1])
        temps.append(dtemp)
    fid.close()
    temps = np.array(temps)
    avg_temp = temps.mean()
    density_file = 'density_'+dcd_file[:-4]+'_indstd.dat'
    f = open(density_file, 'w')
    f.write("Average density of the system:\n")
    f.write(str(avg_ind_density))
    f.write("\nStandard Deviation of density:\n")
    f.write(str(std_ind_density))
    f.write("\nStandard Error of the density:\n")
    f.write(str(stderr_ind_density))
    f.write("\nAverage Temperature of the system:\n")
    f.write(str(avg_temp))
    f.close()
Example #12
    def subsample_gradients(self):
        r''' method to subsample gradients and get a better estimate.
        '''
        if self.percentage == 100 and not self.subsample:
            warnings.warn(
                "You are not subsampling your data according to the statistical inefficiency nor are "
                "you discarding initial data. Please set percentage to another value than 100!"
            )
        percentage_removal = (self._N_k *
                              (1 - self.percentage / 100.0)).astype('int32')
        self._subsampled_N_k_gradients = self._N_k - percentage_removal
        N_max = int(numpy.max(self._subsampled_N_k_gradients))
        self._subsampled_grad_kn = numpy.zeros(shape=(self._N_k.shape[0],
                                                      N_max))
        for p in range(percentage_removal.shape[0]):
            start = percentage_removal[p]
            finish = percentage_removal[p] + N_max
            self._subsampled_grad_kn[p, :] = self._gradients_kn[p,
                                                                start:finish]
        if N_max <= 50:
            warnings.warn(
                "You have reduced your data to less than 50 samples, the results from these might not "
                "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option."
            )
        #if subsampling is percentage, then we are done here, otherwise we will now subsample according to timeseries

        if self.subsample:
            print(
                "#Subsampling gradients according to statistical inefficiency")
            #first we compute statistical inefficiency
            self._gradients_kn = self._subsampled_grad_kn.copy()
            self._N_k = self._subsampled_N_k_gradients.copy()

            g_k = numpy.zeros(shape=(self._gradients_kn.shape[0]))
            self._subsampled_N_k_gradients = numpy.zeros(
                shape=(self._gradients_kn.shape[0]))
            for i in range(g_k.shape[0]):
                g_k[i] = timeseries.statisticalInefficiency(
                    self._gradients_kn[i, :])
            g = int(numpy.max(g_k))
            #now we need to figure out what the indices in the data are for subsampling
            indices_k = []
            for i in range(g_k.shape[0]):
                indices_k.append(
                    timeseries.subsampleCorrelatedData(
                        self._gradients_kn[i, :], g=g))
                self._subsampled_N_k_gradients[i] = len(indices_k[i])
            N_max = int(numpy.max(self._subsampled_N_k_gradients))
            if N_max <= 50:
                warnings.warn(
                    "You have reduced your data to less than 50 samples, the results from these might not "
                    "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option."
                )
            self._subsampled_grad_kn = numpy.zeros(
                [self._gradients_kn.shape[0], N_max], numpy.float64)
            for k in range(self._gradients_kn.shape[0]):
                self._subsampled_grad_kn[k, :] = self._gradients_kn[
                    k, indices_k[k]]
Example #13
def subsample(enthalpies):
    """
    Subsamples the enthalpies using John Chodera's code.
    This is probably better than the simple cutoff we normally use.
    No output -- it modifies the lists directly
    """
    # Use automatic equilibration detection and pymbar.timeseries to subsample
    [t0, g, Neff_max] = timeseries.detectEquilibration(enthalpies)
    enthalpies = enthalpies[t0:]
    return timeseries.subsampleCorrelatedData(enthalpies, g=g)
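
A usage sketch (assumed): enthalpies with an initial drift; the returned indices refer to the equilibrated region.

import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(4)
enthalpies = np.concatenate([np.linspace(-50., -100., 100),
                             -100. + rng.normal(size=900)])

keep = subsample(enthalpies)
print(len(keep), "uncorrelated indices (relative to the equilibrated region)")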
Example #14
0
def calc_statistics(_data):
    t0, g, Neff = timeseries.detectEquilibration(_data)
    data_equil = _data[t0:]
    indices_subsampled = timeseries.subsampleCorrelatedData(data_equil, g=g)
    sub_data = data_equil[indices_subsampled]

    avg = sub_data.mean()
    std = sub_data.std()
    err = sub_data.std() / np.sqrt(len(indices_subsampled))
    summary = [avg, std, err, t0, g, Neff]
    return summary
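
A usage sketch (assumed): well-behaved noisy data run through the full detect-equilibration / subsample / summarize pipeline.

import numpy as np
from pymbar import timeseries

data = np.random.RandomState(5).normal(loc=1.0, scale=0.1, size=2000)
avg, std, err, t0, g, Neff = calc_statistics(data)
print("mean = %.4f +/- %.4f (g = %.2f)" % (avg, err, g))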
Example #15
    def _construct_decorrelation_mask(self, sim_collection, rep, skip):
        enes = sim_collection.reps_energies[rep]
        ops = sim_collection.reps_order_params[rep]
        steps = enes.steps
        rpots = utility.calc_reduced_potentials(enes, ops,
                                                sim_collection.conditions)
        start_i, g, Neff = timeseries.detectEquilibration(rpots, nskip=skip)
        template = '{:<8} {:<8} {:<3} {:<4.1f} {:<.1f}'
        print(template.format(sim_collection.conditions.fileformat, steps,
                              start_i, g, Neff))
        indices = timeseries.subsampleCorrelatedData(rpots[start_i:], g=skip*g)
        return [i + start_i for i in indices]
Example #16
def decorrelate(traj, facs=None, verbose=False, name=None):
    traj = np.array(traj)
    if traj.ndim == 1:
        idx = timeseries.subsampleCorrelatedData(traj)
        n0 = traj.size
        n1 = len(idx)
        res = traj[idx]
    elif facs is not None:
        # The cleanest way to decorrelate multi-dimensional trajectories would probably
        # be a sort of "parallel-decorrelation", taking frames in a way that both trajectories
        # are independently decorrelated. pymbar does not offer this functionality, so for
        # now, here's a work-around: We'll decorrelate such that
        #     traj_sum = facs[0]*traj[0, :] + facs[1]*traj[1, :] + ...
        # is decorrelated.
        # Use case:
        #     traj_sum = 1.0 * U + P * V
        traj_sum = np.zeros(traj.shape[1])
        for n, f in enumerate(facs):
            traj_sum += f * traj[n]
        idx = timeseries.subsampleCorrelatedData(traj_sum)
        n0 = traj.shape[1]
        n1 = len(idx)
        res = traj[:, idx]
    else:
        raise NotImplementedError('trajectory.decorrelate() is not implemented for '
                                  'trajectories with more than 1 dimension.')
    if verbose:
        n = n0 - n1
        if not name:
            name = 'Trajectory'
        if n == 0:
            print('{:s} decorrelation: No frames discarded for decorrelation.'.format(name))
        elif n == 1:
            print('{:s} decorrelation: 1 frame ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(name, 1/n0))
        else:
            print('{:s} decorrelation: {:d} frames ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(name, n, n/n0))

    return res
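
A usage sketch (assumed) for the facs branch: decorrelating a (2, T) trajectory so that the weighted sum U + P*V is decorrelated, with a hypothetical pressure factor.

import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(6)
U = np.cumsum(rng.normal(size=500))   # correlated energy-like series
V = np.cumsum(rng.normal(size=500))   # correlated volume-like series
P = 0.06                              # hypothetical pressure factor

traj = np.vstack([U, V])
res = decorrelate(traj, facs=[1.0, P], verbose=True, name='U + P*V')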
Example #17
def compute_timeseries(reduced_potentials: np.ndarray) -> list:
    """
    Use pymbar timeseries to compute the uncorrelated samples in an array of reduced potentials.  Returns the uncorrelated sample indices.
    """
    from pymbar import timeseries
    t0, g, Neff_max = timeseries.detectEquilibration(
        reduced_potentials)  #computing indices of uncorrelated timeseries
    A_t_equil = reduced_potentials[t0:]
    uncorrelated_indices = timeseries.subsampleCorrelatedData(A_t_equil, g=g)
    A_t = A_t_equil[uncorrelated_indices]
    full_uncorrelated_indices = [i + t0 for i in uncorrelated_indices]

    return [t0, g, Neff_max, A_t, full_uncorrelated_indices]
Example #18
    def gather_dg(self, u_kln, nstates):
        # Subsample data to extract uncorrelated equilibrium timeseries
        N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
        for k in range(nstates):
            [_, g, __] = timeseries.detectEquilibration(u_kln[k, k, :])
            indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
            N_k[k] = len(indices)
            u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
        # Compute free energy differences and statistical uncertainties
        mbar = MBAR(u_kln, N_k)
        [DeltaF_ij, dDeltaF_ij, _] = mbar.getFreeEnergyDifferences()
        print("Number of uncorrelated samples per state: {}".format(N_k))

        return DeltaF_ij, dDeltaF_ij
Example #19
def get_stats(data):
    """
    later, can generalize, to use one column for decorrelating and getting reference indices
    """
    [t0, g, Neff] = timeseries.detectEquilibration(data)
    data_equil = data[t0:]
    indices = timeseries.subsampleCorrelatedData(data_equil, g=g)
    sub_data = data_equil[indices]
    
    avg = sub_data.mean()
    std = sub_data.std()
    err = sub_data.std() / np.sqrt(len(indices))

    return avg, std, err, t0, g, Neff, sub_data
Example #20
def getNkandUkln():
    # u_kln = u_klt
    # N_k = [maxn]*K
    # return (N_k, u_kln)

    """Identifies uncorrelated samples and updates the arrays of the reduced potential energy and dhdlt retaining data entries of these samples only."""
    u_kln = np.zeros([K,K,maxn], np.float64) # u_kln[k,m,n] is the reduced potential energy of uncorrelated sample index n from state k evaluated at state m
    N_k = np.zeros(K, int) # N_k[k] is the number of uncorrelated samples from state k
    g = np.zeros(K,float) # autocorrelation times for the data
    print "Number of correlated and uncorrelated samples:\n\n%8s %10s %12s %12s" % ('Lambda', 'N', 'N_k', 'N/N_k')
    for k in range(K):
        if k == 0:
            g[k] = timeseries.statisticalInefficiency(u_klt[k,k+1,:])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k,k+1,:])) # indices of uncorrelated samples
        else:
            g[k] = timeseries.statisticalInefficiency(u_klt[k,k-1,:])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k,k-1,:]))
        N = len(indices) # number of uncorrelated samples
        N_k[k] = N # Store the number of uncorrelated samples from state k.
        for l in range(K):
            u_kln[k,l,0:N] = u_klt[k,l,indices]
        print "%6.2f %12s %12s %12.2f" % (l_list[k], maxn, N_k[k], g[k])
    print ''
    return (N_k, u_kln)
Example #21
    def subsample_energies(self):
        r''' This subsamples u_kln according to percentage, i.e. remove initial equilibration data and then can additionally subsample according to timeseries

        '''
        #removing percent
        if self.percentage == 100 and not self.subsample:
            warnings.warn("You are not subsampling your data according to the statistical inefficiency nor are "
                           "you discarding initial data. Please set percentage to another value than 100!")

        percentage_removal = (self._N_k*(1-self.percentage/100.0)).astype('int32')
        self._subsampled_N_k_energies = self._N_k-percentage_removal
        N_max = int(numpy.max(self._subsampled_N_k_energies))
        self._subsampled_u_kln = numpy.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max))
        self._subsampled_energies_kn = numpy.zeros(shape=(self._N_k.shape[0], N_max))
        for k in range(0, self._N_k.shape[0]):
            self._subsampled_u_kln[k] = self._u_kln[k,:,percentage_removal[k]:percentage_removal[k]+N_max]
            self._subsampled_energies_kn[k] = self._energies_kn[k,percentage_removal[k]:percentage_removal[k]+N_max]
        if N_max <=50:
            warnings.warn("You have reduced your data to less than 50 samples, the results from these might not "
                           "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")

        #Now we are doing some additional subsampling according to timeseries analysis
        if self.subsample:
            print("#Subsampling energies according to statistical inefficiency for pymbar")

            self._u_kln = self._subsampled_u_kln.copy()
            self._N_k = self._subsampled_N_k_energies.copy()
            self._energies_kn = self._subsampled_energies_kn.copy()
            #first we compute statistical inefficiency
            g_k = numpy.zeros(shape=(self._energies_kn.shape[0]))
            for i in range(g_k.shape[0]):
                g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i,percentage_removal[i]:])
            g = numpy.max(g_k)
            #now we need to figure out what the indices in the data are for subsampling
            indices_k = []
            self._subsampled_N_k_energies = numpy.zeros(shape=(self._energies_kn.shape[0]))
            for i in range(g_k.shape[0]):
                indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i,:], g=g))
                self._subsampled_N_k_energies[i]=len(indices_k[i])
            #self._subsampled_N_k_energies = (numpy.ceil(self._N_k / g)).astype(int)
            N_max = int(numpy.max(self._subsampled_N_k_energies))
            if N_max <=50:
                warnings.warn("You have reduced your data to less than 50 samples, the results from these might not "
                               "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")
            self._subsampled_u_kln = numpy.zeros([self._gradients_kn.shape[0],self._gradients_kn.shape[0], N_max], numpy.float64)
            for k in range(self._gradients_kn.shape[0]):
                self._subsampled_u_kln[k,:,:] = self._u_kln[k,:,indices_k[k]].transpose()
Example #22
def read_concentration(files, discard=10, fast=False):
    """
    Calculate the mean concentration and standard error from numerous simulations, where each simulation has
    a fixed chemical potential. Timeseries analysis is used to determine equilibrium properties.

    Parameters
    ----------
    files: list of str
        the path to each results file that will be analysed.
    discard: int
        the initial amount of data to throw away
    fast: bool
        whether to perform the fast variant of the time series analysis
    """
    concentration = np.zeros(len(files))
    standard_error = np.zeros(len(files))
    delta_mu = np.zeros(len(files))
    lower = np.zeros(len(files))
    upper = np.zeros(len(files))
    for i in range(len(files)):
        ncfile = Dataset(files[i], 'r')
        volume = ncfile.groups['Sample state data']['volume'][:]
        #ncations = ncfile.groups['Sample state data']['species counts'][:, 1]
        nsalt = np.min(ncfile.groups['Sample state data']['species counts'][:, 1:2], axis=1)
        delta_mu[i] = ncfile.groups['Control parameters']['delta_chem'][0]
        ncfile.close()

        # Get the concentration in Molarity
        c = 1.0 * nsalt / volume * 1.66054

        # Estimate the mean and standard error with timeseries analysis
        t_equil, stat_ineff, n_eff = timeseries.detectEquilibration(c[discard:], fast=fast)
        #mu, sigma, num_batches, conf_width = misc_tools.batch_estimate_2(c[(discard + t_equil):], stat_ineff)
        #print("{0} batches for {1}".format(num_batches, files[i]))
        c_equil = c[(discard + t_equil):]
        concentration[i] = np.mean(c_equil)
        independent_inds = timeseries.subsampleCorrelatedData(c_equil, g=stat_ineff, conservative=True)
        mu_samps = misc_tools.bootstrap_estimates(c_equil[independent_inds])
        lower[i] = np.percentile(mu_samps, 2.5)
        upper[i] = np.percentile(mu_samps, 97.5)
        standard_error[i] = mu_samps.std()

    return concentration, standard_error, delta_mu, lower, upper
Example #23
def decorrelate(traj, verbose=False, name=None):
    traj = np.array(traj)
    if traj.ndim == 1:
        idx = timeseries.subsampleCorrelatedData(traj)
        n0 = traj.size
        n1 = len(idx)
        res = traj[idx]
    elif traj.ndim == 2:
        # pymbar doesn't offer to decorrelate two samples, so let's do it ourselves
        # and just use the decorrelation of the sample more strongly correlated
        #
        # calculate (maximal) inefficiency
        g1 = timeseries.statisticalInefficiency(traj[0])
        g2 = timeseries.statisticalInefficiency(traj[1])
        g = np.max([g1, g2])
        # calculate index
        n0 = traj.shape[1]
        idx = np.unique(
            np.array(np.round(np.arange(0, int(n0 / g + .5)) * g), dtype=int))
        idx = idx[idx < n0]
        n1 = len(idx)
        res = traj[:, idx]
    else:
        raise NotImplementedError(
            'trajectory.decorrelate() is not implemented for '
            'trajectories with more than 1 dimension.')
    if verbose:
        n = n0 - n1
        if not name:
            name = 'Trajectory'
        if n == 0:
            print('{:s} decorrelation: No frames discarded for decorrelation.'.
                  format(name))
        elif n == 1:
            print('{:s} decorrelation: 1 frame ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(
                      name, 1 / n0))
        else:
            print('{:s} decorrelation: {:d} frames ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(
                      name, n, n / n0))

    return res
Example #24
def equil_sample(
        data, threshold_fraction=0.0, threshold_neff=1, conservative=True
    ):
    """Returns a statistically independent subset of an array of data.

    Parameters
    ----------
    data : numpy.typing.Arraylike
        1-D time dependent data to check for equilibration.
    threshold_fraction : float, optional, default=0.0
        Fraction of data expected to be equilibrated.
    threshold_neff : int, optional, default=1
        Minimum number of effectively uncorrelated samples needed to consider
        a_t 'equilibrated'.
    conservative : bool, default=True
        if set to True, uniformly-spaced indices are chosen with interval
        ceil(g), where g is the statistical inefficiency.  
        Otherwise, indices are chosen non-uniformly with interval of
        approximately g in order to end up with approximately T/g total indices

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, int, int)

    """
    is_equil, prod_start, ineff, Neff = is_equilibrated(
            data, threshold_fraction, threshold_neff
            )

    if is_equil:
        uncorr_indices = timeseries.subsampleCorrelatedData(
                data[prod_start:], g=ineff, conservative=conservative
            )
        uncorr_sample = data[prod_start:][uncorr_indices]
        return(uncorr_sample, uncorr_indices, prod_start, Neff)

    else:
        raise ValueError(
            "Property does not have requisite threshold of production data "
            "expected. More production data is needed, or the threshold needs "
            "to be lowered. See is_equilibrated for more information."
        )
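
To illustrate the conservative flag in isolation (independently of the is_equilibrated helper, which is not shown here), the two modes can be compared directly; this is an assumed example:

import numpy as np
from pymbar import timeseries

a_t = np.cumsum(np.random.RandomState(7).normal(size=2000))
g = timeseries.statisticalInefficiency(a_t)

uniform = timeseries.subsampleCorrelatedData(a_t, g=g, conservative=True)
approx = timeseries.subsampleCorrelatedData(a_t, g=g, conservative=False)
print("conservative: %d, non-conservative: %d samples"
      % (len(uniform), len(approx)))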
Example #25
    def subsample_energies(self):
        if self.subsample_method!='timeseries':
            print("We are only eliminating samples from the beginning of the data and are still working with highly"
                  " correlated data!")

            if self.percentage == 100:
                # actually emit the warning (the bare RuntimeWarning(...) call did nothing);
                # assumes `import warnings` at module level, as in subsample_gradients above
                warnings.warn("You are not subsampling your data according to the statistical inefficiency nor are "
                              "you discarding initial data. Please set percentage to another value than 100!",
                              RuntimeWarning)

            percentage_removal = (self._N_k*(1-self.percentage/100.0)).astype('int32')
            self._subsampled_N_k_energies = self._N_k-percentage_removal
            N_max = int(np.max(self._subsampled_N_k_energies))
            self._subsampled_u_kln = np.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max))
            for i in range(percentage_removal.shape[0]):
                for j in range(percentage_removal.shape[0]):
                    self._subsampled_u_kln[i,j,:] = self._u_kln[i,j,percentage_removal[j]:percentage_removal[j]+N_max]
            if N_max <= 100:
                warnings.warn("You have reduced your data to less than 100 samples, the results from these might not "
                              "be trustworthy. ", RuntimeWarning)
        else:
            print("We are doing a timeseries analysis using the timeseries analysis module in pymbar and will subsample"
                  " according to that.")

            #first we compute statistical inefficiency
            g_k = np.zeros(shape=(self._energies_kn.shape[0]))
            for i in range(g_k.shape[0]):
                g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i,:])
            g = np.max(g_k)
            #now we need to figure out what the indices in the data are for subsampling
            indices_k = []
            self._subsampled_N_k_energies = np.zeros(shape=(self._gradients_kn.shape[0]))
            for i in range(g_k.shape[0]):
                indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i,:], g=g))
                self._subsampled_N_k_energies[i]=len(indices_k[i])
            #self._subsampled_N_k_energies = (np.ceil(self._N_k / g)).astype(int)
            N_max = int(np.max(self._subsampled_N_k_energies))
            if N_max <= 100:
                warnings.warn("You have reduced your data to less than 100 samples, the results from these might not "
                              "be trustworthy. ", RuntimeWarning)
            self._subsampled_u_kln = np.zeros([self._gradients_kn.shape[0],self._gradients_kn.shape[0], N_max], np.float64)
            for k in range(self._gradients_kn.shape[0]):
                self._subsampled_u_kln[k,:,:] = self._u_kln[k,:,indices_k[k]].transpose()
Example #26
def individual_analysis_procedure(temperature):

    ###
    #
    # This subroutine analyzes a timeseries for 'temperature',
    # and generates a set of decorrelated sample energies and distances,
    # which are used in later sampling to generate a free energy surface.
    #
    ###
    if (search_for_existing_data and not (os.path.exists(
            str(output_dir + str(temperature) + "/uncorrelated_distances.dat"))
                                          )) or not (search_for_existing_data):
        output_obj = open(str(output_dir + str(temperature) + "/sim_data.dat"),
                          'r')
        E_total_all_temp = np.array(
            [l.split(',')[3] for l in output_obj.readlines()]
        )  # E_total_all_temp temporarily stores the total energies from NaCl simulation output
        output_obj.close()
        distances = util.get_distances(
            str(output_dir + str(temperature) + "/coordinates.pdb"),
            simulation_steps)  # Read in the distances
        E_total_all = np.array(
            np.delete(E_total_all_temp, 0, 0), dtype=float
        )  # E_total_all stores total energies from NaCl simulation output, after re-typing
        [t0, g, Neff_max] = timeseries.detectEquilibration(
            E_total_all, nskip=nskip
        )  # Identify the indices of samples with high statistical efficiency (g)
        E_total_equil = E_total_all[
            t0:]  # Using the index for the equilibration time (t0), truncate the time-series data before this index
        uncorrelated_energy_indices = timeseries.subsampleCorrelatedData(
            E_total_equil, g=g)  # Determine indices of uncorrelated samples
        np.savetxt(
            str(output_dir + str(temperature) +
                '/uncorrelated_total_energies.dat'),
            E_total_equil[uncorrelated_energy_indices]
        )  # Write uncorrelated total energies to file
        np.savetxt(
            str(output_dir + str(temperature) + '/uncorrelated_distances.dat'),
            distances[uncorrelated_energy_indices]
        )  # Write uncorrelated Na-Cl distances to file
        return
Example #27
    def gather_dg(self, u_kln, nstates):
        u_kln = np.vstack(u_kln)
        # Subsample data to extract uncorrelated equilibrium timeseries
        N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
        for k in range(nstates):
            [_, g, __] = timeseries.detectEquilibration(u_kln[k, k, :])
            indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
            N_k[k] = len(indices)
            u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
        # Compute free energy differences and statistical uncertainties
        mbar = MBAR(u_kln, N_k)
        [DeltaF_ij, dDeltaF_ij, _] = mbar.getFreeEnergyDifferences()
        logger.debug(
            "Number of uncorrelated samples per state: {}".format(N_k))
        logger.debug("Relative free energy change for {0} = {1} +- {2}".format(
            self.name, DeltaF_ij[0, nstates - 1] * self.kTtokcal,
            dDeltaF_ij[0, nstates - 1] * self.kTtokcal))

        return DeltaF_ij[0, nstates -
                         1] * self.kTtokcal, dDeltaF_ij[0, nstates -
                                                        1] * self.kTtokcal
Example #28
    def subsampling(self, integratedACF=True):
        """
        Performs inline subsampling based on the statistical inefficiency ``g``
        of the specified attribute `acfun` of :class:`sample`, aiming at
        obtaining a sample of :term:`IID` configurations. Subsampling is done
        via jumps of varying sizes around ``g``, so that the sample size decays
        by a factor of approximately ``1/g``.

        Parameters
        ----------
            integratedACF : bool, optional, default=True
                If true, the integrated :term:`ACF` method :cite:`Chodera_2007`
                will be used for computing the statistical inefficiency.
                Otherwise, the :term:`OBM` method will be used instead.

        Returns
        -------
            :class:`sample`
                Although the subsampling is done inline, the new sample is
                returned for chaining purposes.

        """
        n = len(self.dataset)
        if mics.verbose:
            info("\n=== Subsampling via %s ===" %
                 ("integrated ACF" if integratedACF else "OBM"))
            info("Original sample size:", n)
        if integratedACF:
            y = multimap([self.acfun.lambdify()], self.dataset)
            g = timeseries.statisticalInefficiency(y[0])
        else:
            g = n / self.neff
        new = timeseries.subsampleCorrelatedData(self.dataset.index, g)
        self.dataset = self.dataset.reindex(new)
        self.neff = len(new)
        if mics.verbose:
            info("Statistical inefficiency:", g)
            info("New sample size:", self.neff)
        return self
Example #29
    def equilibrate_and_subsample(self, u_kln_replica, u_kln, u_n, ndiscard=0, nuse=None):
        """Equilibrate, truncate, and subsample uncorrelated samples.

        Parameters
        ----------
        ndiscard : int, optional, default=0
            number of iterations to discard to equilibration
        nuse : int, optional, default=None
            maximum number of iterations to use (after discarding)
        
        Returns
        -------
        """
        
        logger.info("Discarding initial data as equilibration (ndiscard = %d)" % ndiscard)
        u_kln_replica = u_kln_replica[:,:,ndiscard:]
        u_kln = u_kln[:,:, ndiscard:]
        u_n = u_n[ndiscard:]

        
        if nuse is not None:
            logger.info("Truncating to number of specified conforamtions to use(nuse = %d)" % nuse)
            u_kln_replica = u_kln_replica[:,:,0:nuse]
            u_kln = u_kln[:,:,0:nuse]
            u_n = u_n[0:nuse]
        
        logger.info("Subsample data to obtain uncorrelated samples")
        N_k = np.zeros(self.n_states, np.int32)    
        indices = timeseries.subsampleCorrelatedData(u_n) # indices of uncorrelated samples

        N = len(indices) # number of uncorrelated samples
        N_k[:] = N      
        u_kln[:, :, 0:N] = u_kln[:, :, indices]
        logger.info("number of uncorrelated samples:")
        logger.info(N_k)
        logger.info("")
        
        return u_kln_replica, u_kln, u_n, N_k, N
Example #30
def estimate_free_energies(ncfile, ndiscard=0, nuse=None, g=1.0, replicas=None):
    """Estimate free energies of all alchemical states.

    ARGUMENTS
       ncfile (NetCDF) - input YANK netcdf file

    OPTIONAL ARGUMENTS
       ndiscard (int) - number of iterations to discard to equilibration (default: 0)
       nuse (int) - maximum number of iterations to use (after discarding) (default: None)
       g (float) - statistical inefficiency to use for subsampling (default: 1.0)
       replicas (list of int) - if specified, only use these replicas for estimating the free energies (default: None)

    TODO: Automatically determine 'ndiscard'.
    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    energies = ncfile.variables['energies']
    u_kln_replica = numpy.zeros([nstates, nstates, niterations], numpy.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]

    # Extract states.
    states_kn_replica = numpy.zeros([nstates, niterations], numpy.int32)
    for n in range(niterations):
        states_kn_replica[:,n] = ncfile.variables['states'][n,:]

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    states_kn_replica = states_kn_replica[:,ndiscard:]


    # If specified, truncate to number of specified conformations to use.
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        states_kn_replica = states_kn_replica[:,0:nuse]

    # Subsample data to obtain uncorrelated samples
    A_n = u_kln_replica[0,0,:]
    indices = timeseries.subsampleCorrelatedData(A_n, g=g) # indices of uncorrelated samples
    N = len(indices) # number of uncorrelated samples
    u_kln_replica[:,:,0:N] = u_kln_replica[:,:,indices]
    states_kn_replica[:,0:N] = states_kn_replica[:,indices]

    # Deconvolute replicas to obtain energies by state.
    u_kln = numpy.zeros([nstates, nstates, N], numpy.float64)
    if replicas is None:
        # Use all replicas.
        N_k = N * numpy.ones(nstates, numpy.int32)
        for n in range(N):
            state_indices = states_kn_replica[:,n]
            u_kln[state_indices,:,n] = u_kln_replica[:,:,n]
    else:
        # Use only specified replicas.
        N_k = numpy.zeros(nstates, numpy.int32)
        for n in range(N):
            state_indices = ncfile.variables['states'][n,:]
            for replica in replicas:
                state_index = states_kn_replica[replica,n]
                u_kln[state_index,:,N_k[state_index]] = u_kln_replica[replica,:,n]
                N_k[state_index] += 1

    #===================================================================================================
    # Estimate free energy difference with MBAR.
    #===================================================================================================

    # Initialize MBAR (computing free energy estimates, which may take a while)
    mbar = MBAR(u_kln, N_k, verbose = False, maximum_iterations = 50000) # use slow self-consistent-iteration (the default)

    # Get matrix of dimensionless free energy differences and uncertainty estimate.
    (Deltaf_ij, dDeltaf_ij) = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')

    # Return free energy differences and an estimate of the covariance.
    return (Deltaf_ij, dDeltaf_ij)
Example #31
    if len(fep_columns) > 0:
        for i in range(len(fep_columns)):
            reduced_fep_data.append(numpy.zeros([K,N_samples], numpy.float64))
    for k in range(K):
        # Extract timeseries.
        A_t = biasing_variable_kt[0][k,:]
        # Compute statistical inefficiency.
        try:
            g = timeseries.statisticalInefficiency(A_t)
        except Exception as e:
            print(str(e))
            print(A_t)

        # Subsample data.
        if subsample_trajectories:
            indices = timeseries.subsampleCorrelatedData(A_t, g=g)
        else:
            indices = timeseries.subsampleCorrelatedData(A_t, g=1)
        N = len(indices) # number of uncorrelated samples
        print "k = %5d : g = %.1f, N = %d" % (k, g, N)
        for i in range(nbiases):
            biasing_variable_kn[i][k,0:N] = biasing_variable_kt[i][k,indices]
        for i in range(nperturbations+1):
            U_kn[i][k,0:N] = U_kt[i][k,indices]
        if not cluster_binning:
            pmf_variable_kn_1[k,0:N] = pmf_variable_kt_1[k,indices]
            if ndim == 2:
                pmf_variable_kn_2[k,0:N] = pmf_variable_kt_2[k,indices]
        if cluster_binning:
            cluster_bin_kn[k,0:N] = cluster_bin_kt[k,indices]
        if len(expectation_columns) > 0:
Example #32
        infile.close()
        # Parse data.
        n = 0
        for line in lines:
            if line[0] != '#' and line[0] != '@':
                tokens = line.split()            
                u_kn[k,n] = beta_k[k] * (float(tokens[2]) - float(tokens[1])) # reduced potential energy without umbrella restraint
                n += 1

    # Compute correlation times for potential energy and chi
    # timeseries.  If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
            
    if (DifferentTemperatures):        
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k,:], u_kn[k,0:N_k[k]])
        print "Correlation time for set %5d is %10.3f" % (k,g_k[k])
        indices = timeseries.subsampleCorrelatedData(u_kn[k,0:N_k[k]])
    else:
        chi_radians = chi_kn[k,0:N_k[k]]/(180.0/numpy.pi)
        g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
        g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
        print "g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin)
        g_k[k] = max(g_cos, g_sin)
        print "Correlation time for set %5d is %10.3f" % (k,g_k[k])
        indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k]) 
    # Subsample data.
    N_k[k] = len(indices)
    u_kn[k,0:N_k[k]] = u_kn[k,indices]
    chi_kn[k,0:N_k[k]] = chi_kn[k,indices]

N_max = numpy.max(N_k) # shorten the array size
u_kln = numpy.zeros([K,K,N_max], numpy.float64) # u_kln[k,l,n] is the reduced potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
Example #33
def estimate_free_energies(ncfile, ndiscard = 0, nuse = None):
    """Estimate free energies of all alchemical states.

    ARGUMENTS
       ncfile (NetCDF) - input YANK netcdf file

    OPTIONAL ARGUMENTS
       ndiscard (int) - number of iterations to discard to equilibration
       nuse (int) - maximum number of iterations to use (after discarding)

    TODO: Automatically determine 'ndiscard'.
    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))
    #logger.info(u_n)

    # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]
    
    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)    
    indices = timeseries.subsampleCorrelatedData(u_n) # indices of uncorrelated samples
    #print u_n # DEBUG
    #indices = range(0,u_n.size) # DEBUG - assume samples are uncorrelated
    N = len(indices) # number of uncorrelated samples
    N_k[:] = N      
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    #===================================================================================================
    # Estimate free energy difference with MBAR.
    #===================================================================================================   
   
    # Initialize MBAR (computing free energy estimates, which may take a while)
    logger.info("Computing free energy differences...")
    mbar = MBAR(u_kln, N_k, verbose = False, method = 'self-consistent-iteration', maximum_iterations = 50000) # use slow self-consistent-iteration (the default)
    #mbar = MBAR(u_kln, N_k, verbose = True, method = 'Newton-Raphson') # use faster Newton-Raphson solver

    # Get matrix of dimensionless free energy differences and uncertainty estimate.
    logger.info("Computing covariance matrix...")
    (Deltaf_ij, dDeltaf_ij) = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')
   
#    # Matrix of free energy differences
    logger.info("Deltaf_ij:")
    for i in range(nstates):
        for j in range(nstates):
            print "%8.3f" % Deltaf_ij[i,j],
        print ""        
    
#    print Deltaf_ij
#    # Matrix of uncertainties in free energy difference (expectations standard deviations of the estimator about the true free energy)
    logger.info("dDeltaf_ij:")
    for i in range(nstates):
        for j in range(nstates):
            print "%8.3f" % dDeltaf_ij[i,j],
        print ""        

    # Return free energy differences and an estimate of the covariance.
    return (Deltaf_ij, dDeltaf_ij)
Example #34
g_k = zeros([K], float64)
for k in range(K):
    # Compute statistical inefficiency for extension timeseries
    g = timeseries.statisticalInefficiency(x_kt[k,0:T_k[k]], x_kt[k,0:T_k[k]])
    # store statistical inefficiency
    g_k[k] = g
    print("timeseries %d : g = %.1f, %.0f uncorrelated samples (of %d total samples)" % (k+1, g, floor(T_k[k] / g), T_k[k]))
    N_max = max(N_max, ceil(T_k[k] / g) + 1)

# Subsample trajectory position data.
x_kn = zeros([K, N_max], float64)
bin_kn = zeros([K, N_max], int32)
N_k = zeros([K], int32)
for k in range(K):
    # Compute correlation times for potential energy and chi timeseries.
    indices = timeseries.subsampleCorrelatedData(x_kt[k,0:T_k[k]])
    # Store subsampled positions.
    N_k[k] = len(indices)
    x_kn[k,0:N_k[k]] = x_kt[k,indices]
    bin_kn[k,0:N_k[k]] = bin_kt[k,indices]

# Set arbitrary zeros for external biasing potential.
x0_k = zeros([K], float64) # x position corresponding to zero of potential
for k in range(K):
    x0_k[k] = x_kn[k,0:N_k[k]].mean()
print("x0_k = ")
print(x0_k)

# Compute bias energies in units of kT.
u_kln = zeros([K,K,N_max], float64) # u_kln[k,l,n] is the reduced (dimensionless) relative potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
for k in range(K):
Example #35
def overlap_check(reference_system, positions, platform_name=None, precision=None, nsteps=50, nsamples=200, factory_args=None, cached_trajectory_filename=None):
    """
    Test overlap between reference system and alchemical system by running a short simulation.

    Parameters
    ----------
    reference_system : simtk.openmm.System
       The reference System object to compare with
    positions : simtk.unit.Quantity with units compatible with nanometers
       The positions to assess energetics for.
    platform_name : str, optional, default=None
       The name of the platform to use for benchmarking.
    nsteps : int, optional, default=50
       Number of molecular dynamics steps between samples.
    nsamples : int, optional, default=200
       Number of samples to collect.
    factory_args : dict(), optional, default=None
       Arguments passed to AbsoluteAlchemicalFactory.
    cached_trajectory_filename : str, optional, default=None
       If specified, attempt to cache (or reuse) trajectory.

    """

    # Create a fully-interacting alchemical state.
    factory = AbsoluteAlchemicalFactory(reference_system, **factory_args)
    alchemical_state = AlchemicalState()
    alchemical_system = factory.createPerturbedSystem(alchemical_state)

    temperature = 300.0 * unit.kelvin
    collision_rate = 5.0 / unit.picoseconds
    timestep = 2.0 * unit.femtoseconds
    kT = (kB * temperature)

    # Select platform.
    platform = None
    if platform_name:
        platform = openmm.Platform.getPlatformByName(platform_name)

    # Create integrators.
    reference_integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    alchemical_integrator = openmm.VerletIntegrator(timestep)

    # Create contexts.
    if platform:
        reference_context = openmm.Context(reference_system, reference_integrator, platform)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator, platform)
    else:
        reference_context = openmm.Context(reference_system, reference_integrator)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator)

    ncfile = None
    if cached_trajectory_filename:
        cache_mode = 'write'

        # Try reading from cache
        from netCDF4 import Dataset
        if os.path.exists(cached_trajectory_filename):
            try:
                ncfile = Dataset(cached_trajectory_filename, 'r')
                if (ncfile.variables['positions'].shape == (nsamples, reference_system.getNumParticles(), 3)):
                    # Read the cache if everything matches
                    cache_mode = 'read'
            except Exception:
                pass  # unreadable or mismatched cache; fall through to rewriting it

        if cache_mode == 'write':
            # If anything went wrong, create a new cache.
            try:
                (pathname, filename) = os.path.split(cached_trajectory_filename)
                if not os.path.exists(pathname): os.makedirs(pathname)
                ncfile = Dataset(cached_trajectory_filename, 'w', format='NETCDF4')
                ncfile.createDimension('samples', 0)
                ncfile.createDimension('atoms', reference_system.getNumParticles())
                ncfile.createDimension('spatial', 3)
                ncfile.createVariable('positions', 'f4', ('samples', 'atoms', 'spatial'))
            except Exception as e:
                logger.info(str(e))
                logger.info('Could not create a trajectory cache (%s).' % cached_trajectory_filename)
                ncfile = None

    # Collect simulation data.
    reference_context.setPositions(positions)
    du_n = np.zeros([nsamples], np.float64) # du_n[n] is the dimensionless energy difference (alchemical minus reference) for sample n, in kT
    print()
    import click
    with click.progressbar(range(nsamples)) as bar:
        for sample in bar:
            if cached_trajectory_filename and (cache_mode == 'read'):
                # Load cached frames.
                positions = unit.Quantity(ncfile.variables['positions'][sample,:,:], unit.nanometers)
                reference_context.setPositions(positions)
            else:
                # Run dynamics.
                reference_integrator.step(nsteps)

            # Get reference energies.
            reference_state = reference_context.getState(getEnergy=True, getPositions=True)
            reference_potential = reference_state.getPotentialEnergy()
            if np.isnan(reference_potential/kT):
                raise Exception("Reference potential is NaN")

            # Get alchemical energies.
            alchemical_context.setPositions(reference_state.getPositions(asNumpy=True))
            alchemical_state = alchemical_context.getState(getEnergy=True)
            alchemical_potential = alchemical_state.getPotentialEnergy()
            if np.isnan(alchemical_potential/kT):
                raise Exception("Alchemical potential is NaN")

            du_n[sample] = (alchemical_potential - reference_potential) / kT

            if cached_trajectory_filename and (cache_mode == 'write') and (ncfile is not None):
                ncfile.variables['positions'][sample,:,:] = reference_state.getPositions(asNumpy=True) / unit.nanometers

    # Clean up.
    del reference_context, alchemical_context
    if cached_trajectory_filename and (ncfile is not None):
        ncfile.close()

    # Discard data to equilibration and subsample.
    from pymbar import timeseries
    [t0, g, Neff] = timeseries.detectEquilibration(du_n)
    indices = timeseries.subsampleCorrelatedData(du_n, g=g)
    du_n = du_n[indices]

    # Compute statistics.
    from pymbar import EXP
    [DeltaF, dDeltaF] = EXP(du_n)

    # Raise an exception if the error is larger than 3kT.
    MAX_DEVIATION = 3.0 # kT
    if (dDeltaF > MAX_DEVIATION):
        report = "DeltaF = %12.3f +- %12.3f kT (%5d samples, g = %6.1f)" % (DeltaF, dDeltaF, Neff, g)
        raise Exception(report)

    return
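A hypothetical invocation of the check above, assuming the module-level imports it relies on (openmm, unit, np, kB, logger) and a test system from openmmtools; the system choice and factory arguments are illustrative only:

from openmmtools import testsystems

testsystem = testsystems.AlanineDipeptideImplicit()
# 'ligand_atoms' here is an arbitrary illustrative selection, not a meaningful alchemical region
overlap_check(testsystem.system, testsystem.positions, platform_name='CPU',
              nsteps=50, nsamples=100,
              factory_args={'ligand_atoms': list(range(4))})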
Example #36
0
#------------------------------------------------------------------------
# Read Data From File
#------------------------------------------------------------------------

print("")
print("Preparing data:")
T_from_file = read_simulation_temps(simulation,NumTemps)
E_from_file = read_total_energies(simulation,TE_COL_NUM)
K = len(T_from_file)
N_k = numpy.zeros(K,numpy.int32)
g = numpy.zeros(K,numpy.float64)

for k in range(K):  # subsample the energies
   g[k] = timeseries.statisticalInefficiency(E_from_file[k])
   indices = numpy.array(timeseries.subsampleCorrelatedData(E_from_file[k],g=g[k])) # indices of uncorrelated samples
   N_k[k] = len(indices) # number of uncorrelated samples
   E_from_file[k,0:N_k[k]] = E_from_file[k,indices]
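   # only the first N_k[k] entries of row k now hold uncorrelated samples; the tail is stale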

#------------------------------------------------------------------------
# Insert Intermediate T's and corresponding blank U's and E's
#------------------------------------------------------------------------
Temp_k = T_from_file
minT = T_from_file[0]
maxT = T_from_file[len(T_from_file) - 1]
#beta = 1/(kB*T)
#T = 1/(kB*beta)
if dertype == 'temperature':
    minv = minT
    maxv = maxT
elif dertype == 'beta':   # actually going in the opposite direction as beta for logistical reasons
Example #37
0
def main():    
    usage = """
        usage: %prog [options] <metadata file>
    """
    
    parser = optparse.OptionParser(usage)
    parser.add_option("-o", "--outfile", dest="output_file", default='mbar_pmf.out', help="Output file for PMF [default: %default]")
    parser.add_option("-t", "--temperature", dest="temperature", default=300., type="float", help="Initial temperature in K [default: %default K]")
    parser.add_option("-b", "--bins", dest="bins", default=50, type="int", help="Number of bins for 1D PMF [default: %default]")
    parser.add_option("-d", "--double", dest="double_k", default=False, action='store_true', help="Double the k values [default: %default]")
    parser.add_option("-c", "--kcal", dest="kcal_k", default=False, action='store_true', help="Convert k values from kcal to kJ [default: %default]")
    parser.add_option("-s", "--skip-subsampling", dest="skip_subsampling", default=False, action='store_true', help="Skip data subsampling [default: %default]")
    parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true", help="Verbose output from PyMBAR [default: %default]")
    
    (options, args) = parser.parse_args()
    
    if len(args) < 1:
        parser.error('No metadata file passed')
    elif not os.path.exists(args[0]):
        parser.error('Metadata file not found')
    
    metadata = [] # stores metadata per umbrella
    N_max = 0 # the max number of snapshots per umbrella
    different_temperatures = False # flag to know if we are reading in energies for the snapshots
    
    # open the wham metadata file
    print "Opening metadata file %s" % args[0]
    f = open(args[0], 'r')
    metadata_lines = f.readlines()
    f.close()
    
    # first get all the metadata and count the max number of snapshots per umbrella
    for line in metadata_lines:
        # skip comments
        if line.startswith('#'):
            continue
        # split lines based on spaces, but convert tabs to spaces first
        clean_split = line.split()  # split on any whitespace and drop empty tokens
        if not os.path.exists(clean_split[0]):
            print("Data file %s doesn't exist, skipping this replica" % clean_split[0])
            continue
        else:
            # get the number of snapshots for the replica
            nsnapshots = file_len(clean_split[0])
            # /path/to/timeseries/file  loc_win_min spring  [correl time] [temperature]
            k = float(clean_split[2])
            if options.double_k:
                k = k*2.0
            if options.kcal_k:
                k = k*4.184
            
            current_meta = { 'path': clean_split[0], 'coord': float(clean_split[1]), 'k': k, 'n': nsnapshots }
            #   K_k[k] = float(tokens[1]) * (numpy.pi/180)**2 # spring constant (read in kJ/mol/rad**2, converted to kJ/mol/deg**2)    
            
            if len(clean_split) >= 4:
                # TODO: is temperature the 4th or 5th value?
                # temperature might be the 4th value...
                current_meta['t'] = float(clean_split[3])
                different_temperatures = True
            metadata.append(current_meta)
    
    N_max = numpy.max([ w['n'] for w in metadata ])
    print "Max number of snapshots %d" % N_max
    
    # now allocate the memory for the arrays
    K = len(metadata)
    T_k = numpy.ones(K,float)*options.temperature # initial temperatures are all equal
    beta_k = 1.0/(kB*T_k)   # beta factor for the different temperatures
    
    data = numpy.zeros([K,N_max], numpy.float64) # the snapshot data
    u_kn = numpy.zeros([K,N_max], numpy.float64) # u_kn[k,n] is the reduced potential energy without umbrella restraints of snapshot n of umbrella simulation k
    u_kln = numpy.zeros([K,K,N_max], numpy.float64) # u_kln[k,l,n] is the reduced potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
    g_k = numpy.zeros([K],numpy.float32) # correlation time

    data_min = [] # will set the min and max data values later
    data_max = []
    
    # Now loop through each datafile and extract the data
    for i, w in enumerate(metadata):
        print "Reading %s..." % w['path']
        f = open(w['path'], 'r')
        lines = f.readlines()
        f.close()
        
        clean_split_lines = [ line.split() for line in lines if not line.startswith('#') ]

        if different_temperatures:
            raise Exception('Different temperatures aren\'t supported yet')
            # if different temperatures are specified the metadata file, 
            # then we need the energies to compute the PMF, found in the third column
            # for j,l in enumerate(clean_split_lines):
            #     data[i,j] = float(l[1]) # second column is the coordinate
            #     # third column will be the system's potential energy
            #     potential_energy = float(l[2])
            #     dchi = w['coord']-float(l[1])
            #     restraint_potential = k_multiplier*w['k']*(dchi**2)
            #     # TODO: given the coordinate and the restraining potential, calculate the umbrella restraint
            #     u_kn[i,j] = beta_k[i] * (potential_energy-restraint_potential) # reduced potential energy without umbrella restraint
            #         
            # # Compute correlation times for potential energy and timeseries.
            # # If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
            # g_k[i] = timeseries.statisticalInefficiency(u_kn[i,:], u_kn[i,:])
            # indices = timeseries.subsampleCorrelatedData(u_kn[i,:])
        else:
            # no temperature column
            for j,l in enumerate(clean_split_lines):
                data[i,j] = float(l[1])
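            # the reaction coordinate is a periodic torsion, so its cosine is used below to estimate correlation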
            dataset = numpy.cos(data[i,:w['n']])
            g_k[i] = timeseries.statisticalInefficiency(dataset,dataset)
            if not options.skip_subsampling:
                indices = timeseries.subsampleCorrelatedData(dataset)

        if options.skip_subsampling:
            data_max.append(numpy.max(data[i]))
            data_min.append(numpy.min(data[i]))
            w['n'] = len(data[i])
            u_kn[i,0:w['n']] = u_kn[i]
            data[i,0:w['n']] = data[i]
        else:
            # get min and max for data, used for binning ranges
            data_max.append(numpy.max(data[i,indices]))
            data_min.append(numpy.min(data[i,indices]))
            # Subsample the data
            w['n'] = len(indices)
            u_kn[i,0:w['n']] = u_kn[i,indices]
            data[i,0:w['n']] = data[i,indices]
            print "Correlation time for set %5d is %10.3f" % (i,g_k[i])

    print "Finished reading data files"
    # Set zero of u_kn -- this is arbitrary.
    u_kn -= u_kn.min()

    # Construct torsion bins
    print "Binning data..."
    
    data_min = numpy.min(data_min)
    data_max = numpy.max(data_max)
    delta = (data_max - data_min) / float(options.bins)
    
    print "Min coord: %f" % data_min
    print "Max coord: %f" % data_max
    print "Delta for binning %f" % delta
    # compute bin centers
    bin_center_i = numpy.zeros([options.bins], numpy.float64)
    for i in range(options.bins):
        bin_center_i[i] = data_min + delta/2 + delta * i
    
    # Bin data
    bin_kn = numpy.zeros([K,N_max], numpy.int32)-1
    # for each window
    for k in range(K):
        # for 0 to the number of snapshots in the window k
        for n in range(metadata[k]['n']):            
            # Compute bin assignment.
            bin_kn[k,n] = int((data[k,n] - data_min) / delta)
            for l in range(K):
                # Compute minimum-image torsion deviation from umbrella center l
                dchi = data[k,n] - metadata[l]['coord']
                # Compute energy of snapshot n from simulation k in umbrella potential l
                u_kln[k,l,n] = u_kn[k,n] + beta_k[k]*metadata[l]['k']*(dchi**2)
    
    for i in range(options.bins):
        if numpy.sum(bin_kn==i) == 0:
            for j in range(options.bins):
                print "Bin: %d" % j
                print numpy.sum(bin_kn==j)
            raise Exception("At least one bin has no samples. Adjust bin sizes or eliminate empty bins to ensure at least one sample per bin.")        

    # Initialize MBAR.
    print "Running MBAR..."
    N_k = numpy.array([ w['n'] for w in metadata ], numpy.int32)
    mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose, initialize='BAR')
    #mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose)
    #mbar = pymbar.MBAR(u_kln, N_k, verbose = True, method = 'Newton-Raphson')

    # Compute PMF in unbiased potential (in units of kT).
    (f_i, df_i) = mbar.computePMF(u_kn, bin_kn, options.bins)

    # Write out PMF and save to file
    print "Saving PMF to file: %s" % options.output_file
    f = open(options.output_file, 'w')
    print "PMF (in units of kT)"
    print "%8s %8s %8s" % ('bin', 'f', 'df')
    f.write("#Coor   Free    +/-\n")
    for i in range(options.bins):
        print "%8.1f %8.3f %8.3f" % (bin_center_i[i], f_i[i], df_i[i])
        f.write("%8.1f %8.3f %8.3f\n" % (bin_center_i[i], f_i[i], df_i[i]))
    f.close()
    #thisDistDat = np.loadtxt('%s/prod_pullx.xvg'%adir)
    thisEDat = np.loadtxt('%s/prod_out.txt' % adir)
    thisDistDat = np.loadtxt('%s/prod_restraint.txt' % adir)

    #allRef[i] = thisDistDat[0, 2]
    allRef[i] = thisDistDat[0, 1]

    #Need to adjust so have energies with same frequency as distances
    #Should also have restraint energies in file so can subtract
    #thisEnergy = thisEDat[::2,2] - thisEDat[::2,1]
    #thisDist = thisDistDat[:,1]
    thisEnergy = thisEDat[:, 2] - thisDistDat[:, 3]
    thisDist = thisDistDat[:, 2]

    #Only take uncorrelated samples...
    uncorrinds = timeseries.subsampleCorrelatedData(thisEnergy)
    #uncorrinds = np.arange(len(thisEnergy))
    numSamples[i] = len(uncorrinds)

    print "For %s, have %i independent samples." % (adir, len(uncorrinds))

    allU = np.hstack((allU, thisEnergy[uncorrinds]))
    allDist = np.hstack((allDist, thisDist[uncorrinds]))

    #Plot histogram with uncorrelated indices
    thisHist, thisBins = np.histogram(thisDist[uncorrinds],
                                      bins='auto',
                                      density=False)
    binCents = 0.5 * (thisBins[:-1] + thisBins[1:])
    histAx.plot(binCents, thisHist)
Example #39
0
def overlap_check(reference_system, positions, receptor_atoms, ligand_atoms, platform_name=None, annihilate_electrostatics=True, annihilate_sterics=False, precision=None, nsteps=50, nsamples=200):
    """
    Test overlap between reference system and alchemical system by running a short simulation.

    Parameters
    ----------
    reference_system : simtk.openmm.System
       The reference System object to compare with
    positions : simtk.unit.Quantity with units compatible with nanometers
       The positions to assess energetics for.
    receptor_atoms : list of int
       The list of receptor atoms.
    ligand_atoms : list of int
       The list of ligand atoms to alchemically modify.
    platform_name : str, optional, default=None
       The name of the platform to use for benchmarking.
    annihilate_electrostatics : bool, optional, default=True
       If True, electrostatics will be annihilated; if False, decoupled.
    annihilate_sterics : bool, optional, default=False
       If True, sterics will be annihilated; if False, decoupled.
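    precision : str, optional, default=None
       Precision model to request (present in the signature but unused in this snippet).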
    nsteps : int, optional, default=50
       Number of molecular dynamics steps between samples.
    nsamples : int, optional, default=200
       Number of samples to collect.

    """

    # Create a fully-interacting alchemical state.
    factory = AbsoluteAlchemicalFactory(reference_system, ligand_atoms=ligand_atoms)
    alchemical_state = AlchemicalState()
    alchemical_system = factory.createPerturbedSystem(alchemical_state)

    temperature = 300.0 * units.kelvin
    collision_rate = 5.0 / units.picoseconds
    timestep = 2.0 * units.femtoseconds
    kT = (kB * temperature)

    # Select platform.
    platform = None
    if platform_name:
        platform = openmm.Platform.getPlatformByName(platform_name)

    # Create integrators.
    reference_integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    alchemical_integrator = openmm.VerletIntegrator(timestep)

    # Create contexts.
    if platform:
        reference_context = openmm.Context(reference_system, reference_integrator, platform)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator, platform)
    else:
        reference_context = openmm.Context(reference_system, reference_integrator)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator)

    # Collect simulation data.
    reference_context.setPositions(positions)
    du_n = np.zeros([nsamples], np.float64) # du_n[n] is the
    for sample in range(nsamples):
        # Run dynamics.
        reference_integrator.step(nsteps)

        # Get reference energies.
        reference_state = reference_context.getState(getEnergy=True, getPositions=True)
        reference_potential = reference_state.getPotentialEnergy()

        # Get alchemical energies.
        alchemical_context.setPositions(reference_state.getPositions())
        alchemical_state = alchemical_context.getState(getEnergy=True)
        alchemical_potential = alchemical_state.getPotentialEnergy()

        du_n[sample] = (alchemical_potential - reference_potential) / kT

    # Clean up.
    del reference_context, alchemical_context

    # Discard data to equilibration and subsample.
    from pymbar import timeseries
    [t0, g, Neff] = timeseries.detectEquilibration(du_n)
    indices = timeseries.subsampleCorrelatedData(du_n, g=g)
    du_n = du_n[indices]

    # Compute statistics.
    from pymbar import EXP
    [DeltaF, dDeltaF] = EXP(du_n)

    # Raise an exception if the error is larger than 3kT.
    MAX_DEVIATION = 3.0 # kT
    if (dDeltaF > MAX_DEVIATION):
        report = "DeltaF = %12.3f +- %12.3f kT (%5d samples, g = %6.1f)" % (DeltaF, dDeltaF, Neff, g)
        raise Exception(report)

    return
Example #40
0
def extract_ncfile_energies(ncfile, ndiscard=0, nuse=None, g=None):
    """
    Extract and decorrelate energies from the ncfile to gather common data for other functions

    Parameters
    ----------
    ncfile : NetCDF
       Input YANK netcdf file
    ndiscard : int, optional, default=0
       Number of iterations to discard to equilibration
    nuse : int, optional, default=None
       Maximum number of iterations to use (after discarding)
    g : int, optional, default=None
       Statistical inefficiency to use if desired; if None, will be computed.

    TODO
    ----
    * Automatically determine 'ndiscard'.

    """
    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n, g=g) # indices of uncorrelated samples
    #print(u_n) # DEBUG
    #indices = range(0,u_n.size) # DEBUG - assume samples are uncorrelated
    N = len(indices) # number of uncorrelated samples
    N_k[:] = N
    u_kln = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    # Check for the expanded cutoff states, and subsample as needed
    try:
        u_ln_full_raw = ncfile.variables['fully_interacting_expanded_cutoff_energies'][:].T # stored as (n, l); transpose to (l, n)
        u_ln_non_raw = ncfile.variables['noninteracting_expanded_cutoff_energies'][:].T
        fully_interacting_u_ln = np.zeros(u_ln_full_raw.shape)
        noninteracting_u_ln = np.zeros(u_ln_non_raw.shape)
        # Deconvolute the fully interacting state
        for iteration in range(niterations):
            state_indices = ncfile.variables['states'][iteration,:]
            fully_interacting_u_ln[state_indices,iteration] = u_ln_full_raw[:,iteration]
            noninteracting_u_ln[state_indices,iteration] = u_ln_non_raw[:,iteration]
        # Discard non-equilibrated samples
        fully_interacting_u_ln = fully_interacting_u_ln[:,ndiscard:]
        fully_interacting_u_ln = fully_interacting_u_ln[:,indices]
        noninteracting_u_ln = noninteracting_u_ln[:,ndiscard:]
        noninteracting_u_ln = noninteracting_u_ln[:,indices]
        # Augment u_kln to accept the new state
        u_kln_new = np.zeros([nstates + 2, nstates + 2, N], np.float64)
        N_k_new = np.zeros(nstates + 2, np.int32)
        # Insert energies
        u_kln_new[1:-1,0,:] = fully_interacting_u_ln
        u_kln_new[1:-1,-1,:] = noninteracting_u_ln
        # Fill in other energies
        u_kln_new[1:-1,1:-1,:] = u_kln 
        N_k_new[1:-1] = N_k
        # Notify users
        logger.info("Found expanded cutoff states in the energies!")
        logger.info("Free energies will be reported relative to them instead!")
        # Reassign last, so u_kln and N_k are not overwritten if anything above failed
        u_kln = u_kln_new
        N_k = N_k_new
    except Exception:
        # No expanded cutoff states present; keep the original u_kln and N_k.
        pass

    return u_kln, N_k, u_n
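A sketch of how extract_ncfile_energies might be driven; the filename is hypothetical, and netCDF4 is the same reader these scripts already use:

from netCDF4 import Dataset

ncfile = Dataset('complex.nc', 'r')  # hypothetical YANK output file
u_kln, N_k, u_n = extract_ncfile_energies(ncfile, ndiscard=100)
ncfile.close()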
out.write("Molecule, Log D +/-, HPD95%[low, high]\n")
debug.write("Molecule mean - median = difference")
used_samples.write("Molecule, equilibration, N samples")
# curdir = os.getcwd()
# os.makedirs("plots", )
# os.chdir("plots")


for mol in sorted(list(x.logd.keys())):
    print("Processing {}".format(mol))
    # sns.plt.figure()
    trace = numpy.asarray(mc.trace("LogD_{}".format(mol))[:])
    # Burn in and thinning estimated using pymbar
    burnin = detectEquilibration(trace)[0]
    trace = trace[burnin:]
    uncorrelated_indices = subsampleCorrelatedData(trace)
    trace = trace[uncorrelated_indices]

    median = pymc.utils.quantiles(trace)[50]
    mean = numpy.mean(trace)
    lower, upper = pymc.utils.hpd(trace, 0.05)
    lower_s = to_precision(lower,2) # string of number with 2 sig digits
    upper_s = to_precision(upper,2)
    logd = ufloat(mean, numpy.std(trace))

    # Formats the mean and error by the correct amount of significant digits
    out.write("{0}, {1:.1u}, [{2}, {3}]\n".format(mol, logd, lower_s, upper_s ))
    debug.write("{}: {} - {} = {}".format(mol, mean, median, mean-median))
    used_samples.write("{}, {}, {}".format(mol, burnin, len(uncorrelated_indices)))
    # pymc.Matplot.plot(trace, "LogD_{}".format(mol))
    # sns.plt.figure()
Example #42
0
def SimulateAlchemy(path, niter, nsteps_per_iter, nlambda):
    """Calculates the binding free energy of a ligand names 'UNL' using alchemy.
    One step corresponds to two femtoseconds.
    """
    prmtop = app.AmberPrmtopFile(f'{path}/com.prmtop')
    inpcrd = app.AmberInpcrdFile(f'{path}/com.inpcrd')
    system = prmtop.createSystem(implicitSolvent=app.GBn2,
                                 nonbondedMethod=app.CutoffNonPeriodic,
                                 nonbondedCutoff=1.0 * unit.nanometers,
                                 constraints=app.HBonds,
                                 rigidWater=True,
                                 ewaldErrorTolerance=0.0005)

    # Detect ligand indices
    ligand_ind = []
    for atm in prmtop.topology.atoms():
        # OpenEye names the ligand 'UNL'
        if atm.residue.name == 'UNL':
            ligand_ind.append(atm.index)
    ligand_ind = set(ligand_ind)
    AddAlchemyForces(system, ligand_ind)

    integrator = mm.LangevinIntegrator(300 * unit.kelvin,
                                       1.0 / unit.picoseconds,
                                       2.0 * unit.femtoseconds)
    integrator.setConstraintTolerance(0.00001)
    # TODO: The issues here are the same as the mmgbsa.py script
    # TODO: This should just recognize whatever the computer is capable of, not force CUDA.
    # TODO: I am not sure if mixed precision is necessary. Just need to be consistent
    platform = mm.Platform.getPlatformByName('CUDA')
    properties = {'CudaPrecision': 'mixed'}
    simulation = app.Simulation(prmtop.topology, system, integrator, platform)
    simulation.context.setPositions(inpcrd.positions)
    simulation.minimizeEnergy()

    ### Now simulate system
    import numpy as np
    from pymbar import MBAR, timeseries
    lambdas = np.linspace(1.0, 0.0, nlambda)
    # Save the potential energies for MBAR
    u_kln = np.zeros([nlambda, nlambda, niter])
    kT = unit.AVOGADRO_CONSTANT_NA * unit.BOLTZMANN_CONSTANT_kB * integrator.getTemperature()
    # TODO: This runs in series. Someone comfortable with MPI should help parallelize this.
    for k in range(nlambda):
        for i in range(niter):
            print('state %5d iteration %5d / %5d' % (k, i, niter))
            simulation.context.setParameter('lambda', lambdas[k])
            integrator.step(nsteps_per_iter)
            for l in range(nlambda):
                simulation.context.setParameter('lambda', lambdas[l])
                u_kln[k, l, i] = simulation.context.getState(
                    getEnergy=True).getPotentialEnergy() / kT

    # Subsample to remove correlation between successive samples
    N_k = np.zeros([nlambda], np.int32)  # number of uncorrelated samples
    for k in range(nlambda):
        [t0, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        # TODO: maybe should use 't0:' instead of ':' in third index
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
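        # fancy indexing with 'indices' on the last axis yields shape (len(indices), nlambda),
        # so the transpose below restores (state, sample) order before writing back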
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    # Calculate the energy difference
    # TODO: I've never worked with pymbar beyond the timeseries function. I'm not sure how the error in DeltaF is calculated, and I don't know what Theta is right now.
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()
    return DeltaF_ij[0][-1], dDeltaF_ij[0][-1]
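A hypothetical call, assuming the module-level imports referenced above (app, mm, unit) and the AddAlchemyForces helper; the path and run lengths are illustrative:

# 100 iterations of 500 steps (1 ps per iteration at the 2 fs timestep) over 11 lambda windows
DeltaF, dDeltaF = SimulateAlchemy('ligand_0', niter=100, nsteps_per_iter=500, nlambda=11)
print('DeltaF = %.3f +- %.3f kT' % (DeltaF, dDeltaF))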
   # Estimate the statistical inefficiency of the simulation by analyzing the timeseries of interest.
   # We use the max of cos and sin of the phi and psi timeseries because they are periodic angles.
   # The largest of these inefficiencies sets the subsampling interval below.
   print("Computing statistical inefficiencies...")
   g_cosphi = timeseries.statisticalInefficiencyMultiple(numpy.cos(phi_kt_replica * numpy.pi / 180.0))
   print("g_cos(phi) = %.1f" % g_cosphi)
   g_sinphi = timeseries.statisticalInefficiencyMultiple(numpy.sin(phi_kt_replica * numpy.pi / 180.0))
   print("g_sin(phi) = %.1f" % g_sinphi)
   g_cospsi = timeseries.statisticalInefficiencyMultiple(numpy.cos(psi_kt_replica * numpy.pi / 180.0))
   print("g_cos(psi) = %.1f" % g_cospsi)
   g_sinpsi = timeseries.statisticalInefficiencyMultiple(numpy.sin(psi_kt_replica * numpy.pi / 180.0))
   print("g_sin(psi) = %.1f" % g_sinpsi)
   # Subsample data with maximum of all correlation times.
   print "Subsampling data..."
   g = numpy.max(numpy.array([g_cosphi, g_sinphi, g_cospsi, g_sinpsi]))
   indices = timeseries.subsampleCorrelatedData(U_kt[k,:], g = g)   
   print "Using g = %.1f to obtain %d uncorrelated samples per temperature" % (g, len(indices))
   N_max = int(numpy.ceil(T / g)) # max number of samples per temperature   
   U_kn = numpy.zeros([K, N_max], numpy.float64)
   phi_kn = numpy.zeros([K, N_max], numpy.float64)
   psi_kn = numpy.zeros([K, N_max], numpy.float64)
   N_k = N_max * numpy.ones([K], numpy.int32)
   for k in range(K):
      U_kn[k,:] = U_kt[k,indices]
      phi_kn[k,:] = phi_kt[k,indices]
      psi_kn[k,:] = psi_kt[k,indices]
   print "%d uncorrelated samples per temperature" % N_max
         
#===================================================================================================
# Generate a list of indices of all configurations in kn-indexing
#===================================================================================================
Example #44
0
                getEnergy=True).getPotentialEnergy() / (kT)
            print(potential_energy)
            u_kln[k, l, iteration] = potential_energy
            outline += ",%.4f" % potential_energy
        simfile.write(outline)

simfile.close()

print("**************************************************")
print("Estimation of free energy with MBAR ...")
#try an on the fly mbar estimation
# Subsample data to extract uncorrelated equilibrium timeseries
N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
for k in range(nstates):
    [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
    indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
    N_k[k] = len(indices)
    u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
# Compute free energy differences and statistical uncertainties
mbar = MBAR(u_kln,
            N_k,
            verbose=True,
            method="adaptive",
            relative_tolerance=1e-10)  #, initialize="BAR")
[DeltaF_ij, dDeltaF_ij,
 Theta_ij] = mbar.getFreeEnergyDifferences(uncertainty_method='svd-ew')
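
# Note: 0.001987204 kcal/(mol*K) is the Boltzmann constant kB, so multiplying the
# dimensionless free energy by kB*T at T = 298 K converts from kT to kcal/mol.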

print('DeltaF_ij (kcal/mol):')
print(DeltaF_ij[0, nstates - 1] * 298.0 * 0.001987204)

mbar_fe = DeltaF_ij[0, nstates - 1] * 298.0 * 0.001987204
    # Subsample trajectories based on Hconf
    hconf = zeros([K, N_max], float64)
    volume = zeros([K, N_max], float64)
    uconf = zeros([K, N_max], float64)
    N_k = zeros([K], int32)

    if correlated_data == 1:
        hconf = zeros([K, T_max], float64)
        N_ksam = zeros([K], int32)
        indices2 = zeros([T_max], int32)
        for k in range(1,T_max):
            indices2[k] = k
        for k in range(K):
            # Compute correlation times
            if hconf_original[k,0] != 0:
                indices = timeseries.subsampleCorrelatedData(hconf_original[k,0:T_k[k]])
            # Store subsampled positions
                if len(indices) >= 1000:
                    N_ksam[k] = len(indices)
                    hconf[k,0:N_ksam[k]] = hconf_original[k,indices]
                    volume[k,0:N_ksam[k]] = volume_original[k,indices]
                    uconf[k,0:N_ksam[k]] = uconf_original[k,indices]
                else:
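                    # fewer than 1000 uncorrelated samples: fall back to the full, correlated series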
                    N_ksam[k] = len(indices2)
                    hconf[k,0:N_ksam[k]] = hconf_original[k,indices2]
                    volume[k,0:N_ksam[k]] = volume_original[k,indices2]
                    uconf[k,0:N_ksam[k]] = uconf_original[k,indices2]
                print('\n')
            else:
                N_ksam[k] = len(indices2)
                hconf[k,0:N_ksam[k]] = hconf_original[k,indices2]
            if line[0] != '#' and line[0] != '@':
                tokens = line.split()
                print(tokens)
                u_kn[k, n] = beta_k[k] * (
                    float(tokens[2]) - float(tokens[1])
                )  # reduced potential energy without umbrella restraint
                n += 1

    # Compute correlation times for potential energy and chi
    # timeseries.  If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi

    if (DifferentTemperatures):
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k, 0:N_k[k]],
                                                    u_kn[k, 0:N_k[k]])
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]])
    else:
        chi_radians = chi_kn[k, 0:N_k[k]] / (180.0 / numpy.pi)
        g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
        g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
        print("g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin))
        g_k[k] = max(g_cos, g_sin)
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])
    # Subsample data.
    N_k[k] = len(indices)
    u_kn[k, 0:N_k[k]] = u_kn[k, indices]
    chi_kn[k, 0:N_k[k]] = chi_kn[k, indices]

N_max = numpy.max(N_k)  # shorten the array size
u_kln = numpy.zeros(
Example #47
0
kT = unit.AVOGADRO_CONSTANT_NA * unit.BOLTZMANN_CONSTANT_kB * integrator.getTemperature()
for k in range(nstates):
    for iteration in range(niterations):
        print('state %5d iteration %5d / %5d' % (k, iteration, niterations))
        # Set alchemical state
        context.setParameter('lambda', lambdas[k])
        # Run some dynamics
        integrator.step(nsteps)
        # Compute energies at all alchemical states
        for l in range(nstates):
            context.setParameter('lambda', lambdas[l])
            u_kln[k,l,iteration] = context.getState(getEnergy=True).getPotentialEnergy() / kT

# Estimate free energy of Lennard-Jones particle insertion
from pymbar import MBAR, timeseries
# Subsample data to extract uncorrelated equilibrium timeseries
N_k = np.zeros([nstates], np.int32) # number of uncorrelated samples
for k in range(nstates):
    [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k,k,:])
    indices = timeseries.subsampleCorrelatedData(u_kln[k,k,:], g=g)
    N_k[k] = len(indices)
    u_kln[k,:,0:N_k[k]] = u_kln[k,:,indices].T
# Compute free energy differences and statistical uncertainties
mbar = MBAR(u_kln, N_k)
[DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

print('DeltaF_ij (kT):')
print(DeltaF_ij)
print('dDeltaF_ij (kT):')
print(dDeltaF_ij)
Example #48
0
def generate_simulation_data(database, parameters, cid):
    """
    Regenerate simulation data for given parameters.

    ARGUMENTS

    database (dict) - database of molecules
    parameters (dict) - dictionary of GBSA parameters keyed on GBSA atom types

    """

    platform = openmm.Platform.getPlatformByName("Reference")

    from pymbar import timeseries

    entry = database[cid]
    molecule = entry["molecule"]
    iupac_name = entry["iupac"]

    # Retrieve vacuum system.
    vacuum_system = copy.deepcopy(entry["system"])

    # Retrieve OpenMM System.
    solvent_system = copy.deepcopy(entry["system"])

    # Get nonbonded force.
    forces = {
        solvent_system.getForce(index).__class__.__name__: solvent_system.getForce(index)
        for index in range(solvent_system.getNumForces())
    }
    nonbonded_force = forces["NonbondedForce"]

    # Add GBSA term
    gbsa_force = openmm.GBSAOBCForce()
    gbsa_force.setNonbondedMethod(openmm.GBSAOBCForce.NoCutoff)  # set no cutoff
    gbsa_force.setSoluteDielectric(1)
    gbsa_force.setSolventDielectric(78)

    # Build indexable list of atoms.
    atoms = [atom for atom in molecule.GetAtoms()]
    natoms = len(atoms)

    # Assign GBSA parameters.
    for (atom_index, atom) in enumerate(atoms):
        [charge, sigma, epsilon] = nonbonded_force.getParticleParameters(atom_index)
        atomtype = atom.GetStringData("gbsa_type")  # GBSA atomtype
        radius = parameters["%s_%s" % (atomtype, "radius")] * units.angstroms
        scalingFactor = parameters["%s_%s" % (atomtype, "scalingFactor")]
        gbsa_force.addParticle(charge, radius, scalingFactor)

    # Add the force to the system.
    solvent_system.addForce(gbsa_force)

    # Create context for solvent system.
    timestep = 2.0 * units.femtosecond
    collision_rate = 20.0 / units.picoseconds
    temperature = entry["temperature"]
    integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    context = openmm.Context(solvent_system, integrator, platform)

    # Set the coordinates.
    positions = entry["positions"]
    context.setPositions(positions)

    # Minimize.
    openmm.LocalEnergyMinimizer.minimize(context)

    # Simulate, saving periodic snapshots of configurations.
    kT = kB * temperature
    beta = 1.0 / kT

    initial_time = time.time()
    nsteps_per_iteration = 2500
    niterations = 200
    x_n = np.zeros([niterations, natoms, 3], np.float32)  # positions, in nm
    u_n = np.zeros([niterations], np.float64)  # energy differences, in kT
    for iteration in range(niterations):
        integrator.step(nsteps_per_iteration)
        state = context.getState(getEnergy=True, getPositions=True)
        x_n[iteration, :, :] = state.getPositions(asNumpy=True) / units.nanometers
        u_n[iteration] = beta * state.getPotentialEnergy()

    if np.any(np.isnan(u_n)):
        raise Exception("Encountered NaN for molecule %s | %s" % (cid, iupac_name))

    final_time = time.time()
    elapsed_time = final_time - initial_time

    # Clean up.
    del context, integrator

    # Discard initial transient to equilibration.
    [t0, g, Neff_max] = timeseries.detectEquilibration(u_n)
    x_n = x_n[t0:, :, :]
    u_n = u_n[t0:]

    # Subsample to remove correlation.
    indices = timeseries.subsampleCorrelatedData(u_n, g=g)
    x_n = x_n[indices, :, :]
    u_n = u_n[indices]

    # Store data.
    entry["x_n"] = x_n
    entry["u_n"] = u_n

    print "%48s | %64s | simulation %12.3f s | %5d samples discarded | %5d independent samples remain" % (
        cid,
        iupac_name,
        elapsed_time,
        t0,
        len(indices),
    )

    return [cid, entry]
Example #49
0
def DoBAR(fwds, revs, label, verbose):
    """

    BAR to combine fwd and rev data of dGs.
    Here, don't multiply dGs_R by -1 since BAR calls for reverse work value.

    Parameters
    ----------
    fwds: dictionary of forward work values for each window
    revs: dictionary of reverse work values for each window
    label: string label of what it is (only for printing output)

    Returns
    -------
    dgs: 1D list of accumulated dG values; e.g., if each step contributed 2,
       then dgs would be [2, 4, 6, ...]
    gsdlist: 1D list of accompanying stdevs to the dgs list

    """

    fwd_ss = {} # subsampled version of fwds
    rev_ss = {} # subsampled version of revs
    dg_bar = np.zeros([len(fwds)], np.float64)  # allocate storage: dG steps
    gsd_bar = np.zeros([len(fwds)], np.float64) # allocate storage: dG stdev steps
    dgs = np.zeros([len(fwds)], np.float64)     # allocate storage: dG accumulated
    gsdlist = np.zeros([len(fwds)], np.float64) # allocate storage: dG stdev accum


    #corr_time = np.zeros([len(fwds)], np.float64)
    corr_time = {}
    for key, value in fwds.items():
        # compute correlation time
        g = timeseries.statisticalInefficiency(value)
        corr_time[key] = [g]
        # compute indices of UNcorrelated timeseries, then extract those samples
        indices = timeseries.subsampleCorrelatedData(value, g)
        fwd_ss[key] = value[indices]

    for key, value in revs.items():
        # compute correlation time
        g = timeseries.statisticalInefficiency(value)
        corr_time[key].append(g)
        # compute indices of UNcorrelated timeseries, then extract those samples
        indices = timeseries.subsampleCorrelatedData(value, g)
        rev_ss[key] = value[indices]

    bar = {}
    # then apply BAR estimator to get dG for each step
    for kF, kR in zip(sorted(fwd_ss.keys()), sorted(list(rev_ss.keys()), reverse=True)):
        dg_bar[kF], gsd_bar[kF] = BAR(fwd_ss[kF],rev_ss[kR])
        bar[kF] = [ np.sum(dg_bar), dg_bar[kF], gsd_bar[kF] ]

    # calculate the net dG standard deviation = sqrt[ sum(s_i^2) ]
    gsd = (np.sum(np.power(gsd_bar, 2)))**0.5

    net = 0.
    netsd = 0.
    for i, g in enumerate(dg_bar):
        # accumulate net dGs into running sums (plot this)
        dgs[i] = dg_bar[i] + net
        net = dgs[i]
        # combine the stdevs: s = sqrt(s1^2 + s2^2 + ...)
        gsdlist[i] = ((gsd_bar[i])**2.+(netsd)**2.)**0.5
        netsd = gsdlist[i]


    if verbose:
        print('\n\n#####---Correlation Times for dG_{}--#####'.format(label))
        print('Window'.rjust(3), 'F'.rjust(5), 'R'.rjust(9))
        for k,v in corr_time.items():
            print("{:3d} {:10.3f} {:10.3f}".format(k, v[0], v[1]) )

        print("\n\n#####---BAR estimator for dG_{}---#####".format(label))
        print('Window'.rjust(3), 'dG'.rjust(5), 'ddG'.rjust(11), "Uncert.".rjust(11))
        print("---------------------------------------------------------")


        for k, v in bar.items():
            line = '{:3d} {:10.4f} {:10.4f} +- {:3.4f}'.format(k, v[0], v[1], v[2])
            print(line)

    print(("\nNet dG_{} energy difference = {:.4f} +- {:.4f} kcal/mol".format(label, np.sum(dg_bar), gsd)))

    return dgs, gsdlist
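A self-contained sketch of DoBAR with synthetic work values, assuming the module already imports BAR and timeseries from pymbar; window keys are integers, matching the indexing above:

import numpy as np

rng = np.random.RandomState(1)
nwin = 3
fwds = {k: rng.normal(1.0, 0.5, size=2000) for k in range(nwin)}   # synthetic forward work per window
revs = {k: rng.normal(-1.0, 0.5, size=2000) for k in range(nwin)}  # synthetic reverse work per window
dgs, sds = DoBAR(fwds, revs, 'demo', verbose=False)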
Example #50
0
#######################################################################
#           Subsample {U,A}_kn_correlated to be uncorrelated          #
#######################################################################

print "Subsampling to achieve uncorrelated data"
if stat_inefficiency == None:
   print "(1 of 2) Calculating statistical inefficiency (i = ",
   stdout.flush()
   for d in range(N_CVs):
      statnew = timeseries.statisticalInefficiencyMultiple(A_ikn_correlated[d])
      stat_inefficiency = max([stat_inefficiency, statnew])
   print stat_inefficiency, ")"
else:
   print "(1 of 2) Using given statistical inefficiency (i =", str(stat_inefficiency) + ")"

indices = timeseries.subsampleCorrelatedData(U_kn_correlated[0,:], g = stat_inefficiency)
N_uncorrelated_samples = len(indices)

print "(2 of 2) Subsampling to achieve", N_uncorrelated_samples, "samples per replica"

U_kn  = zeros([      N_replicas+N_output_temps,N_uncorrelated_samples], float32)
A_ikn = zeros([N_CVs,N_replicas+N_output_temps,N_uncorrelated_samples], float32)

for k in range(N_replicas):
   U_kn[k] = U_kn_correlated[k][indices]
   for d in range(N_CVs):
      A_ikn[d][k] = A_ikn_correlated[d][k][indices]

print ""

#######################################################################
Example #51
0
def estimate_free_energies(ncfile, ndiscard=0, nuse=None, g=None):
    """
    Estimate free energies of all alchemical states.

    Parameters
    ----------
    ncfile : NetCDF
       Input YANK netcdf file
    ndiscard : int, optional, default=0
       Number of iterations to discard to equilibration
    nuse : int, optional, default=None
       Maximum number of iterations to use (after discarding)
    g : int, optional, default=None
       Statistical inefficiency to use if desired; if None, will be computed.

    TODO
    ----
    * Automatically determine 'ndiscard'.

    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))
    # logger.info(u_n)  # DEBUG

    # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n, g=g) # indices of uncorrelated samples
    #print u_n # DEBUG
    #indices = range(0,u_n.size) # DEBUG - assume samples are uncorrelated
    N = len(indices) # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    #===================================================================================================
    # Estimate free energy difference with MBAR.
    #===================================================================================================

    # Initialize MBAR (computing free energy estimates, which may take a while)
    logger.info("Computing free energy differences...")
    mbar = MBAR(u_kln, N_k)

    # Get matrix of dimensionless free energy differences and uncertainty estimate.
    logger.info("Computing covariance matrix...")

    try:
        # pymbar 2
        (Deltaf_ij, dDeltaf_ij) = mbar.getFreeEnergyDifferences()
    except ValueError:
        # pymbar 3
        (Deltaf_ij, dDeltaf_ij, theta_ij) = mbar.getFreeEnergyDifferences()

#    # Matrix of free energy differences
    logger.info("Deltaf_ij:")
    for i in range(nstates):
        str_row = ""
        for j in range(nstates):
            str_row += "%8.3f" % Deltaf_ij[i, j]
        logger.info(str_row)

#    print Deltaf_ij
#    # Matrix of uncertainties in free energy difference (expectations standard deviations of the estimator about the true free energy)
    logger.info("dDeltaf_ij:")
    for i in range(nstates):
        str_row = ""
        for j in range(nstates):
            str_row += "%8.3f" % dDeltaf_ij[i, j]
        logger.info(str_row)

    # Return free energy differences and an estimate of the covariance.
    return (Deltaf_ij, dDeltaf_ij)
Example #52
0
def estimate_enthalpies(ncfile, ndiscard=0, nuse=None, g=None):
    """
    Estimate enthalpies of all alchemical states.

    Parameters
    ----------
    ncfile : NetCDF
       Input YANK netcdf file
    ndiscard : int, optional, default=0
       Number of iterations to discard to equilibration
    nuse : int, optional, default=None
       Number of iterations to use (after discarding)
    g : int, optional, default=None
       Statistical inefficiency to use if desired; if None, will be computed.

    TODO
    ----
    * Automatically determine 'ndiscard'.
    * Combine some functions with estimate_free_energies.

    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    logger.info("Reading energies...")
    energies = ncfile.variables['energies']
    u_kln_replica = np.zeros([nstates, nstates, niterations], np.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    logger.info("Done.")

    # Deconvolute replicas
    logger.info("Deconvoluting replicas...")
    u_kln = np.zeros([nstates, nstates, niterations], np.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    logger.info("Done.")

    # Compute total negative log probability over all iterations.
    u_n = np.zeros([niterations], np.float64)
    for iteration in range(niterations):
        u_n[iteration] = np.sum(np.diagonal(u_kln[:,:,iteration]))
    #print u_n

    # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = np.zeros(nstates, np.int32)
    indices = timeseries.subsampleCorrelatedData(u_n, g=g) # indices of uncorrelated samples
    #print u_n # DEBUG
    #indices = range(0,u_n.size) # DEBUG - assume samples are uncorrelated
    N = len(indices) # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    logger.info("number of uncorrelated samples:")
    logger.info(N_k)
    logger.info("")

    # Compute average enthalpies.
    H_k = np.zeros([nstates], np.float64) # H_i[i] is estimated enthalpy of state i
    dH_k = np.zeros([nstates], np.float64)
    for k in range(nstates):
        H_k[k] = u_kln[k,k,:].mean()
        dH_k[k] = u_kln[k,k,:].std() / np.sqrt(N)

    return (H_k, dH_k)
def estimate_enthalpies(ncfile, ndiscard = 0, nuse = None):
    """Estimate enthalpies of all alchemical states.

    ARGUMENTS
       ncfile (NetCDF) - input YANK netcdf file

    OPTIONAL ARGUMENTS
       ndiscard (int) - number of iterations to discard to equilibration
       nuse (int) - number of iterations to use (after discarding)

    TODO: Automatically determine 'ndiscard'.
    TODO: Combine some functions with estimate_free_energies.
    """

    # Get current dimensions.
    niterations = ncfile.variables['energies'].shape[0]
    nstates = ncfile.variables['energies'].shape[1]
    natoms = ncfile.variables['energies'].shape[2]

    # Extract energies.
    print "Reading energies..."
    energies = ncfile.variables['energies']
    u_kln_replica = numpy.zeros([nstates, nstates, niterations], numpy.float64)
    for n in range(niterations):
        u_kln_replica[:,:,n] = energies[n,:,:]
    print "Done."

    # Deconvolute replicas
    print "Deconvoluting replicas..."
    u_kln = numpy.zeros([nstates, nstates, niterations], numpy.float64)
    for iteration in range(niterations):
        state_indices = ncfile.variables['states'][iteration,:]
        u_kln[state_indices,:,iteration] = energies[iteration,:,:]
    print "Done."

    # Compute total negative log probability over all iterations.
    u_n = numpy.zeros([niterations], numpy.float64)
    for iteration in range(niterations):
        u_n[iteration] = numpy.sum(numpy.diagonal(u_kln[:,:,iteration]))
    #print u_n

    # DEBUG
    outfile = open('u_n.out', 'w')
    for iteration in range(niterations):
        outfile.write("%8d %24.3f\n" % (iteration, u_n[iteration]))
    outfile.close()

    # Discard initial data to equilibration.
    u_kln_replica = u_kln_replica[:,:,ndiscard:]
    u_kln = u_kln[:,:,ndiscard:]
    u_n = u_n[ndiscard:]

    # Truncate to number of specified conformations to use
    if (nuse):
        u_kln_replica = u_kln_replica[:,:,0:nuse]
        u_kln = u_kln[:,:,0:nuse]
        u_n = u_n[0:nuse]

    # Subsample data to obtain uncorrelated samples
    N_k = numpy.zeros(nstates, numpy.int32)
    indices = timeseries.subsampleCorrelatedData(u_n) # indices of uncorrelated samples
    #print u_n # DEBUG
    #indices = range(0,u_n.size) # DEBUG - assume samples are uncorrelated
    N = len(indices) # number of uncorrelated samples
    N_k[:] = N
    u_kln[:,:,0:N] = u_kln[:,:,indices]
    print "number of uncorrelated samples:"
    print N_k
    print ""

    # Compute average enthalpies.
    H_k = numpy.zeros([nstates], numpy.float64) # H_i[i] is estimated enthalpy of state i
    dH_k = numpy.zeros([nstates], numpy.float64)
    for k in range(nstates):
        H_k[k] = u_kln[k,k,:].mean()
        dH_k[k] = u_kln[k,k,:].std() / numpy.sqrt(N)

    return (H_k, dH_k)