Example 1
def test_statistical_inefficiency_fft():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiency_fft(X[0])
    timeseries.statisticalInefficiency_fft(X[0]**2)
    timeseries.statisticalInefficiency_fft(energy[0])

    g0 = timeseries.statisticalInefficiency_fft(X[0])
    g1 = timeseries.statisticalInefficiency(X[0])
    g2 = timeseries.statisticalInefficiency(X[0], X[0])
    g3 = timeseries.statisticalInefficiency(X[0], fft=True)
    eq(g0, g1)
    eq(g0, g2)
    eq(g0, g3)
Example 2
def test_statistical_inefficiency_fft():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiency_fft(X[0])
    timeseries.statisticalInefficiency_fft(X[0] ** 2)
    timeseries.statisticalInefficiency_fft(energy[0])
    
    g0 = timeseries.statisticalInefficiency_fft(X[0])
    g1 = timeseries.statisticalInefficiency(X[0])
    g2 = timeseries.statisticalInefficiency(X[0], X[0])
    g3 = timeseries.statisticalInefficiency(X[0], fft=True)
    eq(g0, g1)
    eq(g0, g2)
    eq(g0, g3)
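The two tests above check that the direct and FFT estimators of the statistical inefficiency agree. Below is a minimal standalone sketch of the same comparison, assuming pymbar 3.x (which exposes the camelCase names used throughout these examples) and NumPy; the correlated series is built with np.repeat, as in Example 16 further down.

# A minimal sketch, assuming pymbar 3.x (camelCase API) and NumPy; not part of the test suite above.
import numpy as np
from pymbar import timeseries

x = np.repeat(np.random.normal(size=2000), 5)       # correlated series: every value repeated 5 times
g_direct = timeseries.statisticalInefficiency(x)     # direct estimator
g_fft = timeseries.statisticalInefficiency_fft(x)    # FFT-based estimator of the same quantity
print("g (direct) = %.2f, g (fft) = %.2f, expected about 5" % (g_direct, g_fft))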
Example 3
def detect_equilibration(A_t):
    """
    Automatically detect equilibrated region.

    ARGUMENTS

    A_t (np.array) - timeseries

    RETURNS

    t (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples   
    
    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)

    g_t = np.ones([T - 1], np.float32)
    Neff_t = np.ones([T - 1], np.float32)
    for t in range(T - 1):
        g_t[t] = timeseries.statisticalInefficiency(A_t[t:T])
        Neff_t[t] = (T - t + 1) / g_t[t]

    Neff_max = Neff_t.max()
    t = Neff_t.argmax()
    g = g_t[t]

    return (t, g, Neff_max)
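A hedged usage sketch for the helper above, assuming pymbar 3.x and NumPy: detect the equilibrated region of a synthetic series, discard the transient, then thin with subsampleCorrelatedData (the same pattern several later examples use).

# Usage sketch under stated assumptions (pymbar 3.x, NumPy); A_t is synthetic illustration data.
import numpy as np
from pymbar import timeseries

A_t = np.concatenate([np.linspace(4.0, 0.0, 100),    # decaying equilibration transient
                      np.random.normal(size=400)])    # stationary production region
t, g, Neff_max = detect_equilibration(A_t)            # helper defined above
indices = timeseries.subsampleCorrelatedData(A_t[t:], g=g)
A_uncorrelated = A_t[t:][indices]
print("t = %d, g = %.2f, Neff_max = %.1f, kept %d samples" % (t, g, Neff_max, len(A_uncorrelated)))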
Example 4
def get_decorrelation_time(timeseries_to_analyze):
    """
    Compute the decorrelation times given a timeseries.

    See the ``pymbar.timeseries.statisticalInefficiency`` for full documentation
    """
    return timeseries.statisticalInefficiency(timeseries_to_analyze)
Example 5
def get_decorrelation_time(timeseries_to_analyze):
    """
    Compute the decorrelation times given a timeseries.

    See the ``pymbar.timeseries.statisticalInefficiency`` for full documentation
    """
    return timeseries.statisticalInefficiency(timeseries_to_analyze)
Example 6
def prepWindow(filename, tstart=0, tstop=None):
    """
    Read window .traj file, compute correlation times, subsample data.

    Parameters
    ----------
    filename: string name of the file to process.
       For *.traj file, assumes all lines are data (e.g. no comment lines).
    tstart: integer nanosecond start time
    tstop: integer nanosecond stop time

    Returns
    -------
    counts: int, number of entries for this particular window
    winZ: numpy list containing SUBSAMPLED data for this window from tstart to tstop

    """
    # Parse data.
    n, z_sub = parseWindow(filename, tstart, tstop)

    # Compute correlation times for z (actual spring center position) timeseries.
    g = timeseries.statisticalInefficiency(z_sub)
    print "Correlation time for %s is %10.3f" % (re.split('\W+',
                                                          filename)[1], g)
    indices = timeseries.subsampleCorrelatedData(z_sub, g)

    # Subsample data.
    zsublen = len(indices)
    z_sub = z_sub[indices]
    return zsublen, z_sub
Example 7
def test_statistical_inefficiency():
    """Test the statistical inefficiency calculation utility."""

    data_size = 200000

    random_array = np.random.rand(data_size)
    numpy_vector_array = []

    for i in range(data_size):
        numpy_vector_array.append([random_array[i]])

    a = np.array(numpy_vector_array)

    statistical_inefficiency = timeseries.calculate_statistical_inefficiency(
        a, minimum_samples=3)
    pymbar_statistical_inefficiency = pymbar_timeseries.statisticalInefficiency(
        a, mintime=3)

    print(
        "utils: {}, pymbar: {}",
        statistical_inefficiency,
        pymbar_statistical_inefficiency,
    )

    assert abs(statistical_inefficiency -
               pymbar_statistical_inefficiency) < 0.00001
Example 8
def detect_equilibration(A_t):
    """
    Automatically detect equilibrated region.

    ARGUMENTS

    A_t (np.array) - timeseries

    RETURNS

    t (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples   
    
    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)
    
    g_t = np.ones([T-1], np.float32)
    Neff_t = np.ones([T-1], np.float32)
    for t in range(T-1):
        g_t[t] = timeseries.statisticalInefficiency(A_t[t:T])
        Neff_t[t] = (T-t+1) / g_t[t]
    
    Neff_max = Neff_t.max()
    t = Neff_t.argmax()
    g = g_t[t]
    
    return (t, g, Neff_max)
Example 9
File: plotXY.py Project: vtlim/misc
def subsample(x, y_mat, num_cols=None):
    """
    Parameters
    ----------
    x : numpy array
        1-dimensional array with x-data, such as timestep.
    y_mat : can take various forms:
        - list of numpy arrays, such as grouping 1-column data into smaller data series
        - 1D numpy array, such as subsampling 1-column data
        - multidimensional numpy array, if data has many columns
    num_cols : int (opt.)
        Number of data series for the input y_mat. Use this value to loop
        over the input data, since it can be formatted as 1- or N-dimensional
        list or numpy array. If num_cols is not specified, the value will be
        extracted from the input data using the find_num_cols function.

    Returns
    -------
    x_mat : list
        multi-dimensional array of the same shape as z_mat
    z_mat : list
        multi-dimensional array in which z_mat[i][j] is the jth value in the ith data series.

    """
    from pymbar import timeseries

    x_mat = []
    z_mat = [] # subsampled y_mat

    if num_cols is None:
        num_cols = find_num_cols(y_mat)

    for i in range(num_cols):

        # list of np arrays
        if type(y_mat) is list and len(y_mat[0]) > 1:
            y = y_mat[i]

        # 1D np array
        elif type(y_mat) is np.ndarray and len(y_mat.shape) == 1:
            y = y_mat

        # multidimensional np array
        else:
            y = y_mat[:,i]

        # compute correlation times
        g = timeseries.statisticalInefficiency(y)
        indices = timeseries.subsampleCorrelatedData(y, g)

        # subsample data
        y_sub = y[indices]
        x_sub = x[indices]
        z_mat.append(y_sub)
        x_mat.append(x_sub)

        print("\nLength of original timeseries data: %d" % len(y) )
        print("\nLength of subsampled timeseries data: %d" % len(y_sub) )

    return x_mat, z_mat
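A hedged usage sketch for subsample() above; num_cols is passed explicitly so the find_num_cols helper (not shown here) is not needed, and the two-column array is synthetic stand-in data. Assumes pymbar and NumPy.

# Usage sketch under stated assumptions; the data below are synthetic stand-ins.
import numpy as np

x = np.arange(1500)                                                  # e.g. timestep axis
y_mat = np.column_stack([np.repeat(np.random.normal(size=500), 3),   # two correlated columns
                         np.repeat(np.random.normal(size=500), 3)])
x_mat, z_mat = subsample(x, y_mat, num_cols=2)
print(len(z_mat), "data series;", len(z_mat[0]), "subsampled points in the first")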
Example 10
def equilibrium_detection(df, series=None, lower=None, upper=None, step=None):
    """Subsample a DataFrame using automated equilibrium detection on a timeseries.

    If `series` is ``None``, then this function will behave the same as
    :func:`slicing`.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample according to equilibrium detection on `series`.
    series : Series
        Series to detect equilibration on. If ``None``, no equilibrium
        detection-based subsampling will be performed.
    lower : float
        Lower bound to pre-slice `series` data from.
    upper : float
        Upper bound to pre-slice `series` to (inclusive).
    step : int
        Step between `series` items to pre-slice by.

    Returns
    -------
    DataFrame
        `df` subsampled according to subsampled `series`.

    See Also
    --------
    pymbar.timeseries.detectEquilibration : detailed background

    """
    if _check_multiple_times(df):
        raise KeyError("Duplicate time values found; equilibrium detection "
                       "is only meaningful for a single, contiguous, "
                       "and sorted timeseries.")

    if not _check_sorted(df):
        raise KeyError("Equilibrium detection only works as expected if "
                       "values are sorted by time, increasing.")

    if series is not None:
        series = slicing(series, lower=lower, upper=upper, step=step)

        # calculate statistical inefficiency of series
        statinef = statisticalInefficiency(series)

        # calculate statistical inefficiency of series, with equilibrium detection
        t, statinef, Neff_max = detectEquilibration(series.values)

        # round the statistical inefficiency to the nearest integer
        statinef = int(np.rint(statinef))

        # subsample according to statistical inefficiency
        series = series.iloc[t::statinef]

        df = df.loc[series.index]
    else:
        df = slicing(df, lower=lower, upper=upper, step=step)

    return df
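A hedged sketch of the core pattern above (detectEquilibration followed by strided iloc slicing), stripped of the alchemlyb-specific slicing and sanity checks; assumes pymbar 3.x and pandas, with a synthetic Series standing in for real data.

# Sketch under stated assumptions (pymbar 3.x, pandas, NumPy); not the alchemlyb implementation itself.
import numpy as np
import pandas as pd
from pymbar.timeseries import detectEquilibration

series = pd.Series(np.repeat(np.random.normal(size=500), 4))   # correlated synthetic data
t, statinef, Neff_max = detectEquilibration(series.values)
statinef = int(np.rint(statinef))                               # integer stride for slicing
subsampled = series.iloc[t::statinef]                           # equilibrated, decorrelated slice
print("kept %d of %d points" % (len(subsampled), len(series)))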
Example 11
def mean_and_uncertainty(series: Series, inefficiency=None) -> (float, float):
    from pymbar import timeseries

    ave = np.mean(series)
    array = np.array(series)
    if inefficiency is None:
        inefficiency = timeseries.statisticalInefficiency(array)
    return ave, np.std(array, ddof=1) / math.sqrt(len(array) / inefficiency)
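The uncertainty above is the sample standard deviation divided by the square root of the effective sample count N/g, so correlation inflates the error bar by roughly sqrt(g). A hedged numeric sketch, assuming the snippet's module-level numpy/math/pandas imports are in place and pymbar is installed.

# Numeric sketch under stated assumptions; the Series below is synthetic illustration data.
import math
import numpy as np
from pandas import Series

series = Series(np.repeat(np.random.normal(loc=1.0, size=400), 5))      # g should come out near 5
ave, err = mean_and_uncertainty(series)
naive_err = np.std(np.array(series), ddof=1) / math.sqrt(len(series))   # ignores correlation
print("mean = %.3f, corrected error = %.4f, naive error = %.4f" % (ave, err, naive_err))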
Example 12
    def subsample_gradients(self):
        r''' Method to subsample gradients and get a better estimate.
        '''
        if self.percentage == 100 and not self.subsample:
            warnings.warn(
                "You are not subsampling your data according to the statistical inefficiency nor are "
                "you discarding initial data. Please set percentage to another value than 100!"
            )
        percentage_removal = (self._N_k *
                              (1 - self.percentage / 100.0)).astype('int32')
        self._subsampled_N_k_gradients = self._N_k - percentage_removal
        N_max = int(numpy.max(self._subsampled_N_k_gradients))
        self._subsampled_grad_kn = numpy.zeros(shape=(self._N_k.shape[0],
                                                      N_max))
        for p in range(percentage_removal.shape[0]):
            start = percentage_removal[p]
            finish = percentage_removal[p] + N_max
            self._subsampled_grad_kn[p, :] = self._gradients_kn[p,
                                                                start:finish]
        if N_max <= 50:
            warnings.warn(
                "You have reduced your data to less than 50 samples, the results from these might not "
                "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option."
            )
        #if subsampling is percentage, then we are done here, otherwise we will now subsample according to timeseries

        if self.subsample:
            print(
                "#Subsampling gradients according to statistical inefficiency")
            #first we compute statistical inefficiency
            self._gradients_kn = self._subsampled_grad_kn.copy()
            self._N_k = self._subsampled_N_k_gradients.copy()

            g_k = numpy.zeros(shape=(self._gradients_kn.shape[0]))
            self._subsampled_N_k_gradients = numpy.zeros(
                shape=(self._gradients_kn.shape[0]))
            for i in range(g_k.shape[0]):
                g_k[i] = timeseries.statisticalInefficiency(
                    self._gradients_kn[i, :])
            g = int(numpy.max(g_k))
            #now we need to figure out what the indices in the data are for subsampling
            indices_k = []
            for i in range(g_k.shape[0]):
                indices_k.append(
                    timeseries.subsampleCorrelatedData(
                        self._gradients_kn[i, :], g=g))
                self._subsampled_N_k_gradients[i] = len(indices_k[i])
            N_max = int(numpy.max(self._subsampled_N_k_gradients))
            if N_max <= 50:
                warnings.warn(
                    "You have reduced your data to less than 50 samples, the results from these might not "
                    "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option."
                )
            self._subsampled_grad_kn = numpy.zeros(
                [self._gradients_kn.shape[0], N_max], numpy.float64)
            for k in range(self._gradients_kn.shape[0]):
                self._subsampled_grad_kn[k, :] = self._gradients_kn[
                    k, indices_k[k]]
Example 13
def si_data_bar_dhdl(data_bar_dhdl):
    si_l = {}
    for l in data_bar_dhdl:
        si_l[l] = []
        for i in range(len(data_bar_dhdl[l])):
            temp_si = timeseries.statisticalInefficiency(
                data_bar_dhdl[l][i].loc[:, l])
            si_l[l].append(temp_si)
    return si_l
Example 14
def decorrelate(traj, verbose=False, name=None):
    traj = np.array(traj)
    if traj.ndim == 1:
        idx = timeseries.subsampleCorrelatedData(traj)
        n0 = traj.size
        n1 = len(idx)
        res = traj[idx]
    elif traj.ndim == 2:
        # pymbar doesn't offer to decorrelate two samples, so let's do it ourselves
        # and just use the decorrelation of the sample more strongly correlated
        #
        # calculate (maximal) inefficiency
        g1 = timeseries.statisticalInefficiency(traj[0])
        g2 = timeseries.statisticalInefficiency(traj[1])
        g = np.max([g1, g2])
        # calculate index
        n0 = traj.shape[1]
        idx = np.unique(
            np.array(np.round(np.arange(0, int(n0 / g + .5)) * g), dtype=int))
        idx = idx[idx < n0]
        n1 = len(idx)
        res = traj[:, idx]
    else:
        raise NotImplementedError(
            'trajectory.decorrelate() is not implemented for '
            'trajectories with more than 1 dimension.')
    if verbose:
        n = n0 - n1
        if not name:
            name = 'Trajectory'
        if n == 0:
            print('{:s} decorrelation: No frames discarded for decorrelation.'.
                  format(name))
        elif n == 1:
            print('{:s} decorrelation: 1 frame ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(
                      name, 1 / n0))
        else:
            print('{:s} decorrelation: {:d} frames ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(
                      name, n, n / n0))

    return res
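A hedged usage sketch for decorrelate() above, exercising the two-sample branch; assumes pymbar 3.x and NumPy, with synthetic correlated trajectories.

# Usage sketch under stated assumptions; traj is synthetic illustration data.
import numpy as np
from pymbar import timeseries

traj = np.repeat(np.random.normal(size=(2, 500)), 4, axis=1)    # two correlated series of length 2000
decorrelated = decorrelate(traj, verbose=True, name='Synthetic pair')
print("shape after decorrelation:", decorrelated.shape)          # roughly (2, 500)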
Example 15
def si_skips_data_dEs(dEs, nfr_mul, skip=1):
    c = 0
    skips = []
    for i in range(len(nfr_mul)):
        n_frms_list = nfr_mul[i]
        for n_frms in n_frms_list:
            if n_frms:
                c_end = c + n_frms
                temp_si = timeseries.statisticalInefficiency(dEs[i][c:c_end])
                skips.append(int(temp_si * skip) + 1)
                c = c_end
    return skips
Example 16
def test_statistical_inefficiency_fft_gaussian():

    # Run multiple times to get things with and without negative "spikes" at C(1)
    for i in range(5):
        x = np.random.normal(size=100000)
        g0 = timeseries.statisticalInefficiency(x, fast=False)
        g1 = timeseries.statisticalInefficiency(x, x, fast=False)
        g2 = timeseries.statisticalInefficiency_fft(x)
        g3 = timeseries.statisticalInefficiency(x, fft=True)
        eq(g0, g1, decimal=5)
        eq(g0, g2, decimal=5)
        eq(g0, g3, decimal=5)

        eq(np.log(g0), np.log(1.0), decimal=1)

    for i in range(5):
        x = np.random.normal(size=100000)
        x = np.repeat(
            x, 3
        )  # e.g. Construct correlated gaussian e.g. [a, b, c] -> [a, a, a, b, b, b, c, c, c]
        g0 = timeseries.statisticalInefficiency(x, fast=False)
        g1 = timeseries.statisticalInefficiency(x, x, fast=False)
        g2 = timeseries.statisticalInefficiency_fft(x)
        g3 = timeseries.statisticalInefficiency(x, fft=True)
        eq(g0, g1, decimal=5)
        eq(g0, g2, decimal=5)
        eq(g0, g3, decimal=5)

        eq(np.log(g0), np.log(3.0), decimal=1)
Example 17
def test_statistical_inefficiency_fft_gaussian():
    
    # Run multiple times to get things with and without negative "spikes" at C(1)
    for i in range(5):
        x = np.random.normal(size=100000)
        g0 = timeseries.statisticalInefficiency(x, fast=False)
        g1 = timeseries.statisticalInefficiency(x, x, fast=False)
        g2 = timeseries.statisticalInefficiency_fft(x)
        g3 = timeseries.statisticalInefficiency(x, fft=True)
        eq(g0, g1, decimal=5)
        eq(g0, g2, decimal=5)
        eq(g0, g3, decimal=5)
        
        eq(np.log(g0), np.log(1.0), decimal=1)

    for i in range(5):
        x = np.random.normal(size=100000)
        x = np.repeat(x, 3)  # e.g. Construct correlated gaussian e.g. [a, b, c] -> [a, a, a, b, b, b, c, c, c]
        g0 = timeseries.statisticalInefficiency(x, fast=False)
        g1 = timeseries.statisticalInefficiency(x, x, fast=False)
        g2 = timeseries.statisticalInefficiency_fft(x)
        g3 = timeseries.statisticalInefficiency(x, fft=True)
        eq(g0, g1, decimal=5)
        eq(g0, g2, decimal=5)
        eq(g0, g3, decimal=5)

        eq(np.log(g0), np.log(3.0), decimal=1)
Example 18
def subsampletimeseries(timeser, xyzn, N_k):
    """
    Return a subsampled timeseries based on statistical inefficiency calculations.
    Parameters
    ----------
    timeser: the timeseries to be subsampled
    xyzn: the coordinates associated with each frame of the timeseries to be subsampled
    N_k: original # of samples in each timeseries

    Returns
    ---------
    N_k_sub: new number of samples per timeseries
    ts_sub: the subsampled timeseries
    xyz_sub: the subsampled configuration series
    """
    # Make a copy of the timeseries and make sure is numpy array of floats
    ts = timeser
    xyz = xyzn

    # initialize array of statistical inefficiencies
    g = np.zeros(len(ts), np.float64)

    for i, t in enumerate(ts):
        if np.count_nonzero(t) == 0:
            g[i] = 1.0
            print("WARNING FLAG")
        else:
            g[i] = timeseries.statisticalInefficiency(t)

    N_k_sub = np.array([
        len(timeseries.subsampleCorrelatedData(t, g=b)) for t, b in zip(ts, g)
    ])
    ind = [timeseries.subsampleCorrelatedData(t, g=b) for t, b in zip(ts, g)]

    if (N_k_sub == N_k).all():
        ts_sub = ts
        xyz_sub = xyz
        print "No sub-sampling occurred"
    else:
        print "Sub-sampling..."
        ts_sub = np.array([
            t[timeseries.subsampleCorrelatedData(t, g=b)]
            for t, b in zip(ts, g)
        ])
        #for c in xyz:
        #    xyz_sub = [c[timeseries.subsampleCorrelatedData(t,g=b)] for t,b in zip(ts,g)]
        for i, j in enumerate(xyz):
            xyz_sub = [j[ii] for ii in ind[i]]

    return ts_sub, N_k_sub, xyz_sub, ind
Example 19
def getNkandUkln():
    # u_kln = u_klt
    # N_k = [maxn]*K
    # return (N_k, u_kln)

    """Identifies uncorrelated samples and updates the arrays of the reduced potential energy and dhdlt retaining data entries of these samples only."""
    u_kln = np.zeros([K,K,maxn], np.float64) # u_kln[k,m,n] is the reduced potential energy of uncorrelated sample index n from state k evaluated at state m
    N_k = np.zeros(K, int) # N_k[k] is the number of uncorrelated samples from state k
    g = np.zeros(K,float) # autocorrelation times for the data
    print "Number of correlated and uncorrelated samples:\n\n%8s %10s %12s %12s" % ('Lambda', 'N', 'N_k', 'N/N_k')
    for k in range(K):
        if k == 0:
            g[k] = timeseries.statisticalInefficiency(u_klt[k,k+1,:])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k,k+1,:])) # indices of uncorrelated samples
        else:
            g[k] = timeseries.statisticalInefficiency(u_klt[k,k-1,:])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k,k-1,:]))
        N = len(indices) # number of uncorrelated samples
        N_k[k] = N # Store the number of uncorrelated samples from state k.
        for l in range(K):
            u_kln[k,l,0:N] = u_klt[k,l,indices]
        print "%6.2f %12s %12s %12.2f" % (l_list[k], maxn, N_k[k], g[k])
    print ''
    return (N_k, u_kln)
Example 20
    def subsample_energies(self):
        r''' Subsample u_kln according to percentage, i.e. remove initial equilibration data, then optionally subsample further according to the timeseries analysis.

        '''
        #removing percent
        if self.percentage == 100 and not self.subsample:
            warnings.warn("You are not subsampling your data according to the statistical inefficiency nor are "
                           "you discarding initial data. Please set percentage to another value than 100!")

        percentage_removal = (self._N_k*(1-self.percentage/100.0)).astype('int32')
        self._subsampled_N_k_energies = self._N_k-percentage_removal
        N_max = int(numpy.max(self._subsampled_N_k_energies))
        self._subsampled_u_kln = numpy.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max))
        self._subsampled_energies_kn = numpy.zeros(shape=(self._N_k.shape[0], N_max))
        for k in range(0, self._N_k.shape[0]):
            self._subsampled_u_kln[k] = self._u_kln[k,:,percentage_removal[k]:percentage_removal[k]+N_max]
            self._subsampled_energies_kn[k] = self._energies_kn[k,percentage_removal[k]:percentage_removal[k]+N_max]
        if N_max <=50:
            warnings.warn("You have reduced your data to less than 50 samples, the results from these might not "
                           "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")

        #Now we are doing some additional subsampling according to timeseries analysis
        if self.subsample:
            print("#Subsampling energies according to statistical inefficiency for pymbar")

            self._u_kln = self._subsampled_u_kln.copy()
            self._N_k = self._subsampled_N_k_energies.copy()
            self._energies_kn = self._subsampled_energies_kn.copy()
            #first we compute statistical inefficiency
            g_k = numpy.zeros(shape=(self._energies_kn.shape[0]))
            for i in range(g_k.shape[0]):
                g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i,percentage_removal[i]:])
            g = numpy.max(g_k)
            #now we need to figure out what the indices in the data are for subsampling
            indices_k = []
            self._subsampled_N_k_energies = numpy.zeros(shape=(self._energies_kn.shape[0]))
            for i in range(g_k.shape[0]):
                indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i,:], g=g))
                self._subsampled_N_k_energies[i]=len(indices_k[i])
            #self._subsampled_N_k_energies = (numpy.ceil(self._N_k / g)).astype(int)
            N_max = int(numpy.max(self._subsampled_N_k_energies))
            if N_max <=50:
                warnings.warn("You have reduced your data to less than 50 samples, the results from these might not "
                               "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")
            self._subsampled_u_kln = numpy.zeros([self._gradients_kn.shape[0],self._gradients_kn.shape[0], N_max], numpy.float64)
            for k in range(self._gradients_kn.shape[0]):
                self._subsampled_u_kln[k,:,:] = self._u_kln[k,:,indices_k[k]].transpose()
Example 21
def detect_equilibration(A_t, nskip=1, method='fft'):
    """
    Automatically detect equilibrated region.

    ARGUMENTS

    A_t (numpy.array) - timeseries

    OPTIONAL ARGUMENTS

    nskip (int) - resolution of analysis for determining equilibration (default: 1)
    method (string) - method to use for statistical inefficiency calculation (default: 'fft')

    RETURNS

    t0 (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples

    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)

    indices = range(0, T-1, nskip)
    N = len(indices)
    t0_n = numpy.ones([N], numpy.float32)
    g_n = numpy.ones([N], numpy.float32)
    Neff_n = numpy.ones([N], numpy.float32)
    for n in range(N):
        t0 = nskip*n
        t0_n[n] = t0
        g_n[n] = timeseries.statisticalInefficiency(A_t[t0:T], method=method)
        Neff_n[n] = (T-t0) / g_n[n]

    Neff_max = Neff_n.max()
    n = Neff_n.argmax()
    t0 = t0_n[n]
    g = g_n[n]

    return (t0, g, Neff_max)
Example 22
def detect_equilibration(A_t, nskip=1, method='fft'):
    """
    Automatically detect equilibrated region.

    ARGUMENTS

    A_t (numpy.array) - timeseries

    OPTIONAL ARGUMENTS

    nskip (int) - resolution of analysis for determining equilibration (default: 1)
    method (string) - method to use for statistical inefficiency calculation (default: 'fft')

    RETURNS

    t0 (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples

    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)

    indices = range(0, T - 1, nskip)
    N = len(indices)
    t0_n = numpy.ones([N], numpy.float32)
    g_n = numpy.ones([N], numpy.float32)
    Neff_n = numpy.ones([N], numpy.float32)
    for n in range(N):
        t0 = nskip * n
        t0_n[n] = t0
        g_n[n] = timeseries.statisticalInefficiency(A_t[t0:T], method=method)
        Neff_n[n] = (T - t0) / g_n[n]

    Neff_max = Neff_n.max()
    n = Neff_n.argmax()
    t0 = t0_n[n]
    g = g_n[n]

    return (t0, g, Neff_max)
Example 23
    def subsample_energies(self):
        if self.subsample_method!='timeseries':
            print("We are only eliminating samples from the beginning of the data and are still working with highly"
                  " correlated data!")

            if self.percentage ==100:
                RuntimeWarning("You are not subsampling your data according to the statistical inefficiency nor are"
                               "you discarding initial data. Please set percentage to another value than 100!")

            percentage_removal = self._N_k*(1-self.percentage/100.0)
            self._subsampled_N_k_energies = self._N_k-percentage_removal
            N_max = np.max(self._subsampled_N_k_energies)
            self._subsampled_u_kln = np.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max))
            for i in range(percentage_removal.shape[0]):
                for j in range(percentage_removal.shape[0]):
                    self._subsampled_u_kln[i,j,:] = self._u_kln[i,j,percentage_removal[j]:]
            if N_max <=100:
                RuntimeWarning("You have reduced your data to less than 100 samples, the results from these might not "
                               "be trustworthy. ")
        else:
            print("We are doing a timeseries analysis using the timeseries analysis module in pymbar and will subsample"
                  " according to that.")

            #first we compute statistical inefficiency
            g_k = np.zeros(shape=(self._energies_kn.shape[0]))
            for i in range(g_k.shape[0]):
                g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i,:])
            g = np.max(g_k)
            #now we need to figure out what the indices in the data are for subsampling
            indices_k = []
            self._subsampled_N_k_energies = np.zeros(shape=(self._gradients_kn.shape[0]))
            for i in range(g_k.shape[0]):
                indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i,:], g=g))
                self._subsampled_N_k_energies[i]=len(indices_k[i])
            #self._subsampled_N_k_energies = (np.ceil(self._N_k / g)).astype(int)
            N_max = np.max(self._subsampled_N_k_energies)
            if N_max <=100:
                RuntimeWarning("You have reduced your data to less than 100 samples, the results from these might not "
                               "be trustworthy. ")
            self._subsampled_u_kln = np.zeros([self._gradients_kn.shape[0],self._gradients_kn.shape[0], N_max], np.float64)
            for k in range(self._gradients_kn.shape[0]):
                self._subsampled_u_kln[k,:,:] = self._u_kln[k,:,indices_k[k]].transpose()
Example 24
def get_equilibration_data_per_sample(timeseries_to_analyze,
                                      fast=True,
                                      nskip=1):
    """
    Compute the correlation time and n_effective per sample.

    This is exactly what ``pymbar.timeseries.detectEquilibration`` does, but returns the per sample data

    See the ``pymbar.timeseries.detectEquilibration`` function for full documentation
    """
    A_t = timeseries_to_analyze
    T = A_t.size
    g_t = np.ones([T - 1], np.float32)
    Neff_t = np.ones([T - 1], np.float32)
    for t in range(0, T - 1, nskip):
        try:
            g_t[t] = timeseries.statisticalInefficiency(A_t[t:T], fast=fast)
        except:
            g_t[t] = (T - t + 1)
        Neff_t[t] = (T - t + 1) / g_t[t]
    return g_t, Neff_t
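A hedged sketch of how the per-sample output above can be reduced to a single equilibration point: the start index that maximizes the effective sample count, which is what pymbar.timeseries.detectEquilibration itself returns. Assumes pymbar 3.x and NumPy; note that with nskip > 1 the entries that were skipped keep their default value of 1.

# Sketch under stated assumptions; A_t is synthetic illustration data.
import numpy as np
from pymbar import timeseries

A_t = np.concatenate([np.linspace(3.0, 0.0, 200), np.random.normal(size=800)])
g_t, Neff_t = get_equilibration_data_per_sample(A_t, fast=True, nskip=10)
t0 = Neff_t.argmax()                     # candidate equilibration start (a multiple of nskip)
print("t0 = %d, g = %.2f, Neff = %.1f" % (t0, g_t[t0], Neff_t[t0]))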
Example 25
    def subsampling(self, integratedACF=True):
        """
        Performs inline subsampling based on the statistical inefficiency ``g``
        of the specified attribute `acfun` of :class:`sample`, aiming at
        obtaining a sample of :term:`IID` configurations. Subsampling is done
        via jumps of varying sizes around ``g``, so that the sample size decays
        by a factor of approximately ``1/g``.

        Parameters
        ----------
            integratedACF : bool, optional, default=True
                If true, the integrated :term:`ACF` method :cite:`Chodera_2007`
                will be used for computing the statistical inefficiency.
                Otherwise, the :term:`OBM` method will be used instead.

        Returns
        -------
            :class:`sample`
                Although the subsampling is done inline, the new sample is
                returned for chaining purposes.

        """
        n = len(self.dataset)
        if mics.verbose:
            info("\n=== Subsampling via %s ===" %
                 ("integrated ACF" if integratedACF else "OBM"))
            info("Original sample size:", n)
        if integratedACF:
            y = multimap([self.acfun.lambdify()], self.dataset)
            g = timeseries.statisticalInefficiency(y[0])
        else:
            g = n / self.neff
        new = timeseries.subsampleCorrelatedData(self.dataset.index, g)
        self.dataset = self.dataset.reindex(new)
        self.neff = len(new)
        if mics.verbose:
            info("Statistical inefficiency:", g)
            info("New sample size:", self.neff)
        return self
Example 26
def get_equilibration_data_per_sample(timeseries_to_analyze, fast=True, max_subset=100):
    """
    Compute the correlation time and n_effective per sample with tuning to how you want your data formatted

    This is a modified pass-through to ``pymbar.timeseries.detectEquilibration``, returning the per-sample data.

    It has been modified to specify the maximum number of time points to consider, evenly spaced over the timeseries.
    This is different than saying "I want analysis done every X for total points Y = len(timeseries)/X",
    this is "I want Y total analysis points"

    See the ``pymbar.timeseries.detectEquilibration`` function for full algorithm documentation

    Parameters
    ----------
    timeseries_to_analyze : np.ndarray
        1-D timeseries to analyze for equilibration
    max_subset : int >= 1 or None, optional, default: 100
        Maximum number of points in the ``timeseries_to_analyze`` on which to analyze the equilibration.
        These are distributed uniformly over the timeseries so the final output will be size max_subset where indices
        are placed  approximately every ``(len(timeseries_to_analyze) - 1) / max_subset``.
        The full timeseries is used if the timeseries is smaller than ``max_subset`` or if ``max_subset`` is None
    fast : bool, optional. Default: True
        If True, will use faster (but less accurate) method to estimate correlation time
        passed on to timeseries module.

    Returns
    -------
    i_t : np.ndarray of int
        Indices of the timeseries which were sampled from
    g_i : np.ndarray of float
        Estimated statistical inefficiency at t in units of index count.
        Equal to 1 + 2 tau, where tau is the correlation time
        Will always be >= 1

        e.g. if g_i[x] = 4.3, then choosing x as your equilibration point means that every ``ceil(4.3)``-th sample of
        ``timeseries_to_analyze`` is decorrelated, so the fully equilibrated, decorrelated timeseries would be
        indexed by [x, x+5, x+10, ..., X), where X is the final point in ``timeseries_to_analyze``.

        The "index count" here refers to indices of ``timeseries_to_analyze``, NOT of ``i_t``.

    n_effective_i : np.ndarray of float
        Number of effective samples obtained by subsampling every ``g_i`` from index t. This includes the fractional
        part, so the true number of points is the floor of this output.

        The "index count" here refers to indices of ``timeseries_to_analyze``, NOT of ``i_t``.

    """
    # Cast to array if not already
    series = np.array(timeseries_to_analyze)
    # Special trap for constant series
    time_size = series.size
    set_size = time_size - 1  # Cannot analyze the last entry
    # Set maximum
    if max_subset is None or set_size < max_subset:
        max_subset = set_size
    # Special trap for series of size 1
    if max_subset == 0:
        max_subset = 1
    # Special trap for constant or size 1 series
    if series.std() == 0.0 or max_subset == 1:
        return (np.arange(max_subset, dtype=int),  # i_t
                np.array([1]*max_subset),  # g_i
                np.arange(time_size, time_size-max_subset, -1)  # n_effective_i
                )
    g_i = np.ones([max_subset], np.float32)
    n_effective_i = np.ones([max_subset], np.float32)
    counter = np.arange(max_subset)
    i_t = np.floor(counter * time_size / max_subset).astype(int)
    for i, t in enumerate(i_t):
        try:
            g_i[i] = timeseries.statisticalInefficiency(series[t:], fast=fast)
        except:
            g_i[i] = (time_size - t + 1)
        n_effective_i[i] = (time_size - t + 1) / g_i[i]
    return i_t, g_i, n_effective_i
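A hedged usage sketch for the function above, assuming pymbar 3.x and NumPy; the series is synthetic and max_subset is lowered so only 50 candidate start points are evaluated.

# Sketch under stated assumptions; picks the candidate start point with the most effective samples.
import numpy as np
from pymbar import timeseries

series = np.concatenate([np.linspace(2.0, 0.0, 300), np.random.normal(size=1700)])
i_t, g_i, n_effective_i = get_equilibration_data_per_sample(series, fast=True, max_subset=50)
best = n_effective_i.argmax()
print("equilibrate from frame %d: g = %.2f, Neff = %.1f" % (i_t[best], g_i[best], n_effective_i[best]))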
Example 27
        # Parse data.
        n = 0
        for line in lines:
            if line[0] != '#' and line[0] != '@':
                tokens = line.split()
                print(tokens)
                u_kn[k, n] = beta_k[k] * (
                    float(tokens[2]) - float(tokens[1])
                )  # reduced potential energy without umbrella restraint
                n += 1

    # Compute correlation times for potential energy and chi
    # timeseries.  If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi

    if (DifferentTemperatures):
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k, :], u_kn[k,
                                                                     0:N_k[k]])
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]])
    else:
        chi_radians = chi_kn[k, 0:N_k[k]] / (180.0 / numpy.pi)
        g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
        g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
        print("g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin))
        g_k[k] = max(g_cos, g_sin)
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])
    # Subsample data.
    N_k[k] = len(indices)
    u_kn[k, 0:N_k[k]] = u_kn[k, indices]
    chi_kn[k, 0:N_k[k]] = chi_kn[k, indices]
Example 28
def get_equilibration_data_per_sample(timeseries_to_analyze,
                                      fast=True,
                                      max_subset=100):
    """
    Compute the correlation time and n_effective per sample with tuning to how you want your data formatted

    This is a modified pass-through to ``pymbar.timeseries.detectEquilibration``, returning the per-sample data.

    It has been modified to specify the maximum number of time points to consider, evenly spaced over the timeseries.
    This is different than saying "I want analysis done every X for total points Y = len(timeseries)/X",
    this is "I want Y total analysis points"

    See the ``pymbar.timeseries.detectEquilibration`` function for full algorithm documentation

    Parameters
    ----------
    timeseries_to_analyze : np.ndarray
        1-D timeseries to analyze for equilibration
    max_subset : int >= 1 or None, optional, default: 100
        Maximum number of points in the ``timeseries_to_analyze`` on which to analyze the equilibration.
        These are distributed uniformly over the timeseries so the final output will be size max_subset where indices
        are placed  approximately every ``(len(timeseries_to_analyze) - 1) / max_subset``.
        The full timeseries is used if the timeseries is smaller than ``max_subset`` or if ``max_subset`` is None
    fast : bool, optional. Default: True
        If True, will use faster (but less accurate) method to estimate correlation time
        passed on to timeseries module.

    Returns
    -------
    i_t : np.ndarray of int
        Indices of the timeseries which were sampled from
    g_i : np.ndarray of float
        Estimated statistical inefficiency at t in units of index count.
        Equal to 1 + 2 tau, where tau is the correlation time
        Will always be >= 1

        e.g. if g_i[x] = 4.3, then choosing x as your equilibration point means that every ``ceil(4.3)``-th sample of
        ``timeseries_to_analyze`` is decorrelated, so the fully equilibrated, decorrelated timeseries would be
        indexed by [x, x+5, x+10, ..., X), where X is the final point in ``timeseries_to_analyze``.

        The "index count" here refers to indices of ``timeseries_to_analyze``, NOT of ``i_t``.

    n_effective_i : np.ndarray of float
        Number of effective samples obtained by subsampling every ``g_i`` from index t. This includes the fractional
        part, so the true number of points is the floor of this output.

        The "index count" here refers to indices of ``timeseries_to_analyze``, NOT of ``i_t``.

    """
    # Cast to array if not already
    series = np.array(timeseries_to_analyze)
    # Special trap for constant series
    time_size = series.size
    set_size = time_size - 1  # Cannot analyze the last entry
    # Set maximum
    if max_subset is None or set_size < max_subset:
        max_subset = set_size
    # Special trap for series of size 1
    if max_subset == 0:
        max_subset = 1
    # Special trap for constant or size 1 series
    if series.std() == 0.0 or max_subset == 1:
        return (
            np.arange(max_subset, dtype=int),  # i_t
            np.array([1] * max_subset),  # g_i
            np.arange(time_size, time_size - max_subset, -1)  # n_effective_i
        )
    g_i = np.ones([max_subset], np.float32)
    n_effective_i = np.ones([max_subset], np.float32)
    counter = np.arange(max_subset)
    i_t = np.floor(counter * time_size / max_subset).astype(int)
    for i, t in enumerate(i_t):
        try:
            g_i[i] = timeseries.statisticalInefficiency(series[t:], fast=fast)
        except:
            g_i[i] = (time_size - t + 1)
        n_effective_i[i] = (time_size - t + 1) / g_i[i]
    return i_t, g_i, n_effective_i
Example 29
        cluster_bin_kn = -1*numpy.ones([K,N_samples], numpy.int32) # cluster_bin_kn[k,n] is the cluster bin index of snapshot n of umbrella simulation k
    N_k = numpy.zeros([K], numpy.int32) # N_k[k] is the number of uncorrelated samples from simulation index k
    reduced_expectation_data = []
    if len(expectation_columns) > 0:
        for i in range(len(expectation_columns)):
            reduced_expectation_data.append(numpy.zeros([K,N_samples], numpy.float64))
    reduced_fep_data = []
    if len(fep_columns) > 0:
        for i in range(len(fep_columns)):
            reduced_fep_data.append(numpy.zeros([K,N_samples], numpy.float64))
    for k in range(K):
        # Extract timeseries.
        A_t = biasing_variable_kt[0][k,:]
        # Compute statistical inefficiency.
        try:
            g = timeseries.statisticalInefficiency(A_t)
        except Exception as e:
            print(str(e))
            print(A_t)

        # Subsample data.
        if subsample_trajectories:
            indices = timeseries.subsampleCorrelatedData(A_t, g=g)
        else:
            indices = timeseries.subsampleCorrelatedData(A_t, g=1)
        N = len(indices) # number of uncorrelated samples
        print "k = %5d : g = %.1f, N = %d" % (k, g, N)
        for i in range(nbiases):
            biasing_variable_kn[i][k,0:N] = biasing_variable_kt[i][k,indices]
        for i in range(nperturbations+1):
            U_kn[i][k,0:N] = U_kt[i][k,indices]
Example 30
#========================================================================

#------------------------------------------------------------------------
# Read Data From File
#------------------------------------------------------------------------

print("")
print("Preparing data:")
T_from_file = read_simulation_temps(simulation,NumTemps)
E_from_file = read_total_energies(simulation,TE_COL_NUM)
K = len(T_from_file)
N_k = numpy.zeros(K,numpy.int32)
g = numpy.zeros(K,numpy.float64)

for k in range(K):  # subsample the energies
   g[k] = timeseries.statisticalInefficiency(E_from_file[k])
   indices = numpy.array(timeseries.subsampleCorrelatedData(E_from_file[k],g=g[k])) # indices of uncorrelated samples
   N_k[k] = len(indices) # number of uncorrelated samples
   E_from_file[k,0:N_k[k]] = E_from_file[k,indices]

#------------------------------------------------------------------------
# Insert Intermediate T's and corresponding blank U's and E's
#------------------------------------------------------------------------
Temp_k = T_from_file
minT = T_from_file[0]
maxT = T_from_file[len(T_from_file) - 1]
#beta = 1/(k*BT)
#T = 1/(kB*beta)
if dertype == 'temperature':
    minv = minT
    maxv = maxT
Example 31
def subtest_mcmc_expectation(testsystem, move_set):
    if debug:
        print(testsystem.__class__.__name__)
        print(str(move_set))

    # Test settings.
    temperature = 298.0 * units.kelvin
    pressure = 1.0 * units.atmospheres
    nequil = 10 # number of equilibration iterations
    niterations = 20 # number of production iterations

    # Retrieve system and positions.
    [system, positions] = [testsystem.system, testsystem.positions]
    
    platform_name = 'Reference'
    from simtk.openmm import Platform
    platform = Platform.getPlatformByName(platform_name)

    # Compute properties.
    kB = units.BOLTZMANN_CONSTANT_kB * units.AVOGADRO_CONSTANT_NA
    kT = kB * temperature
    ndof = 3*system.getNumParticles() - system.getNumConstraints()

    # Create thermodynamic state
    from repex.thermodynamics import ThermodynamicState
    thermodynamic_state = ThermodynamicState(system=testsystem.system, temperature=temperature, pressure=pressure)

    # Create MCMC sampler.
    from repex.mcmc import MCMCSampler
    sampler = MCMCSampler(thermodynamic_state, move_set=move_set, platform=platform)

    # Create sampler state.
    from repex.mcmc import SamplerState
    sampler_state = SamplerState(system=testsystem.system, positions=testsystem.positions, platform=platform)

    # Equilibrate
    for iteration in range(nequil):
        #print "equilibration iteration %d / %d" % (iteration, nequil)

        # Update sampler state.
        sampler_state = sampler.run(sampler_state, 1)

    # Accumulate statistics.
    x_n = np.zeros([niterations], np.float64) # x_n[i] is the x position of atom 1 after iteration i, in angstroms
    potential_n = np.zeros([niterations], np.float64) # potential_n[i] is the potential energy after iteration i, in kT
    kinetic_n = np.zeros([niterations], np.float64) # kinetic_n[i] is the kinetic energy after iteration i, in kT
    temperature_n = np.zeros([niterations], np.float64) # temperature_n[i] is the instantaneous kinetic temperature from iteration i, in K
    volume_n = np.zeros([niterations], np.float64) # volume_n[i] is the volume from iteration i, in nm^3
    for iteration in range(niterations):
        if debug: print "iteration %d / %d" % (iteration, niterations)

        # Update sampler state.
        sampler_state = sampler.run(sampler_state, 1)

        # Get statistics.
        potential_energy = sampler_state.potential_energy
        kinetic_energy = sampler_state.kinetic_energy
        total_energy = sampler_state.total_energy
        instantaneous_temperature = kinetic_energy * 2.0 / ndof / (units.BOLTZMANN_CONSTANT_kB * units.AVOGADRO_CONSTANT_NA)
        volume = sampler_state.volume
        
        #print "potential %8.1f kT | kinetic %8.1f kT | total %8.1f kT | volume %8.3f nm^3 | instantaneous temperature: %8.1f K" % (potential_energy/kT, kinetic_energy/kT, total_energy/kT, volume/(units.nanometers**3), instantaneous_temperature/units.kelvin)

        # Accumulate statistics.
        x_n[iteration] = sampler_state.positions[0,0] / units.angstroms
        potential_n[iteration] = potential_energy / kT
        kinetic_n[iteration] = kinetic_energy / kT
        temperature_n[iteration] = instantaneous_temperature / units.kelvin
        volume_n[iteration] = volume / (units.nanometers**3)

    # Compute expected statistics.
    if ('get_potential_expectation' in dir(testsystem)):
        # Skip this check if the std dev is zero.
        skip_test = False
        if (potential_n.std() == 0.0):
            skip_test = True
            if debug: print "Skipping potential test since variance is zero."
        if not skip_test:
            potential_expectation = testsystem.get_potential_expectation(thermodynamic_state) / kT
            potential_mean = potential_n.mean()            
            g = timeseries.statisticalInefficiency(potential_n, fast=True)
            dpotential_mean = potential_n.std() / np.sqrt(niterations / g)
            potential_error = potential_mean - potential_expectation
            nsigma = abs(potential_error) / dpotential_mean
            test_passed = True
            if (nsigma > NSIGMA_CUTOFF):
                test_passed = False

            if debug or (test_passed is False):
                print("Potential energy expectation")
                print("observed %10.5f +- %10.5f kT | expected %10.5f | error %10.5f +- %10.5f (%.1f sigma)" % (potential_mean, dpotential_mean, potential_expectation, potential_error, dpotential_mean, nsigma))
                if test_passed:
                    print("TEST PASSED")
                else:
                    print("TEST FAILED")
                print("----------------------------------------------------------------------------")

    if ('get_volume_expectation' in dir(testsystem)):
        # Skip this check if the std dev is zero.
        skip_test = False
        if (volume_n.std() == 0.0):
            skip_test = True
            if debug: print "Skipping volume test."
        if not skip_test:
            volume_expectation = testsystem.get_volume_expectation(thermodynamic_state) / (units.nanometers**3)
            volume_mean = volume_n.mean()            
            g = timeseries.statisticalInefficiency(volume_n, fast=True)
            dvolume_mean = volume_n.std() / np.sqrt(niterations / g)
            volume_error = volume_mean - volume_expectation
            nsigma = abs(volume_error) / dvolume_mean
            test_passed = True
            if (nsigma > NSIGMA_CUTOFF):
                test_passed = False

            if debug or (test_passed is False):
                print("Volume expectation")
                print("observed %10.5f +- %10.5f nm^3 | expected %10.5f | error %10.5f +- %10.5f (%.1f sigma)" % (volume_mean, dvolume_mean, volume_expectation, volume_error, dvolume_mean, nsigma))
                if test_passed:
                    print("TEST PASSED")
                else:
                    print("TEST FAILED")
                print("----------------------------------------------------------------------------")
Example 32
def dA_Lambda_MBAR(plot_out=True,
                   MinL=0,
                   MaxL=100,
                   dL=5,
                   GAMMA=100,
                   exponent=4,
                   polymorphs='p1 p2',
                   Molecules=72,
                   Independent=4,
                   Temp=200,
                   Pressure=1,
                   potential='oplsaa',
                   hinge='DefaultHinge'):
    if (plot_out):
        import matplotlib  # for making plots, version 'matplotlib-1.1.0-1'; errors may pop up when using earlier versions
        import matplotlib.pyplot as plt
        font = {'family': 'normal', 'weight': 'normal', 'size': 16}
        matplotlib.rc('font', **font)

    # =============================================================================================
    # ENSURE THAT USER INPUTS ARE SENSIBLE
    # =============================================================================================
    # TEMPERATURE
    if Temp < 0:
        print("Invalid Temperature: " + str(Temp))
        sys.exit()

    if Pressure < 0:
        print("Invalid Pressure: " + str(Pressure))
        sys.exit()

    # LAMBDA
    if (MinL == -1) and (MaxL == -1) and (dL == -1) and (exponent == 1):
        print("Using default values!")

        # The Lambda points sampled
        Lambdas = [
            '000L', '010L', '020L', '030L', '040L', '050L', '060L', '070L',
            '080L', '090L', '100L'
        ]
    elif MinL < 0 or MaxL < 0 or dL < 0 or MinL > MaxL:
        print("Invalid Lambda Specifications")
        sys.exit()
    else:
        RawLambda = 0
        Lambdas = []
        lambda_names = np.arange(MinL, MaxL + dL, dL)
        Lambda_names = []
        Lambda_indicies = []
        index = 0
        while RawLambda < MaxL:
            if RawLambda >= MinL:
                Lambda_indicies.append(index)
                index += 1
            else:
                index += 1
                RawLambda = RawLambda + dL
                continue
            if exponent >= 0:
                Lambda = int(100 *
                             (float(RawLambda) / float(MaxL))**abs(exponent))
            else:
                Lambda = int(
                    100 *
                    (1 -
                     (float(MaxL - RawLambda) / float(MaxL))**abs(exponent)))
            Lambdas.append(Lambda)
            # Format the lambda point name
            if RawLambda < 10:
                Lambda_names.append('00' + str(int(RawLambda)) + 'L')
            elif RawLambda < 100:
                Lambda_names.append('0' + str(int(RawLambda)) + 'L')
            else:
                Lambda_names.append('100L')
            RawLambda = RawLambda + dL

        # Catch the final lambda point
        Lambdas.append(MaxL)
        Lambda_indicies.append(index)
        if MaxL < 10:
            Lambda_names.append('00' + str(int(MaxL)) + 'L')
        elif MaxL < 100:
            Lambda_names.append('0' + str(int(MaxL)) + 'L')
        else:
            Lambda_names.append('100L')

    # GAMMA
    if GAMMA < 0 or GAMMA > 100:
        print("Invalid Gamma Point: " + str(GAMMA))
        sys.exit()

    # POLYMORPH
    polymorphs = polymorphs.split()
    polymorph = []
    polymorph_short = []
    for i, token in enumerate(polymorphs):
        polymorph.append('Polymorph ' + str(token))
        polymorph_short.append(token)

    # POTENTIAL
    if potential not in [
            "oplsaa", "gromos", "designeda", "oplsaafakeg", "oplsaafakea"
    ]:
        print("Invalid Potential")
        print(
            "Supported potentials: oplsaa gromos designeda oplsaafakeg oplsaafakea"
        )
        sys.exit()

    # =============================================================================================
    # FORMAT INPUTS
    # =============================================================================================
    # POTENTIAL
    PotNAME = ""
    if potential == "oplsaa":
        PotNAME = "OPLS"
    elif potential == "gromos":
        PotNAME = "GROM"
    elif potential == "designeda":
        PotNAME = "DESA"
    elif potential == "oplsaafakeg":
        PotNAME = "FAKEG"
    elif potential == "oplsaafakea":
        PotNAME = "FAKEA"

    # OPTIONAL HINGE
    if str(GAMMA) == "100":
        hingeLetter = "L"
    else:
        hingeLetter = "R"

    if hinge == "DefaultHinge":
        hinges = ["_" + hingeLetter]
    else:
        # Read in each job
        hinges = []
        hingevect = hinge.split()
        for i, token in enumerate(hingevect):
            hinges.append("_" + hingeLetter + "_" + str(token))

    # =============================================================================================
    # READ IN RAW DATA
    # =============================================================================================
    # Constants.
    kB = 1.3806488e-23 * 6.0221413e23 / (1000.0 * 4.184)  # Boltzmann constant in kcal/(mol K)
    omitK = []

    # Parameters
    T_k = Temp * np.ones(len(Lambdas), float)  # Convert temperatures to floats
    g_k = np.zeros([len(Lambdas)], float)
    K = len(Lambdas)  # How many states?

    # total number of states examined; none are unsampled
    Kbig = K + 0

    # maximum number of snapshots/simulation (could make this automated) - doesn't matter, as long as it's long enough.
    N_max = 200000

    # beta factor for the different temperatures
    beta_k = 1.0 / (kB * T_k)
    dA = np.zeros([len(polymorph), len(Lambdas)], float)
    ddA = np.zeros([len(polymorph), len(Lambdas)], float)
    convert_units = (0.2390057) * np.ones(
        len(Lambdas), float)  # Convert all energies to kcal/mol

    # Lines to ignore when reading in energies
    for i, poly in enumerate(polymorph):
        # Allocate storage for simulation data
        # N_k[k] is the total number of snapshots from alchemical state k
        N_k = np.zeros([Kbig], np.int32)

        # N_k_s[k,s] is the total number of snapshots from alchemical state k from seed s in 'unflipped segment j'
        N_ksj = np.zeros([Kbig, len(hinges), 100], np.int32)

        # u_kln[k,l,n] is the adjusted energy of snapshot n from simulation k
        u_kln = np.zeros([K, Kbig, N_max], np.float64)

        # dhdl_kln[k,l,n] is the restraint energy value of snapshop n from simulation k
        dhdl_kln = np.zeros([K, Kbig, N_max], np.float64)

        # dhdl_kn[k,n] is the derivative of energy with respect to lambda of snapshot n from simulation k
        dhdl_kn = np.zeros([K, N_max], np.float64)

        # Load in the data for each run
        for k in range(K):
            n = 0
            for s, hinge in enumerate(hinges):
                keepconfigs = np.arange(
                    N_max
                )  # The index of each configuration to keep in the MBAR analysis

                # cycle through all the input total energy data
                dirpath = polymorph_short[i] + '/restraints/' + str(
                    lambda_names[k])
                fname = dirpath + '/PROD.edr'
                dhdlname = dirpath + '/dhdl_PROD.xvg'

                if k not in omitK:
                    potential_energy = panedr.edr_to_df(
                        fname)['Potential'].values
                    print("loading " + fname)

                    dhdl_energy = np.loadtxt(dhdlname,
                                             comments=['#', '$', '@', '!'])
                    print("loading " + dhdlname)

                    # Removing any non-equilibrated points of the simulation
                    [start_production, _,
                     _] = timeseries.detectEquilibration(potential_energy)
                    potential_energy = potential_energy[start_production:]
                    dhdl_energy = dhdl_energy[start_production:]

                    # the energy of every configuration from each state evaluated at its sampled state
                    n = len(potential_energy)
                    u_kln[k, :, :n] = (float(Independent) / Molecules) * (
                        potential_energy.reshape(
                            (n, 1)) + dhdl_energy[:, 5:]).T * convert_units[k]
                    dhdl_kln[k, :, :n] = dhdl_energy[:,
                                                     5:].T * convert_units[k]
                    dhdl_kn[k, :n] = (
                        float(Independent) /
                        Molecules) * dhdl_energy[:, 4].T * convert_units[k]

                    # NSA: Can this go?
                    symbolcounter = 0

                    # Truncate the kept configuration list to be less than n
                    keepconfigs = [
                        j for j in keepconfigs
                        if j < (len(potential_energy) -
                                symbolcounter) and j >= 0
                    ]

                    # Split up the retained configurations into connected segments
                    j = 0
                    for a in range(len(keepconfigs)):
                        if a == 0:
                            continue
                        elif int(keepconfigs[a - 1]) + 1 != int(
                                keepconfigs[a]):
                            N_ksj[k, s, j] = a - (sum(N_ksj[k, s, 0:j]))
                            j += 1
                    # Catch the final segment
                    N_ksj[k, s, j] = len(keepconfigs) - sum(N_ksj[k, s, 0:j])
                    j += 1

            N_k[k] = n

        # convert to nondimensional units from kcal/mol
        u_kln *= beta_k[0]

        # all data loaded from the three sets
        u_kln_save = u_kln.copy()
        g_k = np.zeros([K])

        # Ignore the first state due to jumping
        print("Number of retained samples")
        print(N_k)

        # =============================================================================================
        # COMPUTE FREE ENERGY DIFFERENCE USING MBAR
        # =============================================================================================

        # Initialize MBAR.
        print("Running MBAR...")

        # generate the MBAR weights for each of the sampled states
        mbar = pymbar.MBAR(u_kln,
                           N_k,
                           verbose=True,
                           subsampling_protocol=[{
                               'method': 'L-BFGS-B'
                           }])
        print("MBAR Converged...")

        for k in range(Kbig):
            w = np.exp(mbar.Log_W_nk[:, k])
            print("max weight in state %d is %12.7f" % (k, np.max(w)))
            neff = 1 / np.sum(w**2)
            print("Effective number of sample in state %d is %10.3f" %
                  (k, neff))
            print("Efficiency for state %d is %d/%d = %10.4f" %
                  (k, neff, len(w), neff / len(w)))

        # extract self-consistent weights and uncertainties
        (df_i, ddf_i, theta_i) = mbar.getFreeEnergyDifferences()

        print("Free Energies Optained...")

        # convert free energy differences to kcal/mol and normalize by the number of independent molecules
        df_i /= (beta_k[0] * float(Independent))
        ddf_i /= (beta_k[0] * float(Independent))

        dA[i, :] = df_i[-1]

        # =============================================================================================
        # COMPUTE UNCERTAINTY USING THE UNCORRELATED DATA
        # =============================================================================================
        for k in range(K):  # For each restraint state
            N_k[k] = 0
            n_old = 0
            if k not in omitK:
                # For each independent trajectory of this restraint state
                for s in range(len(hinges)):
                    # For each retained (untossed) segment of that trajectory
                    for j in range(100):
                        if N_ksj[k, s, j] == 0:
                            continue
                        # Feed in the segment and calculate correlation time
                        g_k[k] = timeseries.statisticalInefficiency(
                            dhdl_kn[k, n_old:(n_old + N_ksj[k, s, j])])
                        print(
                            "Correlation time for sampled state %d is %10.3f" %
                            (k, g_k[k]))

                        # subsample the data to get statistically uncorrelated data
                        # subsample indices within the segment
                        indices = np.array(
                            timeseries.subsampleCorrelatedData(
                                u_kln[k, k, n_old:(n_old + N_ksj[k, s, j])],
                                g=g_k[k])).astype(int)

                        # Append the uncorrelated configurations in the segment to the u_kln matrix
                        u_kln[k, :, N_k[k]:(N_k[k] +
                                            len(indices))] = u_kln_save[k, :, (
                                                indices + n_old)].transpose()
                        N_k[k] = N_k[k] + len(indices)
                        n_old += N_ksj[k, s, j]

        print("Number of retained samples")
        print(N_k)
        print("Number of retained samples from each seed")
        print(N_ksj)

        # generate the MBAR weights for each of the sampled states
        mbar = pymbar.MBAR(u_kln,
                           N_k,
                           verbose=True,
                           subsampling_protocol=[{
                               'method': 'L-BFGS-B'
                           }])

        print("MBAR Converged...")
        # testing

        # extract self-consistent weights and uncertainties
        (df_u, ddf_u, theta_i) = mbar.getFreeEnergyDifferences()

        print("Free Energies Optained...")

        # convert free energy differences to kcal/mol and normalize by the number of independent molecules
        df_u /= (beta_k[0] * float(Independent))
        ddf_u /= (beta_k[0] * float(Independent))

        ddA[i, :] = ddf_u[-1]

        # Write out free energy differences
        print("Free Energy Difference (in units of kcal/mol)")
        print("  dA(Lambda) = A(Lambda) - A(Fully Restrained)")
        for k in range(Kbig):
            print("%8.3f %8.3f" % (df_i[k, -1], ddf_u[k, -1]))

    # =============================================================================================
    # PRINT THE FINAL DATA
    # =============================================================================================
    out_dA = np.zeros(len(polymorph))
    out_ddA = np.zeros(len(polymorph))
    for i, poly in enumerate(polymorph):
        out_dA[i] = dA[i, 0]  #Kbig - 1]
        out_ddA[i] = ddA[i, 0]  #Kbig - 1]

    # =============================================================================================
    # PLOT THE FINAL DATA
    # =============================================================================================
    if (plot_out) and polymorphs == 'all':
        # now plot the free energy change as a function of temperature
        fig = plt.figure(4)
        ax = fig.add_subplot(111)
        xlabel = r'Restraint Strength, $\lambda$'
        ylabel = 'Relative Free Energy (kcal/mol)'
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        Xaxis = [float(j / 100.0) for j in Lambdas]

        if os.path.isfile('BootstrapStd_' + PotNAME + '_Polymorph1_' +
                          str(Molecules) + '_' + Tname + '_' + Pname +
                          '_dAvsL_All'):
            ddA[0, :] = MBARBootstrap.ExtractBootstrap('BootstrapStd_' +
                                                       PotNAME +
                                                       '_Polymorph1_' +
                                                       str(Molecules) + '_' +
                                                       Tname + '_' + Pname +
                                                       '_dAvsL_All')
        elif os.path.isfile('BootstrapStd_' + PotNAME + '_Polymorph2_' +
                            str(Molecules) + '_' + Tname + '_' + Pname +
                            '_dAvsL_All'):
            ddA[1, :] = MBARBootstrap.ExtractBootstrap('BootstrapStd_' +
                                                       PotNAME +
                                                       '_Polymorph2_' +
                                                       str(Molecules) + '_' +
                                                       Tname + '_' + Pname +
                                                       '_dAvsL_All')
        elif os.path.isfile('BootstrapStd_' + PotNAME + '_Polymorph3_' +
                            str(Molecules) + '_' + Tname + '_' + Pname +
                            '_dAvsL_All'):
            ddA[2, :] = MBARBootstrap.ExtractBootstrap('BootstrapStd_' +
                                                       PotNAME +
                                                       '_Polymorph3_' +
                                                       str(Molecules) + '_' +
                                                       Tname + '_' + Pname +
                                                       '_dAvsL_All')

        ax.errorbar(Xaxis,
                    dA[0, :],
                    color='b',
                    yerr=ddA[0, :],
                    label='Benzene I')
        ax.errorbar(Xaxis,
                    dA[1, :],
                    color='g',
                    yerr=ddA[1, :],
                    label='Benzene II')
        ax.errorbar(Xaxis,
                    dA[2, :],
                    color='r',
                    yerr=ddA[2, :],
                    label='Benzene III')
        plt.legend(loc='upper left')

        if len(hinges) > 1:
            filename = PotNAME + '_' + str(Molecules) + '_' + Tname + '_dAvsL.pdf'
        else:
            filename = PotNAME + '_' + str(Molecules) + '_' + Tname + hinge + '_dAvsL.pdf'
        plt.savefig(filename)
        plt.show()
    return out_dA, out_ddA
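# The routine above interleaves file parsing with the analysis; the core
# decorrelate-then-reweight pattern it applies per state is compact. A minimal,
# self-contained sketch of that pattern (array names and shapes are
# illustrative, not taken from the script above):
import numpy as np
import pymbar
from pymbar import timeseries

def decorrelated_free_energies(u_kln, dhdl_kn, N_k):
    """Sketch: subsample each state by its statistical inefficiency, then rerun MBAR."""
    K = len(N_k)
    u_sub = np.zeros_like(u_kln)
    N_sub = np.zeros(K, np.int32)
    for k in range(K):
        # correlation time estimated from the dH/dlambda timeseries of state k
        g = timeseries.statisticalInefficiency(dhdl_kn[k, :N_k[k]])
        idx = timeseries.subsampleCorrelatedData(u_kln[k, k, :N_k[k]], g=g)
        u_sub[k, :, :len(idx)] = u_kln[k][:, idx]
        N_sub[k] = len(idx)
    mbar = pymbar.MBAR(u_sub, N_sub)
    # reduced free energy differences between all pairs of states, with uncertainties
    return mbar.getFreeEnergyDifferences()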
Esempio n. 33
0
def DoBAR(fwds, revs, label, verbose):
    """

    BAR to combine fwd and rev data of dGs.
    Here, don't multiply dGs_R by -1 since BAR calls for reverse work value.

    Parameters
    ----------
    fwds: dictionary of forward work values for each window
    revs: dictionary of reverse work values for each window
    label: string label of what it is (only for printing output)
    verbose: bool, if True print correlation times and per-window BAR results

    Returns
    -------
    dgs: 1D array of running cumulative sums of the per-window dGs;
       e.g., if each window contributed 2, dgs would be [2, 4, 6, ...]
    gsdlist: 1D array of the accompanying accumulated standard deviations

    """

    fwd_ss = {} # subsampled version of fwds
    rev_ss = {} # subsampled version of revs
    dg_bar = np.zeros([len(fwds)], np.float64)  # allocate storage: dG steps
    gsd_bar = np.zeros([len(fwds)], np.float64) # allocate storage: dG stdev steps
    dgs = np.zeros([len(fwds)], np.float64)     # allocate storage: dG accumulated
    gsdlist = np.zeros([len(fwds)], np.float64) # allocate storage: dG stdev accum


    #corr_time = np.zeros([len(fwds)], np.float64)
    corr_time = {}
    for key, value in fwds.items(): # this notation changes in python3: http://tinyurl.com/j3uq3me
        # compute correlation time
        g = timeseries.statisticalInefficiency(value)
        corr_time[key] = [g]
        # compute indices of UNcorrelated timeseries, then extract those samples
        indices = timeseries.subsampleCorrelatedData(value, g)
        fwd_ss[key] = value[indices]

    for key, value in revs.items(): # this notation changes in python3: http://tinyurl.com/j3uq3me
        # compute correlation time
        g = timeseries.statisticalInefficiency(value)
        corr_time[key].append(g)
        # compute indices of UNcorrelated timeseries, then extract those samples
        indices = timeseries.subsampleCorrelatedData(value, g)
        rev_ss[key] = value[indices]

    bar = {}
    # then apply BAR estimator to get dG for each step
    for kF, kR in zip(sorted(fwd_ss.keys()), sorted(list(rev_ss.keys()), reverse=True)):
        dg_bar[kF], gsd_bar[kF] = BAR(fwd_ss[kF],rev_ss[kR])
        bar[kF] = [ np.sum(dg_bar), dg_bar[kF], gsd_bar[kF] ]

    # calculate the net dG standard deviation = sqrt[ sum(s_i^2) ]
    gsd = (np.sum(np.power(gsd_bar, 2)))**0.5

    net = 0.
    netsd = 0.
    for i, g in enumerate(dg_bar):
        # accumulate net dGs into running sums (plot this)
        dgs[i] = dg_bar[i] + net
        net = dgs[i]
        # combine the stdevs: s = sqrt(s1^2 + s2^2 + ...)
        gsdlist[i] = ((gsd_bar[i])**2.+(netsd)**2.)**0.5
        netsd = gsdlist[i]


    if verbose:
        print('\n\n#####---Correlation Times for dG_{}--#####'.format(label))
        print('Window'.rjust(3), 'F'.rjust(5), 'R'.rjust(9))
        for k,v in corr_time.items():
            print("{:3d} {:10.3f} {:10.3f}".format(k, v[0], v[1]) )

        print("\n\n#####---BAR estimator for dG_{}---#####".format(label))
        print('Window'.rjust(3), 'dG'.rjust(5), 'ddG'.rjust(11), "Uncert.".rjust(11))
        print("---------------------------------------------------------")


        for k, v in bar.items():
            print('{:3d} {:10.4f} {:10.4f} +- {:3.4f}'.format(k, v[0], v[1], v[2]))

    print(("\nNet dG_{} energy difference = {:.4f} +- {:.4f} kcal/mol".format(label, np.sum(dg_bar), gsd)))

    return dgs, gsdlist
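# A usage sketch for DoBAR(); the work dictionaries below are synthetic
# placeholders (random numbers) just to show the expected call pattern and
# integer window keys -- they are not data from any real calculation.
import numpy as np

fwds = {0: np.random.normal(1.0, 0.3, 500), 1: np.random.normal(0.8, 0.3, 500)}
revs = {0: np.random.normal(-0.7, 0.3, 500), 1: np.random.normal(-0.9, 0.3, 500)}

dgs, gsdlist = DoBAR(fwds, revs, label='coul', verbose=True)
print(dgs[-1], gsdlist[-1])  # net accumulated dG and its accumulated stdev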
        infile = open(filename, 'r')
        lines = infile.readlines()
        infile.close()
        # Parse data.
        n = 0
        for line in lines:
            if line[0] != '#' and line[0] != '@':
                tokens = line.split()            
                u_kn[k,n] = beta_k[k] * (float(tokens[2]) - float(tokens[1])) # reduced potential energy without umbrella restraint
                n += 1

    # Compute correlation times for potential energy and chi
    # timeseries.  If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
            
    if (DifferentTemperatures):        
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k,:], u_kn[k,0:N_k[k]])
        print "Correlation time for set %5d is %10.3f" % (k,g_k[k])
        indices = timeseries.subsampleCorrelatedData(u_kn[k,0:N_k[k]])
    else:
        chi_radians = chi_kn[k,0:N_k[k]]/(180.0/numpy.pi)
        g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
        g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
        print "g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin)
        g_k[k] = max(g_cos, g_sin)
        print "Correlation time for set %5d is %10.3f" % (k,g_k[k])
        indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k]) 
    # Subsample data.
    N_k[k] = len(indices)
    u_kn[k,0:N_k[k]] = u_kn[k,indices]
    chi_kn[k,0:N_k[k]] = chi_kn[k,indices]
Esempio n. 35
0
import numpy as np
from pymbar.timeseries import statisticalInefficiency

def doStatistics(filename):
    array = np.genfromtxt(filename, skip_header=100, usecols=1, dtype=float)
    return np.mean(array), np.std(array) / np.sqrt(len(array) / statisticalInefficiency(array))
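# A usage sketch for doStatistics(); the file name and column layout are
# assumptions. The returned error is the standard error of the mean computed
# with the effective number of uncorrelated samples, N_eff = N / g.
mean, stderr = doStatistics('observable.dat')
print('%.4f +/- %.4f' % (mean, stderr))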
Esempio n. 36
0
#========================================================================

#------------------------------------------------------------------------
# Read Data From File
#------------------------------------------------------------------------

print("")
print("Preparing data:")
T_from_file = read_simulation_temps(simulation, NumTemps)
E_from_file = read_total_energies(simulation, TE_COL_NUM)
K = len(T_from_file)
N_k = numpy.zeros(K, numpy.int32)
g = numpy.zeros(K, numpy.float64)

for k in range(K):  # subsample the energies
    g[k] = timeseries.statisticalInefficiency(E_from_file[k])
    indices = numpy.array(
        timeseries.subsampleCorrelatedData(
            E_from_file[k], g=g[k]))  # indices of uncorrelated samples
    N_k[k] = len(indices)  # number of uncorrelated samples
    E_from_file[k, 0:N_k[k]] = E_from_file[k, indices]

#------------------------------------------------------------------------
# Insert Intermediate T's and corresponding blank U's and E's
#------------------------------------------------------------------------
Temp_k = T_from_file
minT = T_from_file[0]
maxT = T_from_file[len(T_from_file) - 1]
#beta = 1/(k*BT)
#T = 1/(kB*beta)
if dtype == 'temperature':
Esempio n. 37
0
def statistical_inefficiency(df,
                             series=None,
                             lower=None,
                             upper=None,
                             step=None,
                             conservative=True,
                             drop_duplicates=False,
                             sort=False):
    """Subsample a DataFrame based on the calculated statistical inefficiency
    of a timeseries.

    If `series` is ``None``, then this function will behave the same as
    :func:`slicing`.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample according to the statistical inefficiency of `series`.
    series : Series
        Series to use for calculating statistical inefficiency. If ``None``,
        no statistical inefficiency-based subsampling will be performed.
    lower : float
        Lower bound to pre-slice `series` data from.
    upper : float
        Upper bound to pre-slice `series` to (inclusive).
    step : int
        Step between `series` items to pre-slice by.
    conservative : bool
        ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in uniform
        intervals (the default). ``False`` will sample at non-uniform intervals to
        closely match the (fractional) statistical_inefficiency, as implemented
        in :func:`pymbar.timeseries.subsampleCorrelatedData`.
    drop_duplicates : bool
        Drop the duplicated lines based on time.
    sort : bool
        Sort the Dataframe based on the time column.

    Returns
    -------
    DataFrame
        `df` subsampled according to subsampled `series`.

    Warning
    -------
    The `series` and the data to be sliced, `df`, need to have the same number
    of elements because the statistical inefficiency is calculated based on
    the index of the series (and not an associated time). At the moment there is
    no automatic conversion from a time to an index.

    Note
    ----
    For a non-integer statistical inefficiency :math:`g`, the default value
    ``conservative=True`` will provide _fewer_ data points than allowed by
    :math:`g` and thus error estimates will be _higher_. For large numbers of
    data points and converged free energies, the choice should not make a
    difference. For small numbers of data points, ``conservative=True``
    decreases a false sense of accuracy and is deemed the more careful and
    conservative approach.

    See Also
    --------
    pymbar.timeseries.statisticalInefficiency : detailed background
    pymbar.timeseries.subsampleCorrelatedData : used for subsampling


    .. versionchanged:: 0.2.0
       The ``conservative`` keyword was added and the method is now using
       ``pymbar.timeseries.statisticalInefficiency()``; previously, the statistical
       inefficiency was _rounded_ (instead of ``ceil()``) and thus one could
       end up with correlated data.

    """
    if _check_multiple_times(df):
        if drop_duplicates:
            if isinstance(df, pd.Series):
                # remove the duplicate based on time
                drop_duplicates_series = df.reset_index('time', name='').\
                    drop_duplicates('time')
                # Reset the time index
                lambda_names = [
                    'time',
                ]
                lambda_names.extend(drop_duplicates_series.index.names)
                df = drop_duplicates_series.set_index('time', append=True).\
                    reorder_levels(lambda_names)
            else:
                # remove the duplicate based on time
                drop_duplicates_df = df.reset_index('time').drop_duplicates(
                    'time')
                # Reset the time index
                lambda_names = [
                    'time',
                ]
                lambda_names.extend(drop_duplicates_df.index.names)
                df = drop_duplicates_df.set_index('time', append=True).\
                    reorder_levels(lambda_names)

            # Do the same thing with the series
            if series is not None:
                # remove the duplicate based on time
                drop_duplicates_series = series.reset_index('time', name='').\
                    drop_duplicates('time')
                # Reset the time index
                lambda_names = [
                    'time',
                ]
                lambda_names.extend(drop_duplicates_series.index.names)
                series = drop_duplicates_series.set_index('time', append=True).\
                    reorder_levels(lambda_names)

        else:
            raise KeyError(
                "Duplicate time values found; statistical inefficiency "
                "only works on a single, contiguous, "
                "and sorted timeseries.")

    if not _check_sorted(df):
        if sort:
            df = df.sort_index(level='time')

            if series is not None:
                series = series.sort_index(level='time')
        else:
            raise KeyError(
                "Statistical inefficiency only works as expected if "
                "values are sorted by time, increasing.")

    if series is not None:

        if (len(series) != len(df) or not all(
                series.reset_index()['time'] == df.reset_index()['time'])):
            raise ValueError(
                "series and data must be sampled at the same times")

        series = slicing(series, lower=lower, upper=upper, step=step)

        # calculate statistical inefficiency of series (could use fft=True but needs test)
        statinef = statisticalInefficiency(series, fast=False)

        # use the subsampleCorrelatedData function to get the subsample index
        indices = subsampleCorrelatedData(series,
                                          g=statinef,
                                          conservative=conservative)
        df = df.iloc[indices]
    else:
        df = slicing(df, lower=lower, upper=upper, step=step)

    return df
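# A usage sketch for statistical_inefficiency() above, assuming a time-indexed
# dH/dl DataFrame such as the one returned by alchemlyb.parsing.gmx.extract_dHdl;
# the file name, temperature, and lower bound are placeholders.
from alchemlyb.parsing.gmx import extract_dHdl

dHdl = extract_dHdl('dhdl_PROD.xvg', T=300)
# subsample based on the first dH/dl column, discarding data before t = 1000 ps
dHdl_sub = statistical_inefficiency(dHdl, series=dHdl.iloc[:, 0],
                                    lower=1000, conservative=True)
print(len(dHdl), '->', len(dHdl_sub), 'uncorrelated samples')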
Esempio n. 38
0
        infile.close()
        # Parse data.
        n = 0
        for line in lines:
            if line[0] != '#' and line[0] != '@':
                tokens = line.split()
                u_kn[k, n] = beta_k[k] * (
                    float(tokens[2]) - float(tokens[1])
                )  # reduced potential energy without umbrella restraint
                n += 1

    # Compute correlation times for potential energy and chi
    # timeseries.  If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi

    if (DifferentTemperatures):
        g_k[k] = timeseries.statisticalInefficiency(u_kn[k, :], u_kn[k,
                                                                     0:N_k[k]])
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]])
    else:
        d = d_kn[k, 0:N_k[k]]
        g_k[k] = timeseries.statisticalInefficiency(d)
        print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
        indices = timeseries.subsampleCorrelatedData(d, g=g_k[k])
        # Subsample data.
    N_k[k] = len(indices)
    u_kn[k, 0:N_k[k]] = u_kn[k, indices]
    d_kn[k, 0:N_k[k]] = d_kn[k, indices]

N_max = numpy.max(N_k)  # shorten the array size
u_kln = numpy.zeros(
    [K, K, N_max], numpy.float64
Esempio n. 39
0
def main():    
    usage = """
        usage: %prog [options] <metadata file>
    """
    
    parser = optparse.OptionParser(usage)
    parser.add_option("-o", "--outfile", dest="output_file", default='mbar_pmf.out', help="Output file for PMF [default: %default]")
    parser.add_option("-t", "--temperature", dest="temperature", default=300., type="float", help="Initial temperature in K [default: %default K]")
    parser.add_option("-b", "--bins", dest="bins", default=50, type="int", help="Number of bins for 1D PMF [default: %default]")
    parser.add_option("-d", "--double", dest="double_k", default=False, action='store_true', help="Double the k values [default: %default]")
    parser.add_option("-c", "--kcal", dest="kcal_k", default=False, action='store_true', help="Convert k values from kcal to kJ [default: %default]")
    parser.add_option("-s", "--skip-subsampling", dest="skip_subsampling", default=False, action='store_true', help="Skip data subsampling [default: %default]")
    parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true", help="Verbose output from PyMBAR [default: %default]")
    
    (options, args) = parser.parse_args()
    
    if len(args) < 1:
        parser.error('No metadata file passed')
    elif not os.path.exists(args[0]):
        parser.error('Metadata file not found')
    
    metadata = [] # stores metadata per umbrella
    N_max = 0 # the max number of snapshots per umbrella
    different_temperatures = False # flag to know if we are reading in energies for the snapshots
    
    # open the wham metadata file
    print "Opening metadata file %s" % args[0]
    f = open(args[0], 'r')
    metadata_lines = f.readlines()
    f.close()
    
    # first get all the metadata and count the max number of snapshots per umbrella
    for line in metadata_lines:
        # skip comments
        if line.startswith('#'):
            continue
        # split lines based on spaces, but convert tabs to spaces first
        clean_split = list(filter(None, line.strip().expandtabs().split(' ')))
        if not os.path.exists(clean_split[0]):
            print("Data file %s doesn't exist, skipping this replica" % clean_split[0])
            continue
        else:
            # get the number of snapshots for the replica
            nsnapshots = file_len(clean_split[0])
            # /path/to/timeseries/file  loc_win_min spring  [correl time] [temperature]
            k = float(clean_split[2])
            if options.double_k:
                k = k*2.0
            if options.kcal_k:
                k = k*4.184
            
            current_meta = { 'path': clean_split[0], 'coord': float(clean_split[1]), 'k': k, 'n': nsnapshots }
            #   K_k[k] = float(tokens[1]) * (numpy.pi/180)**2 # spring constant (read in kJ/mol/rad**2, converted to kJ/mol/deg**2)    
            
            if len(clean_split) >= 4:
                # TODO: is temperature the 4th or 5th value?
                # temperature might be the 4th value...
                current_meta['t'] = float(clean_split[3])
                different_temperatures = True
            metadata.append(current_meta)
    
    N_max = numpy.max([ w['n'] for w in metadata ])
    print "Max number of snapshots %d" % N_max
    
    # now allocate the memory for the arrays
    K = len(metadata)
    T_k = numpy.ones(K,float)*options.temperature # initial temperatures are all equal
    beta_k = 1.0/(kB*T_k)   # beta factor for the different temperatures
    
    data = numpy.zeros([K,N_max], numpy.float64) # the snapshot data
    u_kn = numpy.zeros([K,N_max], numpy.float64) # u_kn[k,n] is the reduced potential energy without umbrella restraints of snapshot n of umbrella simulation k
    u_kln = numpy.zeros([K,K,N_max], numpy.float64) # u_kln[k,l,n] is the reduced potential energy of snapshot n from umbrella simulation k evaluated at umbrella l
    g_k = numpy.zeros([K],numpy.float32) # correlation time

    data_min = [] # will set the min and max data values later
    data_max = []
    
    # Now loop through each datafile and extract the data
    for i, w in enumerate(metadata):
        print "Reading %s..." % w['path']
        f = open(w['path'], 'r')
        lines = f.readlines()
        f.close()
        
        clean_split_lines = [list(filter(None, line.strip().expandtabs().split(' '))) for line in lines if not line.startswith('#')]

        if different_temperatures:
            raise Exception('Different temperatures are not supported yet')
            # if different temperatures are specified the metadata file, 
            # then we need the energies to compute the PMF, found in the third column
            # for j,l in enumerate(clean_split_lines):
            #     data[i,j] = float(l[1]) # second column is the coordinate
            #     # third column will be the system's potential energy
            #     potential_energy = float(l[2])
            #     dchi = w['coord']-float(l[1])
            #     restraint_potential = k_multiplier*w['k']*(dchi**2)
            #     # TODO: given the coordinate and the restraining potential, calculate the umbrella restraint
            #     u_kn[i,j] = beta_k[i] * (potential_energy-restraint_potential) # reduced potential energy without umbrella restraint
            #         
            # # Compute correlation times for potential energy and timeseries.
            # # If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
            # g_k[i] = timeseries.statisticalInefficiency(u_kn[i,:], u_kn[i,:])
            # indices = timeseries.subsampleCorrelatedData(u_kn[i,:])
        else:
            # no temperature column
            for j,l in enumerate(clean_split_lines):
                data[i,j] = float(l[1])
            dataset = numpy.cos(data[i,:w['n']])
            g_k[i] = timeseries.statisticalInefficiency(dataset,dataset)
            if not options.skip_subsampling:
                indices = timeseries.subsampleCorrelatedData(dataset)

        if options.skip_subsampling:
            data_max.append(numpy.max(data[i]))
            data_min.append(numpy.min(data[i]))
            w['n'] = len(data[i])
            u_kn[i,0:w['n']] = u_kn[i]
            data[i,0:w['n']] = data[i]
        else:
            # get min and max for data, used for binning ranges
            data_max.append(numpy.max(data[i,indices]))
            data_min.append(numpy.min(data[i,indices]))
            # Subsample the data
            w['n'] = len(indices)
            u_kn[i,0:w['n']] = u_kn[i,indices]
            data[i,0:w['n']] = data[i,indices]
            print "Correlation time for set %5d is %10.3f" % (i,g_k[i])

    print "Finished reading data files"
    # Set zero of u_kn -- this is arbitrary.
    u_kn -= u_kn.min()

    # Construct torsion bins
    print "Binning data..."
    
    data_min = numpy.min(data_min)
    data_max = numpy.max(data_max)
    delta = (data_max - data_min) / float(options.bins)
    
    print "Min coord: %f" % data_min
    print "Max coord: %f" % data_max
    print "Delta for binning %f" % delta
    # compute bin centers
    bin_center_i = numpy.zeros([options.bins], numpy.float64)
    for i in range(options.bins):
        bin_center_i[i] = data_min + delta/2 + delta * i
    
    # Bin data
    bin_kn = numpy.zeros([K,N_max], numpy.int32)-1
    # for each window
    for k in range(K):
        # for 0 to the number of snapshots in the window k
        for n in range(metadata[k]['n']):            
            # Compute bin assignment.
            bin_kn[k,n] = int((data[k,n] - data_min) / delta)
            for l in range(K):
                # Compute minimum-image torsion deviation from umbrella center l
                dchi = data[k,n] - metadata[l]['coord']
                # Compute energy of snapshot n from simulation k in umbrella potential l
                u_kln[k,l,n] = u_kn[k,n] + beta_k[k]*metadata[l]['k']*(dchi**2)
    
    for i in range(options.bins):
        if numpy.sum(bin_kn==i) == 0:
            for j in range(options.bins):
                print "Bin: %d" % j
                print numpy.sum(bin_kn==j)
            raise Exception("At least one bin has no samples. Adjust bin sizes or eliminate empty bins to ensure at least one sample per bin.")        

    # Initialize MBAR.
    print "Running MBAR..."
    N_k = numpy.array([ w['n'] for w in metadata ], numpy.int32)
    mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose, initialize='BAR')
    #mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose)
    #mbar = pymbar.MBAR(u_kln, N_k, verbose = True, method = 'Newton-Raphson')

    # Compute PMF in unbiased potential (in units of kT).
    (f_i, df_i) = mbar.computePMF(u_kn, bin_kn, options.bins)

    # Write out PMF and save to file
    print "Saving PMF to file: %s" % options.output_file
    f = open(options.output_file, 'w')
    print "PMF (in units of kT)"
    print "%8s %8s %8s" % ('bin', 'f', 'df')
    f.write("#Coor   Free    +/-\n")
    for i in range(options.bins):
        print "%8.1f %8.3f %8.3f" % (bin_center_i[i], f_i[i], df_i[i])
        f.write("%8.1f %8.3f %8.3f\n" % (bin_center_i[i], f_i[i], df_i[i]))
    f.close()
Esempio n. 40
0
    bp = get_probs(data_file)[skip:,:]
    n_samples, n_bins = bp.shape
    bi = np.arange(n_bins)

    pmf = -0.6*np.log(np.mean(bp, axis=0))
    pmf_mean = pmf


    # Calculate statistical inefficiency
    try:
        g = np.load(stat_ineff_file)
    except Exception:
        g = np.zeros((n_bins,))
        for k in range(n_bins):
            g[k] = timeseries.statisticalInefficiency(bp[:,k])

        np.save(stat_ineff_file, g)

    N_eff = int(np.ceil(n_samples / np.max(g)))
    tstat = scipy.stats.t.ppf(.975, N_eff - 1)
    print('N_eff: ', N_eff)
    pmf_err = np.empty((n_bins))
    for k in range(n_bins):
        blks = np.array_split(bp[:,k], N_eff)

        blk_mean = np.array([np.mean(blk) for blk in blks])
        blk_pmf = -0.6*np.log(blk_mean)
        pmf_err[k] = tstat*np.std(blk_pmf)/np.sqrt(blk_pmf.size)

    pmf_min = np.min(pmf)
Esempio n. 41
0
def test_statistical_inefficiency_single():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiency(X[0])
    timeseries.statisticalInefficiency(X[0], X[0])
    timeseries.statisticalInefficiency(X[0]**2)
    timeseries.statisticalInefficiency(X[0]**2, X[0]**2)
    timeseries.statisticalInefficiency(energy[0])
    timeseries.statisticalInefficiency(energy[0], energy[0])

    timeseries.statisticalInefficiency(X[0], X[0]**2)
def dA_MBAR(minimum=0,
            maximum=100,
            spacing=10,
            exponent=2,
            polymorphs='p1 p2',
            Molecules=72,
            Independent=4,
            Temp=200,
            bonds=False,
            primary_directory='.',
            added_directories=[]):
    # =============================================================================================
    # Setting up the values for gamma or lambda states
    # =============================================================================================
    #    raw_value = minimum
    #    values = []
    directory_names = np.arange(minimum, maximum + spacing, spacing)
    directory_names = np.sort(np.append(directory_names, added_directories))

    #    while raw_value <= maximum:
    #        if exponent >= 0:
    #            value = int(100 * (float(raw_value) / float(maximum)) ** abs(exponent))
    #        else:
    #            value = int(100 * (1 - (float(maximum - raw_value) / float(maximum)) ** abs(exponent)))
    #        values.append(value)
    #        raw_value = raw_value + spacing
    #    print(values)
    #    print(directory_names)
    #    exit()

    # POLYMORPH
    polymorphs = polymorphs.split()

    # =============================================================================================
    # READ IN RAW DATA
    # =============================================================================================
    # Constants.
    kB = 1.3806488e-23 * 6.0221413e23 / (1000.0 * 4.184)  # Boltzmann constant in kcal/(mol K)

    # Parameters
    T_k = Temp * np.ones(len(directory_names),
                         float)  # Convert temperatures to floats
    print(T_k)
    #  print(values)

    K = len(directory_names)  # How many states?

    # total number of states examined; 0 are unsampled if bonds are left on, 1 is unsampled if the bonds are removed
    Kbig = K

    # maximum number of snapshots/simulation (could make this automated) - doesn't matter, as long as it's long enough.
    N_max = 5000

    # beta factor for the different temperatures
    beta_k = 1.0 / (kB * T_k)
    dA = np.zeros([len(polymorphs), Kbig], float)
    ddA = np.zeros([len(polymorphs), Kbig], float)
    convert_units = 0.2390057 * np.ones(Kbig, float)  # Convert energies from kJ/mol to kcal/mol

    # Allocate storage for simulation data
    for i, poly in enumerate(polymorphs):
        # N_k[k] is the total number of snapshots from alchemical state k
        N_k = np.zeros([Kbig], np.int32)

        # N_k_s[k] is the total number of snapshots retained from alchemical state k
        N_k_s = np.zeros([Kbig], np.int32)

        # u_kln[k,l,n] is the adjusted energy of snapshot n from simulation k
        u_kln = np.zeros([K, Kbig, N_max], np.float64)

        # dhdl_kn[k,n] is the derivative of energy with respect to lambda of snapshot n from simulation k
        dhdl_kn = np.zeros([K, N_max], np.float64)

        #Load in the data for each run
        for k in range(K):
            n = 0

            # cycle through all the input total energy data
            if directory_names[k] == int(directory_names[k]):
                dirpath = polymorphs[i] + '/' + primary_directory + '/' + str(
                    int(directory_names[k]))
            else:
                dirpath = polymorphs[i] + '/' + primary_directory + '/' + str(
                    directory_names[k])
            if os.path.isdir(dirpath):
                fname = dirpath + '/PROD.edr'
                dhdlname = dirpath + '/dhdl_PROD.xvg'

                potential_energy = panedr.edr_to_df(fname)['Potential'].values
                print("loading " + fname)

                dhdl_energy = np.loadtxt(dhdlname,
                                         comments=['#', '$', '@', '!'])
                print("loading " + dhdlname)

                # Removing any non-equilibrated points of the simulation
                [start_production, _,
                 _] = timeseries.detectEquilibration(potential_energy)
                potential_energy = potential_energy[start_production:]
                dhdl_energy = dhdl_energy[start_production:, :]

                # Cutting points if they exceed N_max
                if len(potential_energy) > N_max:
                    potential_energy = potential_energy[len(potential_energy) -
                                                        N_max:]
                    dhdl_energy = dhdl_energy[len(dhdl_energy) - N_max:, :]

                # the energy of every configuration from each state evaluated at its sampled state
                n = len(potential_energy)
                dhdl_placement = len(dhdl_energy[0, :]) - K
                u_kln[k, :K, :n] = (potential_energy.reshape(
                    (n, 1)) + dhdl_energy[:, dhdl_placement:]
                                    ).T * convert_units[k]
                dhdl_kn[k, :n] = (float(Independent) / Molecules) * \
                                 np.sum(dhdl_energy[:, 2:dhdl_placement], axis=1) * convert_units[k]

                N_k_s[k] = n
                N_k[k] = n

        # convert to nondimensional units from kcal/mol
        u_kln *= beta_k[0]

        u_kln_save = u_kln.copy()  # copy (not a view) so the subsampling below reads the original data
        g_k = np.zeros([K])

        print("Number of retained samples")
        print(N_k)
        print("Number of retained samples from each seed")
        print(N_k_s)

        # =============================================================================================
        # COMPUTE FREE ENERGY DIFFERENCE USING MBAR
        # =============================================================================================

        # Initialize MBAR.
        print("Running MBAR...")

        # generate the MBAR weights for each of the sampled states
        mbar = pymbar.MBAR(u_kln,
                           N_k,
                           verbose=True,
                           subsampling_protocol=[{
                               'method': 'L-BFGS-B'
                           }])

        print("MBAR Converged...")
        # testing

        for k in range(Kbig):
            w = np.exp(mbar.Log_W_nk[:, k])
            print("max weight in state %d is %12.7f" % (k, np.max(w)))
            neff = 1 / np.sum(w**2)
            print("Effective number of sample in state %d is %10.3f" %
                  (k, neff))
            print("Efficiency for state %d is %d/%d = %10.4f" %
                  (k, neff, len(w), neff / len(w)))

        # extract self-consistent weights and uncertainties
        (df_i, ddf_i, theta_i) = mbar.getFreeEnergyDifferences()

        print("Free Energies Optained...")

        # convert free energy differences to kcal/mol and normalize by the number of independent molecules
        df_i /= (beta_k[0] * float(Independent))
        ddf_i /= (beta_k[0] * float(Independent))

        dA[i, :] = df_i[-1]

        # =============================================================================================
        # COMPUTE UNCERTAINTY USING THE UNCORRELATED DATA
        # =============================================================================================

        for k in range(K):
            N_k[k] = 0
            n_old = 0

            g_k[k] = timeseries.statisticalInefficiency(
                dhdl_kn[k, n_old:(n_old + N_k_s[k])])
            print("Correlation time for sampled state %d is %10.3f" %
                  (k, g_k[k]))
            # subsample the data to get statistically uncorrelated data
            indices = np.array(
                timeseries.subsampleCorrelatedData(u_kln[k, k,
                                                         n_old:(n_old +
                                                                N_k_s[k])],
                                                   g=g_k[k]))  # subsample

            # advanced indexing puts the selected frames on the leading axis, hence the transpose below
            if len(indices) > 0:
                u_kln[k, :,
                      N_k[k]:(N_k[k] +
                              len(indices))] = u_kln_save[k, :,
                                                          (indices +
                                                           n_old)].transpose()
                N_k[k] = N_k[k] + len(indices)
                n_old += N_k_s[k]

        print("Number of retained samples")
        print(N_k)
        print("Number of retained samples from each seed")
        print(N_k_s)

        # generate the MBAR weights for each of the sampled states
        mbar = pymbar.MBAR(u_kln,
                           N_k,
                           verbose=True,
                           subsampling_protocol=[{
                               'method': 'L-BFGS-B'
                           }])

        print("MBAR Converged...")

        # extract self-consistent weights and uncertainties
        try:
            (df_u, ddf_u, theta_i) = mbar.getFreeEnergyDifferences()
        except ValueError:
            pass

        print("Free Energies Optained...")

        # convert free energy differences to kcal/mol and normalize by the number of independent molecules
        df_u /= (beta_k[0] * float(Independent))
        ddf_u /= (beta_k[0] * float(Independent))

        ddA[i, :] = ddf_u[-1]
        #        ddA[i, :] = ddf_i[-1]

        # Write out free energy differences
        print("Free Energy Difference (in units of kcal/mol)")
        print("  dA(Gamma) = A(Gamma) - A(Interactions Off)")
        for k in range(Kbig):
            print("%8.3f %8.3f" % (df_i[k, -1], ddf_u[k, -1]))

        del N_k
        del N_k_s
        del u_kln
        del dhdl_kn

    out_dA = np.zeros(len(polymorphs))
    out_ddA = np.zeros(len(polymorphs))
    for i, poly in enumerate(polymorphs):
        out_dA[i] = dA[i, 0]
        out_ddA[i] = ddA[i, 0]

    return out_dA, out_ddA
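# A call sketch for dA_MBAR(); all arguments shown are the defaults from the
# signature above, and the polymorph labels are illustrative. The call assumes
# the expected PROD.edr/dhdl_PROD.xvg directory layout exists on disk.
dA, ddA = dA_MBAR(polymorphs='p1 p2', Molecules=72, Independent=4, Temp=200)
for name, a, da in zip(['p1', 'p2'], dA, ddA):
    print('%s: %.3f +/- %.3f kcal/mol' % (name, a, da))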
Esempio n. 44
0
    mask_kt[k,0:T_k[k]] = True
# Create a list from this mask.
all_data_indices = where(mask_kt)

# Construct equal-frequency extension bins
print("binning data...")
bin_kt = zeros([K, T_max], int32)
(bin_left_boundary_i, bin_center_i, bin_width_i, bin_assignments) = construct_nonuniform_bins(x_kt[all_data_indices], nbins)
bin_kt[all_data_indices] = bin_assignments

# Compute correlation times.
N_max = 0
g_k = zeros([K], float64)
for k in range(K):
    # Compute statistical inefficiency for extension timeseries
    g = timeseries.statisticalInefficiency(x_kt[k,0:T_k[k]], x_kt[k,0:T_k[k]])
    # store statistical inefficiency
    g_k[k] = g
    print("timeseries %d : g = %.1f, %.0f uncorrelated samples (of %d total samples)" % (k+1, g, floor(T_k[k] / g), T_k[k]))
    N_max = int(max(N_max, ceil(T_k[k] / g) + 1))

# Subsample trajectory position data.
x_kn = zeros([K, N_max], float64)
bin_kn = zeros([K, N_max], int32)
N_k = zeros([K], int32)
for k in range(K):
    # Compute correlation times for potential energy and chi timeseries.
    indices = timeseries.subsampleCorrelatedData(x_kt[k,0:T_k[k]])
    # Store subsampled positions.
    N_k[k] = len(indices)
    x_kn[k,0:N_k[k]] = x_kt[k,indices]
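# construct_nonuniform_bins() is not defined in this fragment; a helper with the
# signature used above could plausibly be sketched as an equal-frequency
# (quantile-based) binning routine -- this is an assumption, not the original code:
import numpy as np

def construct_nonuniform_bins(x_n, nbins):
    # bin edges at evenly spaced quantiles, so each bin holds roughly the same number of samples
    edges = np.percentile(x_n, np.linspace(0.0, 100.0, nbins + 1))
    bin_left_boundary_i = edges[:-1]
    bin_width_i = edges[1:] - edges[:-1]
    bin_center_i = bin_left_boundary_i + 0.5 * bin_width_i
    # assign each sample to a bin index in [0, nbins - 1]
    bin_assignments = np.clip(np.digitize(x_n, edges[1:-1]), 0, nbins - 1)
    return bin_left_boundary_i, bin_center_i, bin_width_i, bin_assignments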
Esempio n. 45
0
    )  # N_k[k] is the number of uncorrelated samples from simulation index k
    reduced_expectation_data = []
    if len(expectation_columns) > 0:
        for i in range(len(expectation_columns)):
            reduced_expectation_data.append(
                numpy.zeros([K, N_samples], numpy.float64))
    reduced_fep_data = []
    if len(fep_columns) > 0:
        for i in range(len(fep_columns)):
            reduced_fep_data.append(numpy.zeros([K, N_samples], numpy.float64))
    for k in range(K):
        # Extract timeseries.
        A_t = biasing_variable_kt[0][k, :]
        # Compute statistical inefficiency.
        try:
            g = timeseries.statisticalInefficiency(A_t)
        except Exception as e:
            print(str(e))
            print(A_t)

        # Subsample data.
        if subsample_trajectories:
            indices = timeseries.subsampleCorrelatedData(A_t, g=g)
        else:
            indices = timeseries.subsampleCorrelatedData(A_t, g=1)
        N = len(indices)  # number of uncorrelated samples
        print "k = %5d : g = %.1f, N = %d" % (k, g, N)
        for i in range(nbiases):
            biasing_variable_kn[i][k, 0:N] = biasing_variable_kt[i][k, indices]
        for i in range(nperturbations + 1):
            U_kn[i][k, 0:N] = U_kt[i][k, indices]