def test_statistical_inefficiency_multiple():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiencyMultiple(X)
    timeseries.statisticalInefficiencyMultiple(X ** 2)
    timeseries.statisticalInefficiencyMultiple(X[0, :] ** 2)
    timeseries.statisticalInefficiencyMultiple(X[0:2, :] ** 2)
    timeseries.statisticalInefficiencyMultiple(energy)
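# Note: the tests above and below rely on a generate_data() fixture that is not shown on this page.
# A minimal, self-contained sketch of such a helper follows; the AR(1) process, argument names, and
# default values are illustrative assumptions, not the original fixture.
import numpy as np


def generate_data(K=3, N=10000, tau=5.0, seed=0):
    """Generate K correlated replicate series X, Y of length N plus a derived 'energy' series."""
    rng = np.random.RandomState(seed)
    f = np.exp(-1.0 / tau)  # AR(1) coefficient giving a correlation time of roughly tau
    X = np.zeros((K, N))
    Y = np.zeros((K, N))
    for k in range(K):
        x, y = rng.normal(), rng.normal()
        for n in range(N):
            x = f * x + np.sqrt(1.0 - f * f) * rng.normal()
            y = f * y + np.sqrt(1.0 - f * f) * rng.normal()
            X[k, n] = x
            Y[k, n] = y
    # Same form of 'energy' as used in the __main__ example further down the page.
    energy = 10.0 * (X**2) / 2.0 + (Y**2) / 2.0
    return X, Y, energy
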
Example #2
def test_statistical_inefficiency_multiple():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiencyMultiple(X)
    timeseries.statisticalInefficiencyMultiple(X**2)
    timeseries.statisticalInefficiencyMultiple(X[0, :]**2)
    timeseries.statisticalInefficiencyMultiple(X[0:2, :]**2)
    timeseries.statisticalInefficiencyMultiple(energy)
Example #3
         0, trajectory_segment_length)
     for k in range(K):
         # Determine which replica generated the data from temperature k at this iteration
         replica_index = replica_ik[iteration, k]
         # Reconstruct portion of replica trajectory.
         U_kt_replica[replica_index,
                      snapshot_indices] = U_kt[k, snapshot_indices]
         phi_kt_replica[replica_index,
                        snapshot_indices] = phi_kt[k, snapshot_indices]
         psi_kt_replica[replica_index,
                        snapshot_indices] = psi_kt[k, snapshot_indices]
 # Estimate the statistical inefficiency of the simulation by analyzing the timeseries of interest.
 # We use the maximum statistical inefficiency over cos and sin of the phi and psi timeseries
 # because they are periodic angles.
 print("Computing statistical inefficiencies...")
 g_cosphi = timeseries.statisticalInefficiencyMultiple(
     numpy.cos(phi_kt_replica * numpy.pi / 180.0))
 print("g_cos(phi) = %.1f" % g_cosphi)
 g_sinphi = timeseries.statisticalInefficiencyMultiple(
     numpy.sin(phi_kt_replica * numpy.pi / 180.0))
 print("g_sin(phi) = %.1f" % g_sinphi)
 g_cospsi = timeseries.statisticalInefficiencyMultiple(
     numpy.cos(psi_kt_replica * numpy.pi / 180.0))
 print("g_cos(psi) = %.1f" % g_cospsi)
 g_sinpsi = timeseries.statisticalInefficiencyMultiple(
     numpy.sin(psi_kt_replica * numpy.pi / 180.0))
 print("g_sin(psi) = %.1f" % g_sinpsi)
 # Subsample data with the maximum of all correlation times.
 print("Subsampling data...")
 g = numpy.max(numpy.array([g_cosphi, g_sinphi, g_cospsi, g_sinpsi]))
 indices = timeseries.subsampleCorrelatedData(U_kt[k, :], g=g)
 print("Using g = %.1f to obtain %d uncorrelated samples per temperature" % (g, len(indices)))
 N_max = int(numpy.ceil(T / g))  # maximum number of samples per temperature
 U_kn = numpy.zeros([K, N_max], numpy.float64)
 phi_kn = numpy.zeros([K, N_max], numpy.float64)
Example #4
    heavyIndices = np.array(heavyIndices)
    cuIndices = np.array(cuIndices)

    #Load in the potential energies, INCLUDING RESTRAINT, at all states for this simulation to figure out frames to skip
    alcDat = np.loadtxt(alchemicalFile)
    startTime = alcDat[0, 1]
    # Be careful here: the write frequency in the alchemical file needs to match the positions exactly,
    # AND we assume frames were written in 1 ps increments.
    # Also, the first frame in the trajectory is NOT at time zero, so subtract 1.
    startFrame = int(startTime) - 1
    if endTime == -1:
        thisPot = alcDat[:, 3:-1]
    else:
        thisPot = alcDat[:endTime, 3:-1]
    thisg = timeseries.statisticalInefficiencyMultiple(thisPot)
    print("Statistical inefficiency for this set of potential energies: %f" %
          thisg)

    #print(startTime)
    #print(startFrame)
    #print(thisPot.shape)

    #Next load in the trajectory and get all solute coordinates that matter
    top.rb_torsions = pmd.TrackedList([])
    top = pt.load_parmed(top, traj=False)
    if endTime == -1:
        traj = pt.iterload(trajFile, top, frame_slice=(startFrame, -1))
    else:
        traj = pt.iterload(trajFile,
                           top,
Example #5
if __name__ == "__main__" :        
   var=numpy.ones(N)
   for replica in xrange(2,K+1):
      var=numpy.concatenate((var,numpy.ones(N)))      
   X=numpy.random.normal(numpy.zeros(K*N), var).reshape((K,N))/10.0
   Y=numpy.random.normal(numpy.zeros(K*N), var).reshape((K,N))

#   X=numpy.random.normal(numpy.zeros(K*N), var).reshape((K,N))
#   Y=numpy.random.normal(numpy.zeros(K*N), var).reshape((K,N))

#   print "X.shape = "
#   print X.shape
   energy = 10*(X**2)/2.0 + (Y**2)/2.0

   print "statisticalInefficiencyMultiple(X)"
   print timeseries.statisticalInefficiencyMultiple(X)
   print "statisticalInefficiencyMultiple(X**2)"
   print timeseries.statisticalInefficiencyMultiple(X**2)
   print "statisticalInefficiencyMultiple(X[0,:]**2)"
   print timeseries.statisticalInefficiencyMultiple(X[0,:]**2)
   print "statisticalInefficiencyMultiple(X[0:2,:]**2)"
   print timeseries.statisticalInefficiencyMultiple(X[0:2,:]**2)      
   print "statisticalInefficiencyMultiple(energy)"
   print timeseries.statisticalInefficiencyMultiple(energy)
   
   # Exit with success.
   # TODO: Add some checks to test statistical inefficinecies are within normal expected range.
   sys.exit(0)
   
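# A possible shape for the checks mentioned in the TODO above (a sketch, not part of the original
# test; it assumes the usual imports `import numpy` and `from pymbar import timeseries`): the
# statistical inefficiency of nearly uncorrelated Gaussian noise should stay close to 1.
uncorrelated = numpy.random.normal(size=(2, 5000))
g_uncorr = timeseries.statisticalInefficiencyMultiple(uncorrelated)
assert 0.5 < g_uncorr < 3.0, "inefficiency of uncorrelated data should be near 1"
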
Example #6
def summarize_timeseries(concentrations):
    """
    Use boostrap sampling together with statisticalInefficiencyMultiple to calculate the mean correlation function and
    its 95% confidence intervals as well as the estimated autocorrelation time.

    Parameters
    ----------
    concentrations: np.ndarray
        Arrays with different time series data in each row.

    Returns
    -------
    mean_corr_func, lower, upper: numpy.ndarray
        the mean, 5th percentile, and 97.5 percentile of the estimated autocorrelation function.
    auto_corr_time_mean, auto_corr_time_std
        the mean autocorrelation time with its standard error.
    """
    boot_samples = 50
    auto_corr_time = np.zeros(boot_samples)
    corr_data = []
    for sample in range(boot_samples):
        # Resample the replicate time series (rows) with replacement.
        ints = np.random.choice(len(concentrations), len(concentrations))
        concs = [concentrations[i, :] for i in ints]
        g, c = timeseries.statisticalInefficiencyMultiple(
            concs, return_correlation_function=True, fast=False)
        auto_corr_time[sample] = (g - 1) / 2.0
        corr_data += c

    # The correlation time may be computed up to a different maximum time for different bootstrap samples.
    # Finding the maximum time.
    max_time = 0
    for tup in corr_data:
        if tup[0] > max_time:
            max_time = tup[0]

    # Unpacking each bootstrapped correlation function for easier analysis
    unpacked_corr_func = {}
    for i in range(max_time):
        unpacked_corr_func[i] = []

    for data in corr_data:
        unpacked_corr_func[data[0] - 1].append(data[1])

    # Working out the confidence intervals.
    mean_corr_func = np.zeros(max_time)
    lower = np.zeros(max_time)
    upper = np.zeros(max_time)
    for i in range(max_time):
        mean_corr_func[i] = np.mean(unpacked_corr_func[i])
        lower[i] = np.percentile(unpacked_corr_func[i], q=2.5)
        upper[i] = np.percentile(unpacked_corr_func[i], q=97.5)

    # Once the lower estimate first hits zero, ensure that all subsequent data points are also zero.
    zero_crossings = np.where(lower <= 0.0)[0]
    if len(zero_crossings) > 0:
        lower[zero_crossings[0]:] = 0

    return mean_corr_func, lower, upper, auto_corr_time.mean(), auto_corr_time.std()
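# A minimal usage sketch (not from the original source): feed in a few synthetic AR(1)
# "concentration" traces; the generator, its parameters, and the trace count are illustrative
# assumptions chosen only to exercise the function.
import numpy as np

rng = np.random.RandomState(1)
f = np.exp(-1.0 / 20.0)  # correlation time of roughly 20 steps
traces = np.zeros((3, 5000))
for i in range(3):
    x = rng.normal()
    for t in range(5000):
        x = f * x + np.sqrt(1.0 - f * f) * rng.normal()
        traces[i, t] = x

mean_cf, low, high, tau_mean, tau_std = summarize_timeseries(traces)
print("estimated autocorrelation time: %.1f +/- %.1f steps" % (tau_mean, tau_std))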
Example #7
def umbrella_PMF(x_kn,
                 data_path,
                 eq,
                 k_bias,
                 save_path,
                 temps_to_use='All',
                 save=True,
                 max_time=None):
    """
    CURRENT FUNCTION USED TO COMPUTE SUBSTRUCTURE AND 1D PMF 
    
    x_kn is some list of coordiante values along which you want to compute free energies
    We assume that x_kn is either an array where each row is a timecourse at a given condition, and the rows are in order of increasing reporter values (ex. temp or setpoint)
    OR
    that x_kn is a 1D array where a new trajectory starts every t values, and the trajectories are in order of increasing reporter values
    You can also enter None, in whcih case x_kn is simply the native contacts
    Alternatively you can enter x_kn as a string, for instnace 'rmsd', in whcih case the program extracts that variale from the data
    
    IMPORTANT: YOU NEED TO MAKE SURE THAT THE POINTS IN X_KN CORRESPOND TO SAME TIMEPOINTS AS THE POINTS IN THE DATA FILE...IF IT DOESN'T, THEN
    THE FUNCTION WILL TRY TO FIX IT BY SUBSAMPLING THE X_KN, BUT I DON'T TRUST THIS...
    
    data_path tells you where the native contacts data is located
    
    eq tells you how many steps you want to leave out initially while the simulations equilibrate
    
    k_bias is the spring constant
  
    
    save_path is where you want to save the results, for instance 'ADK_umbrella_multistart/Substructure_PMF.dat'
    
    As a model for this, see pymbar/examples/umbrella-sampling-pmf/umbrella-sampling.py
    May need to download this from github, not sure it's on home computer
    
    
    By the way, on 3/17/20, added a parameter max_time, which is the last MC step to be used in equliibrium calculations
    Typically, we use everything from eq till the end of the simulation (which is the case if max_time has its default value of None)
    But if you set some numerical value for max_time, then we'll use somethign else
    For instnace, if eq = 0 and max_time = 100000000, then we'll only use the first 100000000 MC timesteps to compute PMF
    BUt if eq is set to, say, 150000000 and max_time is kept at its default value of None, then we use everything from 150000000 and beyond
    to compute PMF
    """
    print("# loading data...")

    log_file_data, temperatures, setpoints, log_files, times, variables = load_data.load_log_data(
        data_path)

    energies_index = variables.index('energy')
    energies = log_file_data[:, :, energies_index]
    #

    if 'natives' in variables:
        natives_index = variables.index('natives')
        natives = log_file_data[:, :, natives_index]
    else:  #assume no umbrella biasing
        natives = np.zeros(np.shape(energies))
        k_bias = 0

    if type(x_kn) == str:
        x_kn = log_file_data[:, :, variables.index(x_kn)]
    elif np.shape(x_kn) == ():
        x_kn = natives

    setpoints = np.array(setpoints)
    temperatures = np.array(temperatures)
    n_conditions = np.shape(natives)[0]

    x_kn = np.array(x_kn)

    del log_file_data

    if x_kn.ndim != 2:
        x_kn = np.reshape(x_kn, (n_conditions, int(len(x_kn) / n_conditions)))

    if temps_to_use != "All":
        indices_to_use = [
            t for t, temp in enumerate(temperatures) if temp in temps_to_use
        ]
        natives = natives[indices_to_use, :]
        energies = energies[indices_to_use, :]
        x_kn = x_kn[indices_to_use, :]
        n_conditions = len(indices_to_use)
        temperatures = temperatures[indices_to_use]
        setpoints = setpoints[indices_to_use]

    sample_frequency = int(np.shape(natives)[1] / np.shape(x_kn)[1])
    keep = np.arange(0, np.shape(natives)[1], sample_frequency)
    natives = natives[:, keep]
    energies = energies[:, keep]

    times = np.array([times[t] for t in range(len(times)) if t in keep])

    eq_index = np.where(times == eq)[0][0]

    if max_time is None:
        natives = natives[:, eq_index:]
        energies = energies[:, eq_index:]
        x_kn = x_kn[:, eq_index:]
    else:
        max_index = np.where(times == max_time)[0][0]
        natives = natives[:, eq_index:max_index]
        energies = energies[:, eq_index:max_index]
        x_kn = x_kn[:, eq_index:max_index]

        print(times[eq_index:max_index])

    n_timepoints = np.shape(natives)[1]

    print("# calculating potential...")

    N_k = np.array([n_timepoints for k in range(n_conditions)], np.int32)

    #u_kn=np.zeros()

    u_kln = np.zeros((n_conditions, n_conditions, n_timepoints))
    #ukln tells you the reduced potential energy (energy/kbT + spring cost) that
    #point n from condition k would experience if it were to occur in some (other)
    #condition l
    for k in range(n_conditions):
        for n in range(n_timepoints):
            u_kln[k, :, n] = energies[k, n] / temperatures + k_bias * (
                natives[k, n] - setpoints)**2
        #Have to add in bias by hand since the energies term from log files does not include that bias!

    print("# Computing normalizations...")
    # This initialization computes the log partition functions for all conditions (temperature/bias combinations).
    mbar = pymbar.MBAR(u_kln, N_k)
    #dF = mbar.getFreeEnergyDifferences()[0][0,:]

    #In these previous steps, we compute the full trace (partition function) for all conditions (setpoint and temp combinations)...

    #We will now compute the free energies (partial trace over only snapshots assigned to a state) under a DIFFERENT condition (which was not represented
    #in the conditions whose normalizations we just calculated)--namely: the state in which you have no bias

    unique_temperatures = np.unique(temperatures)

    unique_x = np.unique(x_kn)

    print('Computing state free energies...')

    x_n = x_kn.flatten()
    nbins = len(unique_x)
    bin_n = np.array([np.where(unique_x == x)[0][0] for x in x_n])
    free_energies = np.zeros((len(unique_temperatures), len(unique_x)))
    uncertainties = np.zeros((len(unique_temperatures), len(unique_x)))

    for t, temp in enumerate(unique_temperatures):
        print("Computing free energy at T={}".format(temp))
        # Reduced potential energy at the temperature we care about.
        # We do NOT include the bias here because we want to compute the PMF specifically under the condition of no bias.
        u_n = energies.flatten() / unique_temperatures[t]
        #f_i, df_i = mbar.computePMF(u_n, bin_n, nbins, uncertainties='from-normalization')
        f_i, df_i = mbar.computePMF(u_n,
                                    bin_n,
                                    nbins,
                                    uncertainties='from-lowest')
        #f_i, df_i = mbar.computePMF(u_n, bin_n, nbins, uncertainties='all-differences')

        f_i = f_i - np.min(f_i)  #set the lowest free energy to 0
        #f_i=f_i+np.log(np.sum(np.exp(-f_i)))  #normalize--this doesn't work well because you get overflow error
        free_energies[t, :] = f_i
        uncertainties[t, :] = df_i
    """
    As for the uncertainties: Since I was not computing these for many of my previous proteins, I do not want to make a new variable
    to avoid creating confusion with number of variables to be loaded by joblib
    Rather, what I will do from now on is append the uncertainties to a second page of the free_energies array
    Also, the uncertainties are scaled by sqrt(N/N_eff), where N is total nubmer of samples, and N_eff is effective number of uncorrelated samples
    """

    #First, compute statistical inefficiency using all data

    g = timeseries.statisticalInefficiencyMultiple(natives)
    NNN = len(x_n)
    N_eff = NNN / g
    uncertainties = uncertainties * np.sqrt(NNN / N_eff)

    free_energies = np.stack((free_energies, uncertainties), axis=2)

    if save: joblib.dump([unique_x, free_energies, temperatures], save_path)
    return unique_x, unique_temperatures, free_energies
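# A hypothetical call (all paths and numeric values below are placeholders, not taken from the
# original project): compute a PMF along the native-contacts coordinate (x_kn=None) using every
# condition, discarding the first 150000000 MC steps as equilibration.
bin_values, temps, pmf = umbrella_PMF(None,
                                      data_path='ADK_umbrella_multistart/',
                                      eq=150000000,
                                      k_bias=0.02,
                                      save_path='ADK_umbrella_multistart/Substructure_PMF.dat',
                                      save=False)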
      pickle_file = open(data_pickle_fn, 'wb')
      dump( (U_kn_correlated, A_ikn_correlated), pickle_file )
      pickle_file.close()

print ""

#######################################################################
#           Subsample {U,A}_kn_correlated to be uncorrelated          #
#######################################################################

print "Subsampling to achieve uncorrelated data"
if stat_inefficiency == None:
   print "(1 of 2) Calculating statistical inefficiency (i = ",
   stdout.flush()
   for d in range(N_CVs):
      statnew = timeseries.statisticalInefficiencyMultiple(A_ikn_correlated[d])
      stat_inefficiency = max([stat_inefficiency, statnew])
   print stat_inefficiency, ")"
else:
   print "(1 of 2) Using given statistical inefficiency (i =", str(stat_inefficiency) + ")"

indices = timeseries.subsampleCorrelatedData(U_kn_correlated[0,:], g = stat_inefficiency)
N_uncorrelated_samples = len(indices)

print "(2 of 2) Subsampling to achieve", N_uncorrelated_samples, "samples per replica"

U_kn  = zeros([      N_replicas+N_output_temps,N_uncorrelated_samples], float32)
A_ikn = zeros([N_CVs,N_replicas+N_output_temps,N_uncorrelated_samples], float32)

for k in range(N_replicas):
   U_kn[k] = U_kn_correlated[k][indices]
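   # Plausible continuation (an assumption, not shown in the original snippet): subsample each
   # collective variable with the same uncorrelated indices.
   for d in range(N_CVs):
      A_ikn[d, k] = A_ikn_correlated[d][k][indices]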