Beispiel #1
0
def make_replica_dcd_files(topology,
                           timestep=5 * unit.femtosecond,
                           time_interval=200,
                           output_dir="output",
                           output_data="output.nc",
                           checkpoint_data="output_checkpoint.nc",
                           frame_begin=0,
                           frame_stride=1):
    """
    Make dcd files from replica exchange simulation trajectory data.
    
    :param topology: OpenMM Topology
    :type topology: `Topology() <https://simtk.org/api_docs/openmm/api4_1/python/classsimtk_1_1openmm_1_1app_1_1topology_1_1Topology.html>`_
    
    :param timestep: Time step used in the simulation (default=5*unit.femtosecond)
    :type timestep: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>` float * simtk.unit
    
    :param time_interval: frequency, in number of time steps, at which positions were recorded (default=200)
    :type time_interval: int
    
    :param output_directory: path to which we will write the output (default='output')
    :type output_directory: str
    
    :param output_data: name of output .nc data file (default='output.nc')
    :type output_data: str    
    
    :param checkpoint_data: name of checkpoint .nc data file (default='output_checkpoint.nc')
    :type checkpoint_data: str   
    
    :param frame_begin: Frame at which to start writing the dcd trajectory (default=0)
    :type frame_begin: int
    
    :param frame_stride: advance by this many time intervals when writing dcd trajectories (default=1)
    :type frame_stride: int 
    """

    file_list = []

    output_data_path = os.path.join(output_dir, output_data)

    # Get number of replicas:
    reporter = MultiStateReporter(output_data_path, open_mode="r")
    states = reporter.read_thermodynamic_states()[0]
    n_replicas = len(states)

    sampler_states = reporter.read_sampler_states(iteration=0)
    xunit = sampler_states[0].positions[0].unit

    for replica_index in range(n_replicas):
        replica_trajectory = extract_trajectory(
            topology,
            replica_index=replica_index,
            output_data=output_data_path,
            checkpoint_data=checkpoint_data,
            frame_begin=frame_begin,
            frame_stride=frame_stride)

        file_name = f"{output_dir}/replica_{replica_index+1}.dcd"
        file = open(file_name, "wb")
        dcd_file = DCDFile(file,
                           topology,
                           timestep,
                           firstStep=frame_begin,
                           interval=time_interval)

        for positions in replica_trajectory:
            # Add the units consistent with replica_energies
            positions *= xunit
            DCDFile.writeModel(dcd_file, positions)

        file.close()
        file_list.append(file_name)

    return file_list
Beispiel #2
0
def process_replica_exchange_data(
    output_data="output/output.nc",
    output_directory="output",
    series_per_page=4,
    write_data_file=True,
    plot_production_only=False,
    print_timing=False,
    equil_nskip=1,
    frame_begin=0,
    frame_end=-1,
):
    """
    Read replica exchange simulation data, detect equilibrium and decorrelation time, and plot replica exchange results.
    
    :param output_data: path to output .nc file from replica exchange simulation, (default='output/output.nc')
    :type output_data: str
    
    :param output_directory: path to which output files will be written (default='output')
    :type output_directory: stry

    :param series_per_page: number of replica data series to plot per pdf page (default=4)
    :type series_per_page: int
    
    :param write_data_file: Option to write a text data file containing the state_energies array (default=True)
    :type write_data_file: Boolean
    
    :param plot_production_only: Option to plot only the production region, as determined from pymbar detectEquilibration (default=False)
    :type plot_production_only: Boolean    

    :param equil_nskip: skip this number of frames to sparsify the energy timeseries for pymbar detectEquilibration (default=1) - this is used only when frame_begin=0 and the trajectory has less than 40000 frames.
    :type equil_nskip: Boolean
    
    :param frame_begin: analyze starting from this frame, discarding all prior as equilibration period (default=0)
    :type frame_begin: int
    
    :param frame_end: analyze up to this frame only, discarding the rest (default=-1).
    :type frame_end: int

    :returns:
        - replica_energies ( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ( np.float( [number_replicas,number_simulation_steps] ), simtk.unit ) ) - The potential energies for all replicas at all (printed) time steps
        - replica_state_indices ( np.int64( [number_replicas,number_simulation_steps] ), simtk.unit ) - The thermodynamic state assignments for all replicas at all (printed) time steps
        - production_start ( int - The frame at which the production region begins for all replicas, as determined from pymbar detectEquilibration
        - sample_spacing ( int - The number of frames between uncorrelated state energies, estimated using heuristic algorithm )
        - n_transit ( np.float( [number_replicas] ) ) - Number of half-transitions between state 0 and n for each replica
        - mixing_stats ( tuple ( np.float( [number_replicas x number_replicas] ) , np.float( [ number_replicas ] ) , float( statistical inefficiency ) ) ) - transition matrix, corresponding eigenvalues, and statistical inefficiency
    """

    t1 = time.perf_counter()

    # Read the simulation coordinates for individual temperature replicas
    reporter = MultiStateReporter(output_data, open_mode="r")

    t2 = time.perf_counter()
    if print_timing:
        print(f"open data time: {t2-t1}")

    # figure out what the time between output is.
    # We assume all use the same time step (which i think is required)

    mcmove = reporter.read_mcmc_moves()[0]
    time_interval = mcmove.n_steps * mcmove.timestep

    t3 = time.perf_counter()
    if print_timing:
        print(f"read_mcmc_moves time: {t3-t2}")

    # figure out what the temperature list is
    states = reporter.read_thermodynamic_states()[0]

    t4 = time.perf_counter()
    if print_timing:
        print(f"read_thermodynamics_states time: {t4-t3}")

    temperature_list = []
    for s in states:
        temperature_list.append(s.temperature)

    analyzer = ReplicaExchangeAnalyzer(reporter)

    t5 = time.perf_counter()

    (
        replica_energies,
        unsampled_state_energies,
        neighborhoods,
        replica_state_indices,
    ) = analyzer.read_energies()

    # Truncate output of read_energies() to last frame of interest
    if frame_end > 0:
        # Use frames from frame_begin to frame_end
        replica_energies = replica_energies[:, :, :frame_end]
        unsampled_state_energies = unsampled_state_energies[:, :, :frame_end]
        neighborhoods = neighborhoods[:, :, :frame_end]
        replica_state_indices = replica_state_indices[:, :frame_end]

    t6 = time.perf_counter()
    if print_timing:
        print(f"read_energies time: {t6-t5}")

    n_particles = np.shape(
        reporter.read_sampler_states(iteration=0)[0].positions)[0]
    temps = np.array([temp._value for temp in temperature_list])
    beta_k = 1 / (kB * temps)
    n_replicas = len(temperature_list)
    for k in range(n_replicas):
        replica_energies[:, k, :] *= beta_k[k]**(-1)

    t7 = time.perf_counter()
    if print_timing:
        print(f"reduce replica energies time: {t7-t6}")

    total_steps = len(replica_energies[0][0])
    state_energies = np.zeros([n_replicas, total_steps])

    t8 = time.perf_counter()
    # there must be some better way to do this as list comprehension.
    for step in range(total_steps):
        for state in range(n_replicas):
            state_energies[state, step] = replica_energies[np.where(
                replica_state_indices[:, step] == state)[0], 0, step]

    t9 = time.perf_counter()
    if print_timing:
        print(f"assign state energies time: {t9-t8}")

    # can run physical-valication on these state_energies

    # Use pymbar timeseries module to detect production period

    t10 = time.perf_counter()

    # Start of equilibrated data:
    t0 = np.zeros((n_replicas))
    # Statistical inefficiency:
    g = np.zeros((n_replicas))

    subsample_indices = {}

    # If sufficiently large, discard the first 20000 frames as equilibration period and use
    # subsampleCorrelatedData to get the energy decorrelation time.
    if total_steps >= 40000 or frame_begin > 0:
        if frame_begin > 0:
            # If specified, use frame_begin as the start of the production region
            production_start = frame_begin
        else:
            # Otherwise, use frame 20000
            production_start = 20000

        for state in range(n_replicas):
            subsample_indices[state] = timeseries.subsampleCorrelatedData(
                state_energies[state][production_start:],
                conservative=True,
            )
            g[state] = subsample_indices[state][1] - subsample_indices[state][0]

    else:
        # For small trajectories, use detectEquilibration
        for state in range(n_replicas):
            t0[state], g[state], Neff_max = timeseries.detectEquilibration(
                state_energies[state], nskip=equil_nskip)

            # Choose the latest equil timestep to apply to all states
            production_start = int(np.max(t0))

    # Assume a normal distribution (very rough approximation), and use mean plus
    # the number of standard deviations which leads to (n_replica-1)/n_replica coverage
    # For 12 replicas this should be the mean + 1.7317 standard deviations

    # x standard deviations is the solution to (n_replica-1)/n_replica = erf(x/sqrt(2))
    # This is equivalent to a target of 23/24 CDF value

    print(f"g: {g.astype(int)}")

    def erf_fun(x):
        return np.power((erf(x / np.sqrt(2)) - (n_replicas - 1) / n_replicas),
                        2)

    # x must be larger than zero
    opt_g_results = minimize_scalar(erf_fun, bounds=(0, 10))

    if not opt_g_results.success:
        print("Error solving for correlation time, exiting...")
        print(f"erf opt results: {opt_g_results}")
        exit()

    sample_spacing = int(np.ceil(np.mean(g) + opt_g_results.x * np.std(g)))

    t11 = time.perf_counter()
    if print_timing:
        print(f"detect equil and subsampling time: {t11-t10}")

    print("state    mean energies  variance")
    for state in range(n_replicas):
        state_mean = np.mean(state_energies[state,
                                            production_start::sample_spacing])
        state_std = np.std(state_energies[state,
                                          production_start::sample_spacing])
        print(f"  {state:4d}    {state_mean:10.6f} {state_std:10.6f}")

    t12 = time.perf_counter()

    if write_data_file == True:
        f = open(os.path.join(output_directory, "replica_energies.dat"), "w")
        for step in range(total_steps):
            f.write(f"{step:10d}")
            for replica_index in range(n_replicas):
                f.write(
                    f"{replica_energies[replica_index,replica_index,step]:12.6f}"
                )
            f.write("\n")
        f.close()

    t13 = time.perf_counter()
    if print_timing:
        print(f"Optionally write .dat file: {t13-t12}")

    t14 = time.perf_counter()

    if plot_production_only == True:
        plot_replica_exchange_energies(
            state_energies[:, production_start:],
            temperature_list,
            series_per_page,
            time_interval=time_interval,
            time_shift=production_start * time_interval,
            file_name=f"{output_directory}/rep_ex_ener.pdf",
        )

        plot_replica_exchange_energy_histograms(
            state_energies[:, production_start:],
            temperature_list,
            file_name=f"{output_directory}/rep_ex_ener_hist.pdf",
        )

        plot_replica_exchange_summary(
            replica_state_indices[:, production_start:],
            temperature_list,
            series_per_page,
            time_interval=time_interval,
            time_shift=production_start * time_interval,
            file_name=f"{output_directory}/rep_ex_states.pdf",
        )

        plot_replica_state_matrix(
            replica_state_indices[:, production_start:],
            file_name=f"{output_directory}/state_probability_matrix.pdf",
        )

    else:
        plot_replica_exchange_energies(
            state_energies,
            temperature_list,
            series_per_page,
            time_interval=time_interval,
            file_name=f"{output_directory}/rep_ex_ener.pdf",
        )

        plot_replica_exchange_energy_histograms(
            state_energies,
            temperature_list,
            file_name=f"{output_directory}/rep_ex_ener_hist.pdf",
        )

        plot_replica_exchange_summary(
            replica_state_indices,
            temperature_list,
            series_per_page,
            time_interval=time_interval,
            file_name=f"{output_directory}/rep_ex_states.pdf",
        )

        plot_replica_state_matrix(
            replica_state_indices,
            file_name=f"{output_directory}/state_probability_matrix.pdf",
        )

    t15 = time.perf_counter()

    if print_timing:
        print(f"plotting time: {t15-t14}")

    # Analyze replica exchange state transitions
    # For each replica, how many times does the thermodynamic state go between state 0 and state n
    # For consistency with the other mixing statistics, use only the production region here

    replica_state_indices_prod = replica_state_indices[:, production_start:]

    # Number of one-way transitions from states 0 to n or states n to 0
    n_transit = np.zeros((n_replicas, 1))

    # Replica_state_indices is [n_replicas x n_iterations]
    for rep in range(n_replicas):
        last_bound = None
        for i in range(replica_state_indices_prod.shape[1]):
            if replica_state_indices_prod[
                    rep, i] == 0 or replica_state_indices_prod[rep, i] == (
                        n_replicas - 1):
                if last_bound is None:
                    # This is the first time state 0 or n is visited
                    pass
                else:
                    if last_bound != replica_state_indices_prod[rep, i]:
                        # This is a completed transition from 0 to n or n to 0
                        n_transit[rep] += 1
                last_bound = replica_state_indices_prod[rep, i]

    t16 = time.perf_counter()

    if print_timing:
        print(f"replica transition analysis: {t16-t15}")

    # Compute transition matrix from the analyzer
    mixing_stats = analyzer.generate_mixing_statistics(
        number_equilibrated=production_start)

    t17 = time.perf_counter()

    if print_timing:
        print(f"compute transition matrix: {t17-t16}")
        print(f"total time elapsed: {t17-t1}")

    return (replica_energies, replica_state_indices, production_start,
            sample_spacing, n_transit, mixing_stats)
Beispiel #3
0
def make_state_pdb_files(topology,
                         output_dir="output",
                         output_data="output.nc",
                         checkpoint_data="output_checkpoint.nc",
                         frame_begin=0,
                         frame_stride=1,
                         center=True):
    """
    Make pdb files by state from replica exchange simulation trajectory data.
    Note: these are discontinuous trajectories with constant temperature state.
    
    :param topology: OpenMM Topology
    :type topology: `Topology() <https://simtk.org/api_docs/openmm/api4_1/python/classsimtk_1_1openmm_1_1app_1_1topology_1_1Topology.html>`_
    
    :param output_directory: path to which we will write the output (default='output')
    :type output_directory: str
    
    :param output_data: name of output .nc data file (default='output.nc')
    :type output_data: str    
    
    :param checkpoint_data: name of checkpoint .nc data file (default='output_checkpoint.nc')
    :type checkpoint_data: str   
    
    :param frame_begin: Frame at which to start writing the pdb trajectory (default=0)
    :type frame_begin: int    
    
    :param frame_stride: advance by this many frames when writing pdb trajectories (default=1)
    :type frame_stride: int   

    :param center: align the center of mass of each structure in the discontinuous state trajectory (default=True)
    :type center: Boolean
    
    :returns:
        - file_list ( List( str ) ) - A list of names for the files that were written
    """
    file_list = []

    output_data_path = os.path.join(output_dir, output_data)

    # Get number of states:
    reporter = MultiStateReporter(output_data_path, open_mode="r")
    states = reporter.read_thermodynamic_states()[0]

    sampler_states = reporter.read_sampler_states(iteration=0)
    xunit = sampler_states[0].positions[0].unit

    for state_index in range(len(states)):
        state_trajectory = extract_trajectory(topology,
                                              state_index=state_index,
                                              output_data=output_data_path,
                                              checkpoint_data=checkpoint_data,
                                              frame_begin=frame_begin,
                                              frame_stride=frame_stride)

        file_name = f"{output_dir}/state_{state_index+1}.pdb"
        file = open(file_name, "w")

        PDBFile.writeHeader(topology, file=file)
        modelIndex = 1

        # TODO: replace this with MDTraj alignment tool
        if center == True:
            center_x = np.mean(state_trajectory[0, :, 0])
            center_y = np.mean(state_trajectory[0, :, 1])
            center_z = np.mean(state_trajectory[0, :, 2])

        for positions in state_trajectory:
            if center == True:
                positions[:, 0] += (center_x - np.mean(positions[:, 0]))
                positions[:, 1] += (center_y - np.mean(positions[:, 1]))
                positions[:, 2] += (center_z - np.mean(positions[:, 2]))

            # Add the units consistent with replica_energies
            positions *= xunit

            PDBFile.writeModel(topology,
                               positions,
                               file=file,
                               modelIndex=modelIndex)

        PDBFile.writeFooter(topology, file=file)

        file.close()
        file_list.append(file_name)

    return file_list
Beispiel #4
0
def make_state_dcd_files(topology,
                         timestep=5 * unit.femtosecond,
                         time_interval=200,
                         output_dir="output",
                         output_data="output.nc",
                         checkpoint_data="output_checkpoint.nc",
                         frame_begin=0,
                         frame_stride=1,
                         center=True):
    """
    Make dcd files by state from replica exchange simulation trajectory data.
    Note: these are discontinuous trajectories with constant temperature state.
    
    :param topology: OpenMM Topology
    :type topology: `Topology() <https://simtk.org/api_docs/openmm/api4_1/python/classsimtk_1_1openmm_1_1app_1_1topology_1_1Topology.html>`_
    
    :param timestep: Time step used in the simulation (default=5*unit.femtosecond)
    :type timestep: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>` float * simtk.unit
    
    :param time_interval: frequency, in number of time steps, at which positions were recorded (default=200)
    :type time_interval: int
    
    :param output_directory: path to which we will write the output (default='output')
    :type output_directory: str
    
    :param output_data: name of output .nc data file (default='output.nc')
    :type output_data: str    
    
    :param checkpoint_data: name of checkpoint .nc data file (default='output_checkpoint.nc')
    :type checkpoint_data: str   
    
    :param frame_begin: Frame at which to start writing the dcd trajectory (default=0)
    :type frame_begin: int
    
    :param frame_stride: advance by this many time intervals when writing dcd trajectories (default=1)
    :type frame_stride: int 
    
    :param center: align the center of mass of each structure in the discontinuous state trajectory (default=True)
    :type center: Boolean
    
    """

    file_list = []

    output_data_path = os.path.join(output_dir, output_data)

    # Get number of states:
    reporter = MultiStateReporter(output_data_path, open_mode="r")
    states = reporter.read_thermodynamic_states()[0]

    sampler_states = reporter.read_sampler_states(iteration=0)
    xunit = sampler_states[0].positions[0].unit

    for state_index in range(len(states)):
        state_trajectory = extract_trajectory(topology,
                                              state_index=state_index,
                                              output_data=output_data_path,
                                              checkpoint_data=checkpoint_data,
                                              frame_begin=frame_begin,
                                              frame_stride=frame_stride)

        file_name = f"{output_dir}/state_{state_index+1}.dcd"
        file = open(file_name, "wb")
        dcd_file = DCDFile(file,
                           topology,
                           timestep,
                           firstStep=frame_begin,
                           interval=time_interval)

        # TODO: replace this with MDTraj alignment tool
        if center == True:
            center_x = np.mean(state_trajectory[0, :, 0])
            center_y = np.mean(state_trajectory[0, :, 1])
            center_z = np.mean(state_trajectory[0, :, 2])

        for positions in state_trajectory:
            if center == True:
                positions[:, 0] += (center_x - np.mean(positions[:, 0]))
                positions[:, 1] += (center_y - np.mean(positions[:, 1]))
                positions[:, 2] += (center_z - np.mean(positions[:, 2]))

            # Add the units consistent with replica_energies
            positions *= xunit
            DCDFile.writeModel(dcd_file, positions)

        file.close()
        file_list.append(file_name)

    return file_list
Beispiel #5
0
def bootstrap_free_energy_folding(Q,
                                  Q_folded,
                                  output_data="output/output.nc",
                                  frame_begin=0,
                                  sample_spacing=1,
                                  n_trial_boot=200,
                                  num_intermediate_states=0,
                                  conf_percent='sigma',
                                  plotfile_dir="output"):
    """
    Function for computing uncertainty of free energy, entropy, and enthalpy using bootstrapping with varying starting frames.

    :param Q: native contact fraction array of size [n_frames x n_states] (with equilibration region already trimmed)
    :type Q: 2D numpy array ( float )
    
    :param Q_folded: threshold for a native contact fraction corresponding to a folded state (Q[i,j] is folded if Q[i,j] >= Q_folded)
    :type Q_folded: float    
    
    :param output_data: Path to the simulation .nc file.
    :type output_data: str    
    
    :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0)
    :type frame_begin: int    
    
    :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData
    :type sample_spacing: int     
    
    :param n_trial_boot: number of trials to run for generating bootstrapping uncertainties (default=200)
    :type n_trial_boot: int
    
    :param num_intermediate_states: Number of unsampled thermodynamic states between sampled states to include in the calculation
    :type num_intermediate_states: int
    
    :returns:
      - full_T_list - A 1D numpy array listing of all temperatures, including sampled and intermediate unsampled
      - deltaF_values - A dictionary of the form {"statei_statej": 1D numpy array}, containing free energy change for each T in
                        full_T_list, for each conformational state transition.
      - deltaF uncertainty - A dictionary containing tuple of 1D numpy arrays of lower/upper of uncertainties corresponding to deltaF_values  
      - deltaS_values - A dictionary of the form {"statei_statej": 1D numpy array}, containing entropy change for each T in
                        full_T_list, for each conformational state transition.
      - deltaS uncertainty - A dictionary containing tuple of 1D numpy arrays of lower/upper uncertainties corresponding to deltaS_values 
      - deltaU_values - A dictionary of the form {"statei_statej": 1D numpy array}, containing enthalpy change for each T in
                        full_T_list, for each conformational state transition.
      - deltaU uncertainty - A dictionary containing tuple of 1D numpy arrays of lower/upper of uncertainties corresponding to deltaU_values
    """

    # extract reduced energies and the state indices from the .nc
    reporter = MultiStateReporter(output_data, open_mode="r")
    analyzer = ReplicaExchangeAnalyzer(reporter)
    (
        replica_energies_all,
        unsampled_state_energies,
        neighborhoods,
        replica_state_indices,
    ) = analyzer.read_energies()

    # Get temperature_list from .nc file:
    states = reporter.read_thermodynamic_states()[0]

    temperature_list = []
    for s in states:
        temperature_list.append(s.temperature)

    # Select production frames to analyze
    replica_energies_prod = replica_energies_all[:, :, frame_begin::]

    # For shifting reference frame bootstrap, we need the entire Q and energy arrays starting from frame_start

    if np.shape(replica_energies_prod)[2] != np.shape(Q)[0]:
        print(
            f'Error: Q array of shape {Q.shape} incompatible with energies array of shape{replica_energies_prod.shape}'
        )
        exit()

    Q_all = Q

    # Overall results:
    deltaF_values = {}
    deltaF_uncertainty = {}
    deltaS_values = {}
    deltaS_uncertainty = {}
    deltaU_values = {}
    deltaU_uncertainty = {}

    # Uncertainty for each sampling trial:
    deltaF_values_boot = {}
    deltaF_uncertainty_boot = {}

    deltaS_values_boot = {}
    deltaS_uncertainty_boot = {}

    deltaU_values_boot = {}
    deltaU_uncertainty_boot = {}

    # Get units:
    F_unit = (kB * unit.kelvin).unit  # units of free energy
    T_unit = temperature_list[0].unit
    S_unit = F_unit / T_unit
    U_unit = F_unit

    for i_boot in range(n_trial_boot):
        # Here we can potentially change the reference frame for each bootstrap trial.
        # This requires the array slicing to be done here, not above.
        ref_shift = np.random.randint(sample_spacing)

        # Replica energies and Q already have equilibration period removed:
        replica_energies = replica_energies_prod[:, :,
                                                 ref_shift::sample_spacing]
        Q = Q_all[ref_shift::sample_spacing, :]

        # Get all possible sample indices
        sample_indices_all = np.arange(0, len(replica_energies[0, 0, :]))
        # n_samples should match the size of the sliced replica energy dataset
        sample_indices = resample(sample_indices_all,
                                  replace=True,
                                  n_samples=len(sample_indices_all))

        n_states = len(Q[0, :])

        replica_energies_resample = np.zeros_like(replica_energies)
        # replica_energies is [n_states x n_states x n_frame]
        # Q is [nframes x n_states]
        Q_resample = np.zeros((len(sample_indices), n_states))

        # Select the sampled frames from Q and replica_energies:
        j = 0
        for i in sample_indices:
            replica_energies_resample[:, :, j] = replica_energies[:, :, i]
            Q_resample[j, :] = Q[i, :]
            j += 1

        # Run free energy expectation calculation:
        full_T_list, deltaF_values_boot[i_boot], deltaF_uncertainty_boot[
            i_boot] = expectations_free_energy(
                Q_resample,
                Q_folded,
                temperature_list,
                bootstrap_energies=replica_energies_resample,
                num_intermediate_states=num_intermediate_states,
            )

        # Get entropy/enthalpy for fitting current free energy data:
        # The inner dictionary keys will be transition names
        deltaS_values_boot[i_boot] = {}
        deltaU_values_boot[i_boot] = {}

        deltaS_values_boot[i_boot], deltaU_values_boot[
            i_boot] = get_entropy_enthalpy(deltaF_values_boot[i_boot],
                                           full_T_list)

    arr_deltaF_values_boot = {}
    arr_deltaS_values_boot = {}
    arr_deltaU_values_boot = {}

    # Loop over all conformational transitions:
    for key, value in deltaF_values_boot[0].items():
        arr_deltaF_values_boot[key] = np.zeros(
            (n_trial_boot, len(full_T_list)))
        arr_deltaS_values_boot[key] = np.zeros(
            (n_trial_boot, len(full_T_list)))
        arr_deltaU_values_boot[key] = np.zeros(
            (n_trial_boot, len(full_T_list)))

    # Compute mean values:
    # Free energy:
    for i_boot in range(n_trial_boot):
        for key, value in deltaF_values_boot[i_boot].items():
            arr_deltaF_values_boot[key][i_boot, :] = value.value_in_unit(
                F_unit)

    deltaF_values = {}
    for key, value in arr_deltaF_values_boot.items():
        deltaF_values[key] = np.mean(value, axis=0) * F_unit

    # Entropy:
    for i_boot in range(n_trial_boot):
        arr_deltaS_values_boot[key][
            i_boot, :] = deltaS_values_boot[i_boot][key].value_in_unit(S_unit)

    deltaS_values = {}

    for key, value in arr_deltaS_values_boot.items():
        deltaS_values[key] = np.mean(value, axis=0) * S_unit

    # Enthalpy:
    for i_boot in range(n_trial_boot):
        arr_deltaU_values_boot[key][
            i_boot, :] = deltaU_values_boot[i_boot][key].value_in_unit(U_unit)

    deltaU_values = {}

    for key, value in arr_deltaU_values_boot.items():
        deltaU_values[key] = np.mean(value, axis=0) * U_unit

    # Compute confidence intervals:
    deltaF_uncertainty = {}
    deltaS_uncertainty = {}
    deltaU_uncertainty = {}

    if conf_percent == 'sigma':
        # Use analytical standard deviation instead of percentile method:
        # Free energy:
        for key, value in arr_deltaF_values_boot.items():
            F_std = np.std(value, axis=0) * F_unit
            deltaF_uncertainty[key] = (-F_std, F_std)

        # Entropy:
        for key, value in arr_deltaS_values_boot.items():
            S_std = np.std(value, axis=0) * S_unit
            deltaS_uncertainty[key] = (-S_std, S_std)

        # Enthalpy:
        for key, value in arr_deltaU_values_boot.items():
            U_std = np.std(value, axis=0) * U_unit
            deltaU_uncertainty[key] = (-U_std, U_std)

    else:
        # Compute specified confidence interval:
        p_lo = (100 - conf_percent) / 2
        p_hi = 100 - p_lo

        # Free energy:
        for key, value in arr_deltaF_values_boot.items():
            F_diff = value - np.mean(value, axis=0)
            F_conf_lo = np.percentile(
                F_diff, p_lo, axis=0, interpolation='linear') * F_unit
            F_conf_hi = np.percentile(
                F_diff, p_hi, axis=0, interpolation='linear') * F_unit
            deltaF_uncertainty[key] = (F_conf_lo, F_conf_hi)

        # Entropy:
        for key, value in arr_deltaS_values_boot.items():
            S_diff = value - np.mean(value, axis=0)
            S_conf_lo = np.percentile(
                S_diff, p_lo, axis=0, interpolation='linear') * S_unit
            S_conf_hi = np.percentile(
                S_diff, p_hi, axis=0, interpolation='linear') * S_unit
            deltaS_uncertainty[key] = (S_conf_lo, S_conf_hi)

        # Enthalpy:
        for key, value in arr_deltaU_values_boot.items():
            U_diff = value - np.mean(value, axis=0)
            U_conf_lo = np.percentile(
                U_diff, p_lo, axis=0, interpolation='linear') * U_unit
            U_conf_hi = np.percentile(
                U_diff, p_hi, axis=0, interpolation='linear') * U_unit
            deltaU_uncertainty[key] = (U_conf_lo, U_conf_hi)

    # Plot results:

    # Free energy:
    plot_free_energy_results(full_T_list,
                             deltaF_values,
                             deltaF_uncertainty,
                             plotfile=f"{plotfile_dir}/free_energy_boot.pdf")

    # Entropy and enthalpy:
    plot_entropy_enthalpy(
        full_T_list,
        deltaS_values,
        deltaU_values,
        deltaS_uncertainty=deltaS_uncertainty,
        deltaU_uncertainty=deltaU_uncertainty,
        plotfile_entropy=f"{plotfile_dir}/entropy_boot.pdf",
        plotfile_enthalpy=f"{plotfile_dir}/enthalpy_boot.pdf")

    return full_T_list, deltaF_values, deltaF_uncertainty, deltaS_values, deltaS_uncertainty, deltaU_values, deltaU_uncertainty
Beispiel #6
0
def get_heat_capacity(frame_begin=0,
                      sample_spacing=1,
                      frame_end=-1,
                      output_data="output/output.nc",
                      num_intermediate_states=0,
                      frac_dT=0.05,
                      plot_file=None,
                      bootstrap_energies=None):
    """
    Given a .nc output and a number of intermediate states to insert for the temperature list, this function calculates and plots the heat capacity profile.
                             
    :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0)
    :type frame_begin: int
    
    :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData (default=1)
    :type sample_spacing: int
    
    :param frame_end: index of last frame to include in heat capacity calculation (default=-1)
    :type frame_end: int

    :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data (default = "output/output.nc")                                                                                          
    :type output_data: str    
    
    :param num_intermediate_states: The number of states to insert between existing states in 'temperature_list' (default=0)
    :type num_intermediate_states: int

    :param frac_dT: The fraction difference between temperatures points used to calculate finite difference derivatives (default=0.05)
    :type num_intermediate_states: float
    
    :param plotfile: path to filename to output plot
    :type plotfile: str
    
    :param bootstrap_energies: a custom replica_energies array to be used for bootstrapping calculations. Used instead of the energies in the .nc file.
    :type bootstrap_energies: 2d numpy array (float)

    :returns:
          - C_v ( List( float ) ) - The heat capacity values for all (including inserted intermediates) states
          - dC_v ( List( float ) ) - The uncertainty in the heat capacity values for intermediate states
          - new_temp_list ( List( float * unit.simtk.temperature ) ) - The temperature list corresponding to the heat capacity values in 'C_v'

    """

    if bootstrap_energies is not None:
        # Use a subsampled replica_energy matrix instead of reading from file
        replica_energies = bootstrap_energies
        # Still need to get the thermodynamic states
        reporter = MultiStateReporter(output_data, open_mode="r")
    else:
        # extract reduced energies and the state indices from the .nc
        reporter = MultiStateReporter(output_data, open_mode="r")
        analyzer = ReplicaExchangeAnalyzer(reporter)
        (
            replica_energies_all,
            unsampled_state_energies,
            neighborhoods,
            replica_state_indices,
        ) = analyzer.read_energies()

        # Select production frames to analyze
        if frame_end > 0:
            replica_energies = replica_energies_all[:, :, frame_begin:
                                                    frame_end:sample_spacing]
        else:
            replica_energies = replica_energies_all[:, :, frame_begin::
                                                    sample_spacing]

    # Get the temperature list from .nc file:
    states = reporter.read_thermodynamic_states()[0]

    temperature_list = []
    for s in states:
        temperature_list.append(s.temperature)

    # determine the numerical values of beta at each state in units consistent with the temperature
    Tunit = temperature_list[0].unit
    temps = np.array([temp.value_in_unit(Tunit) for temp in temperature_list
                      ])  # should this just be array to begin with
    beta_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole / Tunit) * temps)

    # convert the energies from replica/evaluated state/sample form to evaluated state/sample form
    replica_energies = pymbar.utils.kln_to_kn(replica_energies)

    n_samples = len(replica_energies[0, :])

    # calculate the number of states we need expectations at.  We want it at all of the original
    # temperatures, each intermediate temperature, and then temperatures +/- from the original
    # to take finite derivatives.

    # create  an array for the temperature and energy for each state, including the
    # finite different state.
    num_sampled_T = len(temps)
    n_unsampled_states = 3 * (num_sampled_T +
                              (num_sampled_T - 1) * num_intermediate_states)
    unsampled_state_energies = np.zeros([n_unsampled_states, n_samples])
    full_T_list = np.zeros(n_unsampled_states)

    # delta is the spacing between temperatures.
    delta = np.zeros(num_sampled_T - 1)

    # fill in a list of temperatures at all original temperatures and all intermediate states.
    full_T_list[0] = temps[0]
    t = 0
    for i in range(num_sampled_T - 1):
        delta[i] = (temps[i + 1] - temps[i]) / (num_intermediate_states + 1)
        for j in range(num_intermediate_states + 1):
            full_T_list[t] = temps[i] + delta[i] * j
            t += 1
    full_T_list[t] = temps[-1]
    n_T_vals = t + 1

    # add additional states for finite difference calculation and the requested spacing/
    full_T_list[n_T_vals] = full_T_list[0] - delta[0] * frac_dT
    full_T_list[2 * n_T_vals] = full_T_list[0] + delta[0] * frac_dT
    for i in range(1, n_T_vals - 1):
        ii = i // (num_intermediate_states + 1)
        full_T_list[i + n_T_vals] = full_T_list[i] - delta[ii] * frac_dT
        full_T_list[i + 2 * n_T_vals] = full_T_list[i] + delta[ii] * frac_dT
    full_T_list[2 * n_T_vals -
                1] = full_T_list[n_T_vals - 1] - delta[-1] * frac_dT
    full_T_list[3 * n_T_vals -
                1] = full_T_list[n_T_vals - 1] + delta[-1] * frac_dT

    # calculate betas of all of these temperatures
    beta_full_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole / Tunit) *
                       full_T_list)

    ti = 0
    N_k = np.zeros(n_unsampled_states)
    for k in range(n_unsampled_states):
        # Calculate the reduced energies at all temperatures, sampled and unsample.
        unsampled_state_energies[
            k, :] = replica_energies[0, :] * (beta_full_k[k] / beta_k[0])
        if ti < len(temps):
            # store in N_k which states do and don't have samples.
            if full_T_list[k] == temps[ti]:
                ti += 1
                N_k[k] = n_samples // len(
                    temps)  # these are the states that have samples

    # call MBAR to find weights at all states, sampled and unsampled
    mbarT = pymbar.MBAR(unsampled_state_energies,
                        N_k,
                        verbose=False,
                        relative_tolerance=1e-12)

    for k in range(n_unsampled_states):
        # get the 'unreduced' potential -- we can't take differences of reduced potentials
        # because the beta is different; math is much more confusing with derivatives of the reduced potentials.
        unsampled_state_energies[k, :] /= beta_full_k[k]

    # we don't actually need these expectations, but this code can be used to validate
    #results = mbarT.computeExpectations(unsampled_state_energies, state_dependent=True)
    #E_expect = results[0]
    #dE_expect = results[1]

    # expectations for the differences between states, which we need for numerical derivatives
    results = mbarT.computeExpectations(unsampled_state_energies,
                                        output="differences",
                                        state_dependent=True)
    DeltaE_expect = results[0]
    dDeltaE_expect = results[1]

    # Now calculate heat capacity (with uncertainties) using the finite difference approach.
    Cv = np.zeros(n_T_vals)
    dCv = np.zeros(n_T_vals)
    for k in range(n_T_vals):
        im = k + n_T_vals  # +/- delta up and down.
        ip = k + 2 * n_T_vals
        Cv[k] = (DeltaE_expect[im, ip]) / (full_T_list[ip] - full_T_list[im])
        dCv[k] = (dDeltaE_expect[im, ip]) / (full_T_list[ip] - full_T_list[im])

    # add units so the plot has the right units.
    Cv *= unit.kilojoule_per_mole / Tunit  # always kJ/mol, since the OpenMM output is in kJ/mol.
    dCv *= unit.kilojoule_per_mole / Tunit
    full_T_list *= Tunit

    # plot and return the heat capacity (with units)
    if plot_file is not None:
        plot_heat_capacity(Cv,
                           dCv,
                           full_T_list[0:n_T_vals],
                           file_name=plot_file)
    return (Cv, dCv, full_T_list[0:n_T_vals])
Beispiel #7
0
def physical_validation_ensemble(output_data="output.nc",
                                 output_directory="ouput",
                                 plotfile='ensemble_check',
                                 pairs='single',
                                 ref_state_index=0):
    """
    Run ensemble physical validation test for 2 states in replica exchange simulation

    :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data
    :type output_data: str
    
    :param plotfile: Filename for outputting ensemble check plot
    :type plotfile: str
    
    :param pairs: Option for running ensemble validation on all replica pair combinations ('all'), adjacent pairs ('adjacent'), or single pair with optimal spacing ('single')
    
    :param ref_state_index: Index in temperature_list to use as one of the states in the ensemble check. The other state will be chosen based on the energy standard deviation at the reference state. Ignored if pairs='all'
    :type ref_state_index: int
    
    """

    # Get temperature list and read the energies for individual temperature replicas
    reporter = MultiStateReporter(output_data, open_mode="r")
    analyzer = ReplicaExchangeAnalyzer(reporter)

    states = reporter.read_thermodynamic_states()[0]
    temperature_list = []
    for s in states:
        temperature_list.append(s.temperature)

    (
        replica_energies,
        unsampled_state_energies,
        neighborhoods,
        replica_state_indices,
    ) = analyzer.read_energies()

    n_particles = np.shape(
        reporter.read_sampler_states(iteration=0)[0].positions)[0]
    T_unit = temperature_list[0].unit
    temps = np.array([temp.value_in_unit(T_unit) for temp in temperature_list])
    beta_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole / T_unit) * temps)
    n_replicas = len(temperature_list)
    for k in range(n_replicas):
        replica_energies[:, k, :] *= beta_k[k]**(-1)

    total_steps = len(replica_energies[0][0])
    state_energies = np.zeros([n_replicas, total_steps])

    for step in range(total_steps):
        for state in range(n_replicas):
            state_energies[state, step] = replica_energies[np.where(
                replica_state_indices[:, step] == state)[0], 0, step]

    state_energies *= unit.kilojoule_per_mole

    T_array = np.zeros(len(temperature_list))
    for i in range(len(temperature_list)):
        T_array[i] = temperature_list[i].value_in_unit(T_unit)

    if pairs.lower() != 'single' and pairs.lower(
    ) != 'adjacent' and pairs.lower() != 'all':
        print(
            f"Error: Pair option '{pairs}' not recognized, using default option 'single'"
        )
        pairs = 'single'

    if pairs.lower() == 'single':
        # Run ensemble validation on one optimally spaced temperature pair
        quantiles = {}

        # Find optimal state pair for ensemble check:
        # Compute standard deviations of each energy distribution:
        state_energies_std = np.std(state_energies, axis=1)

        # Select reference state point
        T_ref = temperature_list[ref_state_index]
        std_ref = state_energies_std[ref_state_index]

        # Compute optimal spacing:
        deltaT = 2 * kB * T_ref**2 / std_ref
        #print("DeltaT: %r" %deltaT)

        # Find closest match
        T_diff = np.abs(T_ref.value_in_unit(T_unit) - T_array)

        T_opt_index = np.argmin(np.abs(deltaT.value_in_unit(T_unit) - T_diff))
        T_opt = temperature_list[T_opt_index]

        # Set SimulationData for physical validation
        state1_index = ref_state_index
        state2_index = T_opt_index

        sim_data1, sim_data2 = set_simulation_data(state_energies, T_array,
                                                   state1_index, state2_index)

        # Run physical validation
        try:
            quantiles_ij = pv.ensemble.check(sim_data1,
                                             sim_data2,
                                             total_energy=False,
                                             filename=plotfile)

            quantiles[
                f"state{state1_index}_state{state2_index}"] = quantiles_ij[0]

        except InputError:
            print(
                f"Insufficient overlap between trajectories for states {state1_index} and {state2_index}. Skipping..."
            )

    elif pairs.lower() == 'adjacent':
        # Run ensemble validation on all adjacent temperature pairs
        quantiles = {}

        for i in range(len(temperature_list) - 1):
            # Set SimulationData for physical validation
            state1_index = i
            state2_index = i + 1

            sim_data1, sim_data2 = set_simulation_data(state_energies, T_array,
                                                       state1_index,
                                                       state2_index)

            # Run physical validation
            try:
                quantiles_ij = pv.ensemble.check(
                    sim_data1,
                    sim_data2,
                    total_energy=False,
                    filename=f"{plotfile}_{state1_index}_{state2_index}")

                quantiles[
                    f"state{state1_index}_state{state2_index}"] = quantiles_ij[
                        0]

            except InputError:
                print(
                    f"Insufficient overlap between trajectories for states {state1_index} and {state2_index}. Skipping..."
                )

    elif pairs.lower() == 'all':
        # Run ensemble validation on all combinations of temperature pairs
        quantiles = {}
        for i in range(len(temperature_list)):
            for j in range(i + 1, len(temperature_list)):
                # Set SimulationData for physical validation
                state1_index = i
                state2_index = j

                sim_data1, sim_data2 = set_simulation_data(
                    state_energies, T_array, state1_index, state2_index)

                # Run physical validation
                try:
                    quantiles_ij = pv.ensemble.check(
                        sim_data1,
                        sim_data2,
                        total_energy=False,
                        filename=f"{plotfile}_{state1_index}_{state2_index}")

                    quantiles[
                        f"state{state1_index}_state{state2_index}"] = quantiles_ij[
                            0]

                except InputError:
                    print(
                        f"Insufficient overlap between trajectories for states {state1_index} and {state2_index}. Skipping..."
                    )

    return quantiles