def superpose_to_most_compact_in_list(superpose_info, geom_list):
    r"""
    Provided a list of :obj:`mdtraj.Trajectory` objects, orient them to the most
    compact structure found, according to :obj:`superpose_info`

    Parameters
    ----------
    superpose_info : boolean, str, or iterable of integers
        boolean : True orients with all atoms, False does nothing
        str : superpose according to anything :obj:`mdtraj.Topology.select`
            can understand (http://mdtraj.org/latest/atom_selection.html)
        iterable of integers : superpose according to these atom idxs

    geom_list : list of :obj:`mdtraj.Trajectory` objects

    Returns
    -------
    geom_list : list of :obj:`mdtraj.Trajectory` objects
    """
    # Superpose if wanted
    sel = parse_atom_sel(superpose_info, geom_list[0].top)
    if sel is not None:
        ref = geom_list_2_geom(geom_list)
        ref = ref[_md.compute_rg(ref).argmin()]
        geom_list = [igeom.superpose(ref, atom_indices=sel) for igeom in geom_list]
    return geom_list
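A minimal usage sketch for the helper above. The file names are placeholders, and `parse_atom_sel` and `geom_list_2_geom` are assumed to be helpers importable from the same module:

import mdtraj as md

# Hypothetical inputs: two runs sharing one topology (placeholder file names)
geom_list = [md.load("run1.xtc", top="top.pdb"),
             md.load("run2.xtc", top="top.pdb")]
# Superpose everything onto the most compact frame, using the C-alpha atoms
geom_list = superpose_to_most_compact_in_list("name CA", geom_list)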
def computeRG(self, reference=None, masses=None):
    assert self.traj is not None
    if reference is None:
        reference = self.traj
    return md.compute_rg(reference, masses=masses)
def get_rg_for_run(name, ply_idxs, pdb, use_cent, recalc):
    topfile, trajnames = get_trajnames(name, use_cent)
    rg_for_run = []
    for j in range(len(trajnames)):
        idx = j + 1
        if use_cent:
            tname = name + "_traj_cent_" + str(idx) + ".dcd"
        else:
            tname = name + "_traj_" + str(idx) + ".dcd"
        rg_name = "rg_{}.npy".format(idx)

        if not os.path.exists(rg_name) or recalc:
            if not os.path.exists(tname):
                raise IOError(tname + " does not exist!")
            last_change = np.abs(os.path.getmtime(tname) - time.time()) / 60.
            if last_change > 5:
                # only calculate if traj has NOT been modified in the last five
                # minutes; this is meant to skip trajectories that are still running.
                Rg = []
                for chunk in md.iterload(tname, top=pdb, atom_indices=ply_idxs):
                    rg = md.compute_rg(chunk)
                    Rg.append(rg)
                Rg = np.concatenate(Rg)
                print("    " + rg_name)
                np.save(rg_name, Rg)
            else:
                Rg = None
        else:
            Rg = np.load(rg_name)

        if Rg is not None:
            rg_for_run.append(Rg)
    return rg_for_run
def test_principal_moments(traj4):
    rg_actual = md.compute_rg(traj4)
    principal_moments = shape.principal_moments(traj4)
    # Rg^2 equals the trace of the gyration tensor, i.e. the sum of its
    # principal moments.
    rg_computed = np.sqrt(principal_moments.sum(axis=1))
    assert eq(rg_actual, rg_computed)
def compute_radius_of_gyration(self):
    """
    Compute the radius of gyration for every frame.

    :return: Rg for every frame (ndarray)
    """
    return md.compute_rg(self.traj)
def test_asphericity(traj4):
    b_computed = shape.asphericity(traj4)
    pm = shape.principal_moments(traj4)
    rg = md.compute_rg(traj4)
    # Asphericity: b = lambda_3 - (lambda_1 + lambda_2)/2 = 1.5*lambda_3 - Rg^2/2
    b_actual = 1.5 * pm[:, 2] - rg**2 / 2.0
    assert eq(b_actual, b_computed)
def test_shape_metrics(traj4):
    b = shape.asphericity(traj4)
    c = shape.acylindricity(traj4)
    rg = md.compute_rg(traj4)
    # Relative shape anisotropy: kappa^2 = (b^2 + 0.75*c^2) / Rg^4.
    # Note: `relative_shape_antisotropy` is the actual (historically
    # misspelled) mdtraj identifier, so it is kept as-is.
    kappa_actual = (b**2 + 0.75 * c**2) / (rg**4)
    kappa_computed = shape.relative_shape_antisotropy(traj4)
    assert eq(kappa_actual, kappa_computed)
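A minimal NumPy sketch of the identities the three tests above assert, assuming equal (unit) atomic masses; `xyz` is a single frame of shape (n_atoms, 3):

import numpy as np

def gyration_descriptors(xyz):
    # Gyration tensor of one frame (unit masses assumed)
    centered = xyz - xyz.mean(axis=0)
    S = centered.T @ centered / len(xyz)            # 3x3 symmetric
    l1, l2, l3 = np.sort(np.linalg.eigvalsh(S))     # principal moments, ascending
    rg2 = l1 + l2 + l3                              # Rg^2 = trace(S)
    b = l3 - 0.5 * (l1 + l2)                        # asphericity
    c = l2 - l1                                     # acylindricity
    kappa2 = (b**2 + 0.75 * c**2) / rg2**2          # relative shape anisotropy
    return np.sqrt(rg2), b, c, kappa2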
def main():
    traj = md.load(sys.argv[1])
    rg = md.compute_rg(traj) * 10  # unit: nm --> A
    outname = os.path.splitext(os.path.basename(sys.argv[1]))[0] + '.Rg.dat'
    # the original used the Python 2 `print >> f` statement; f.write keeps
    # the same output under Python 3
    with open(outname, 'w') as f:
        for val in rg:
            f.write('%8.3f\n' % val)
def compute_Rgyr(trajectory):
    '''Compute the radius of gyration'''
    # compute radius of gyration for each frame
    rgyr = mdtraj.compute_rg(trajectory)
    for i in range(rgyr.size):
        print("Rgyr [frame: {0}] = {1} nm".format(i, rgyr[i]))
    return rgyr
def cal_rg_PDB(pdbfile: str, selection='all'):
    """Calculate the radius of gyration for a single PDB file.

    Args:
        pdbfile (str): Input PDB file.
        selection (str, optional): Selection group used to calculate the Rg
            value. Defaults to 'all'.
    """
    PDB = md.load(pdbfile)
    AtomIndex = PDB.top.select(selection)
    selGroup = PDB.atom_slice(AtomIndex)
    rg = md.compute_rg(selGroup)
    # compute_rg returns one value per frame; a single-model PDB has one frame
    print("%s: %.4f nm" % (pdbfile, rg[0]))
def compute_flory(struc, nres):
    """
    Mean Rg of all contiguous subchains of length n = 5..24 residues
    (an internal-scaling profile, usable to estimate a Flory exponent).

    The coordinates need to be centered EACH TIME Rg is computed, so you
    can't just keep adding to the gyration tensor and recomputing its
    eigenvalues without re-centering everything.
    """
    N = range(5, 25)
    rg = np.zeros(len(N))
    count = np.zeros(len(N))
    for n in N:
        for r in range(nres - n):
            sel = struc.atom_slice(
                struc.topology.select('resid ' + str(r) + ' to ' + str(r + n - 1)))
            rg[n - 5] += md.compute_rg(sel)[0]
            count[n - 5] += 1
    rg = [rg[i] / count[i] for i in range(len(rg))]
    return rg
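A follow-up sketch (not part of the original code): estimate the Flory exponent nu from the mean subchain Rg values returned above, via a log-log linear fit of Rg(n) ~ n**nu. `struc` and `nres` are assumed to be the same inputs compute_flory takes:

import numpy as np

n_vals = np.arange(5, 25)                       # subchain lengths used above
mean_rg = np.asarray(compute_flory(struc, nres))
# polyfit of degree 1 returns [slope, intercept]; the slope is nu
nu, log_prefactor = np.polyfit(np.log(n_vals), np.log(mean_rg), 1)
print("Flory exponent estimate: %.3f" % nu)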
def cal_rg_traj(topologyfile: str,
                trajfile: str,
                selection='all',
                outfile=None) -> np.array:
    """Calculate the Rg value for a single trajectory file.

    Args:
        topologyfile (str): topology file
        trajfile (str): trajectory file
        selection (str, optional): atom selection. Defaults to 'all'.
        outfile (str, optional): output file in which to save the Rg values.

    Returns:
        np.array: Rg value for each frame (nm)
    """
    traj = md.load(trajfile, top=topologyfile)
    AtomIndex = traj.top.select(selection)
    rgs = md.compute_rg(traj.atom_slice(AtomIndex))
    if outfile:
        np.savetxt(outfile, rgs, fmt='%.4f')
    return rgs
def extract_rgyr(self, mdtraj_obj, cmpd_name=None):
    """
    Returns a dictionary containing the mean, median and standard deviation
    of the radius of gyration (Rgyr). It uses the compute_rg function of
    MDTraj. If cmpd_name is specified, it is returned in the dictionary.

    Parameters:
    ----------
    mdtraj_obj: MDTraj trajectory object
        Trajectory of the solute. The trajectory should be read in using the
        MDTraj functions.
        Example: solute_traj = md.load(traj_file, top=pdb_file, atom_indices=solute_atoms)
    cmpd_name: str, optional
        Name of the compound. If specified, it is returned in the output
        dictionary. (Default = None)

    Returns
    ----------
    dict_rgyr: dict
        Dictionary containing mean, standard deviation, and median of the
        Rgyr calculated over the simulation trajectory. If cmpd_name is
        specified, it is returned in the dictionary.
    """
    df = list(
        md.compute_rg(mdtraj_obj,
                      masses=np.array(
                          [a.element.mass for a in mdtraj_obj.topology.atoms])))
    stats = list(self.get_stats(df))
    dict_rgyr = {
        'wat_rgyr_av': stats[0],
        'wat_rgyr_std': stats[1],
        'wat_rgyr_med': stats[2]
    }
    if cmpd_name is None:
        return dict_rgyr
    dict_rgyr.update({"cmpd_name": cmpd_name})
    return dict_rgyr
def compute_mdtraj_order_parmeters(trajectory_file, rmsd_reference_structure=None):
    # documentation: http://mdtraj.org/1.8.0/analysis.html#
    trajectory = md.load(trajectory_file)

    return_values = []
    return_value_names = []

    if rmsd_reference_structure is not None:
        reference = md.load(rmsd_reference_structure)
        rmsd = md.rmsd(trajectory, reference)
        return_values.append(rmsd)
        return_value_names.append("RMSD")

    hydrogen_bonds = np.array([np.sum(x) for x in md.kabsch_sander(trajectory)])
    return_values.append(hydrogen_bonds)
    return_value_names.append("HBondEnergy")

    ss = md.compute_dssp(trajectory)
    shape = ss.shape
    transdict = dict(
        zip(list(set(list(ss.flatten()))),
            range(len(list(set(list(ss.flatten())))))))
    ss = np.array([transdict[x] for x in ss.flatten()]).reshape(shape).T
    return_values.append(ss)
    return_value_names.append("SecondaryStructure")

    rg = md.compute_rg(trajectory)
    return_values.append(rg)
    return_value_names.append("Rg")

    distances, residue_pairs = md.compute_contacts(trajectory, scheme='ca')
    contacts = md.geometry.squareform(distances, residue_pairs)
    return_values.append(contacts)
    return_value_names.append("Contacts")

    return dict(zip(return_value_names, return_values))
traj[int(n_frames / 10.0):].save_xtc(PathOut + "ex_md.xtc")
replica = PathOut + "ex_md.xtc"
traj = md.load(replica, top=struct)
n_frames = traj.n_frames
n_atoms = traj.n_atoms
# topology must be taken from the reloaded trajectory before selecting atoms
topology = traj.topology

# backbone atom indices
bb = topology.select('name N or name CA or name C')

# Fit every structure's backbone to the first frame
traj.superpose(reference=traj[0], frame=0, atom_indices=bb)

# Compute radius of gyration
gyrateArray = md.compute_rg(traj)

# Compute average structure if there is no reference structure for the RMSD
if args.stru4RMSD is None:
    # average structure obtained over the ex_md trajectory
    cmd = gmx + " covar -f " + replica + " -s " + tpr + " -av " + PathOut + "average.gro"
    Popen("echo \"4 0\" | " + cmd, shell=True).wait()
    ref_struct = md.load(PathOut + "average.gro")
    rms = md.rmsd(traj, ref_struct, atom_indices=bb) * 10
else:
    ref_struct = md.load(args.stru4RMSD)
    topology2 = ref_struct.topology
    bb2 = topology2.select('name N or name CA or name C')
    print("Superpose atoms:\n{0}\nwith\n{1}\n".format(bb, bb2))
    rms = md.rmsd(traj, ref_struct, atom_indices=bb) * 10

print(
    "There is {0:.2f} % of conformation with a RMSD below 2.0 A".format(
#!/usr/bin/env python
from __future__ import print_function
import sys

import numpy as np
import mdtraj as md

infile = sys.argv[1]
trj = md.load(infile)
masses = np.array([atom.element.mass for atom in trj.top.atoms])
rg = md.compute_rg(trj, masses=masses)
print("{:.2f} nm".format(rg[0]))
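The script above passes atomic masses explicitly because, to my knowledge, md.compute_rg weights all atoms equally when masses is None, so the two results differ slightly for heteroatomic systems. A quick comparison sketch, continuing the script above:

rg_unweighted = md.compute_rg(trj)                    # equal per-atom weights
rg_mass_weighted = md.compute_rg(trj, masses=masses)  # true mass-weighted Rg
print("unweighted: {:.4f} nm, mass-weighted: {:.4f} nm".format(
    rg_unweighted[0], rg_mass_weighted[0]))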
def calc_rg(t):
    traj = md.load(t)
    return md.compute_rg(traj)
def compute_radius_of_gyration(self):
    # factor of 10 converts mdtraj's nm to Angstrom
    return 10 * md.compute_rg(self._trajectory)
"""
Created on Fri Jul 29 16:10:31 2016

@author: hliu
"""
import glob
import os

import mdtraj as md
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib.ticker import FuncFormatter
from researchcode.plotting.plot_set import *

struct_funct = {'ss': lambda x: md.compute_dssp(x),
                'rg': lambda x: md.compute_rg(x),
                'heli': lambda x: calSSPercent(x, 'H'),
                'beta': lambda x: calSSPercent(x, 'E'),
                #'rmsd': lambda x: rmsds[x.name]
                }


def addProperty2Traj(traj, props):
    for key in props:
        if not hasattr(traj, key):
            setattr(traj, key, props[key](traj))


def getTraj(trajDir, trajNameType, topFile):
# Example of programming "to a concrete" or "to a specific".
# This is the "bad" way of doing things...
import mdtraj as md
import MDAnalysis as mda
import numpy as np
import sys

toolkit = sys.argv[1]

if toolkit == 'mdtraj':
    # mdtraj style
    trajectory = md.load_pdb('protein.pdb')
    print(10 * md.compute_center_of_mass(trajectory))  # factor of 10 converts nm to Å
    print(10 * md.compute_rg(trajectory))
elif toolkit == 'mdanalysis':
    # MDAnalysis style
    universe = mda.Universe('protein.pdb')
    mass_by_frame = np.ndarray(shape=(len(universe.trajectory), 3))
    for ts in universe.trajectory:
        mass_by_frame[ts.frame] = universe.atoms.center_of_mass(compound='segments')
    print(mass_by_frame)
    rg_by_frame = np.ndarray(shape=(len(universe.trajectory)))
    for ts in universe.trajectory:
        rg_by_frame[ts.frame] = universe.atoms.radius_of_gyration(compound='segments')
    print(rg_by_frame)
else:
    raise AttributeError
import mdtraj as md
import MDAnalysis as mda
import numpy as np
import sys
from abc import ABC, abstractmethod  # was "abstractmethodC", a typo

toolkit = sys.argv[1]

if toolkit == "MDTraj":
    print("MDTraj")
    trajectory = md.load_pdb("protein.pdb")
    print(md.compute_center_of_mass(trajectory) * 10)
    print(md.compute_rg(trajectory) * 10)
elif toolkit == "MDAnalysis":
    print("MDAnalysis")
    universe = mda.Universe("protein.pdb")
    mass_by_frame = np.ndarray(shape=(len(universe.trajectory), 3))
    rg_by_frame = np.empty(len(universe.trajectory))
    for ts in universe.trajectory:
        mass_by_frame[ts.frame] = universe.atoms.center_of_mass(compound="segments")
        rg_by_frame[ts.frame] = universe.atoms.radius_of_gyration()
    print(mass_by_frame)
    print(rg_by_frame)
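The snippet above imports ABC and abstractmethod but never uses them. A minimal sketch of the interface-based ("program to an abstraction") version that import hints at; all class and method names here are hypothetical:

from abc import ABC, abstractmethod
import mdtraj as md
import MDAnalysis as mda
import numpy as np


class TrajectoryAnalyzer(ABC):
    @abstractmethod
    def radius_of_gyration(self):
        """Return per-frame Rg in Angstrom."""


class MDTrajAnalyzer(TrajectoryAnalyzer):
    def __init__(self, pdb_path):
        self.trajectory = md.load_pdb(pdb_path)

    def radius_of_gyration(self):
        return 10 * md.compute_rg(self.trajectory)  # nm -> Angstrom


class MDAnalysisAnalyzer(TrajectoryAnalyzer):
    def __init__(self, pdb_path):
        self.universe = mda.Universe(pdb_path)

    def radius_of_gyration(self):
        return np.array([self.universe.atoms.radius_of_gyration()
                         for _ts in self.universe.trajectory])


# Client code depends only on the abstract interface:
# analyzer = MDTrajAnalyzer("protein.pdb")
# print(analyzer.radius_of_gyration())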
def analyze_trajectory(traj_path, do_sasa, do_sasa_vmd, do_rgyr, do_rgyr_vmd,
                       vmd_selection, do_rmsf, report_pattern, report_dir,
                       disp_logfile):
    trajectory = None
    data = None

    if do_sasa:
        print("Calculating SASA ...")
        if trajectory is None:
            trajectory = mdtraj.load(traj_path)
        # Shrake, A; Rupley, JA. (1973) J Mol Biol 79 (2): 351--71.
        sasa = mdtraj.shrake_rupley(trajectory, mode='residue').sum(axis=1)
        numpy.savetxt(traj_path + '.sasa', sasa)

    if do_sasa_vmd and do_rgyr_vmd:
        calculate_sasa_and_rgyr_with_vmd(
            traj_path,
            '%s.%s.sasa' % (traj_path, vmd_selection.replace(" ", "_")),
            '%s.%s.rgyr' % (traj_path, vmd_selection.replace(" ", "_")),
            vmd_selection)
    if do_sasa_vmd and not do_rgyr_vmd:
        calculate_sasa_with_vmd(
            traj_path,
            '%s.%s.sasa' % (traj_path, vmd_selection.replace(" ", "_")),
            vmd_selection)
    if do_rgyr_vmd and not do_sasa_vmd:
        calculate_rgyr_with_vmd(
            traj_path,
            '%s.%s.rgyr' % (traj_path, vmd_selection.replace(" ", "_")),
            vmd_selection)

    if do_rgyr:
        print("Calculating Radius of Gyration ...")
        if trajectory is None:
            trajectory = mdtraj.load(traj_path)
        rgyr = mdtraj.compute_rg(trajectory)
        numpy.savetxt(traj_path + '.rgyr', rgyr)

    if trajectory is not None:
        del trajectory

    if do_rmsf:
        print("Calculating RMSF ...")
        # data, _ = load_all_pdbs_ca([{
        #     "source": traj_path,
        #     "base_selection": "name CA"
        # }])
        # rmsf_array = rmsf(data.structure_ensemble)
        import prody
        print("Loading structure...")
        pdb = prody.proteins.pdbfile.parsePDB(traj_path, subset='ca')
        print("Calculating ...")
        rmsf_array = ca_rmsf(pdb)
        numpy.savetxt(traj_path + '.rmsf', rmsf_array)

    if data is not None:
        del data

    if report_pattern != "":
        print("Extracting acceptance and energies from report files "
              "with pattern %s inside %s ..." % (report_pattern, report_dir))
        files = glob.glob(os.path.join(report_dir, report_pattern))
        assert len(files) != 0, \
            "No report file with pattern %s found inside %s" % (report_pattern,
                                                                report_dir)
        all_accepted = []
        all_total = []
        all_energies = []
        for report_file in files:
            total, accepted, energies = process_report_file(report_file)
            all_total.append(total)
            all_accepted.append(accepted)
            all_energies.append(list(energies))
        total = numpy.sum(all_total)
        accepted = numpy.sum(all_accepted)
        acceptance = accepted / float(total)  # avoid integer division
        numpy.savetxt(traj_path + '.acc', [acceptance], fmt="%.4f ")
        energy_handler = open(traj_path + '.ener', "w")
        for i in range(len(all_energies)):
            for j in range(len(all_energies[i])):
                energy_handler.write("%f\n" % all_energies[i][j])
            energy_handler.write("###\n")
        energy_handler.close()

    if disp_logfile != "":
        handler = open(disp_logfile)
        fractions = []
        for line in handler:
            if line[0:4] == "DBG:" and "iterations performed" in line:
                parts = line.split()
                fractions.append(float(parts[1]) / float(parts[3]))
        numpy.savetxt(traj_path + '.frac', [numpy.mean(fractions)], fmt="%.4f ")
        handler.close()
def run(self):
    time_start = time.time()
    print("start")
    parser = self.create_arg_parser()
    args = parser.parse_args()
    #parser = argparse.ArgumentParser()
    #parser.add_argument('--Kconfig', help='link to Kernel configurations file')
    #parser.add_argument('--port', dest="port", help='port for RabbitMQ server', default=5672, type=int)
    #args = parser.parse_args()
    Kconfig = imp.load_source('Kconfig', args.Kconfig)
    pdb_file = glob.glob(args.path + '/iter*_input*.pdb')[0]
    #pdb_file = glob.glob('iter*_input*.pdb')[0]
    #traj_files = glob.glob(args.path + '/iter*_traj*.dcd')

    p_cont = True
    p_iter = 0
    traj_files = []
    traj_files_npy = []
    iter_arr = []
    while p_cont:
        traj_files_tmp = glob.glob(args.path + '/iter' + str(p_iter) + '_traj*.dcd')
        traj_files_npy_tmp = glob.glob(args.path + '/iter' + str(p_iter) + '_traj*.npy')
        traj_files.sort()
        if len(traj_files_tmp) == 0:
            p_cont = False
        else:
            print("iter", str(p_iter), " # files", str(len(traj_files_tmp)))
            traj_files = traj_files + traj_files_tmp
            traj_files_npy = traj_files_npy + traj_files_npy_tmp
            iter_arr = iter_arr + [p_iter] * len(traj_files_tmp)
            p_iter = p_iter + 1
    p_iter_max = p_iter - 1
    iter_arr = np.array(iter_arr)
    #traj_files = glob.glob('iter*_traj*.dcd')
    traj_files.sort()

    get_out_arr = []
    for i, file in enumerate(traj_files_npy):
        get_out_arr = get_out_arr + [np.load(file)]

    #topfile = md.load(pdb_file)
    #featurizer = pyemma.coordinates.featurizer(topfile)
    #featurizer.add_residue_mindist(residue_pairs='all', scheme='closest-heavy')
    #featurizer.add_backbone_torsions(cossin=True)
    #featurizer.dimension()
    #inp = pyemma.coordinates.source(traj_files, featurizer)
    #inp.get_output()
    #print("n atoms", topfile.n_atoms)
    #print("n frames total", inp.n_frames_total())
    #print("n trajs", inp.number_of_trajectories())
    #print(" traj lengths", inp.trajectory_lengths())
    #print(" input dimension", inp.dimension())

    tica_lag = Kconfig.tica_lag  # 1
    tica_dim = Kconfig.tica_dim
    tica_stride = Kconfig.tica_stride
    if Kconfig.koopman == 'yes':
        try:
            tica_obj = pyemma.coordinates.tica(get_out_arr, lag=tica_lag,
                                               dim=tica_dim, kinetic_map=True,
                                               stride=tica_stride, weights='koopman')
            print("koopman works")
        except:
            tica_obj = pyemma.coordinates.tica(get_out_arr, lag=tica_lag,
                                               dim=tica_dim, kinetic_map=True,
                                               stride=tica_stride, weights='empirical')
            print("koopman failed, using normal tica")
    else:
        tica_obj = pyemma.coordinates.tica(get_out_arr, lag=tica_lag,
                                           dim=tica_dim, kinetic_map=True,
                                           stride=tica_stride, weights='empirical')
    # tica_weights='empirical', tica_weights='koopman'
    #tica_obj = pyemma.coordinates.tica(inp, lag=tica_lag, dim=tica_dim, kinetic_map=True, stride=tica_stride, weights=tica_weights)
    print("TICA eigenvalues", tica_obj.eigenvalues)
    print("TICA timescales", tica_obj.timescales)
    y = tica_obj.get_output(stride=tica_stride)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_tica_y.npy', y)
    #y[0].shape
    print('time tica finished', str(time.time() - time_start))

    msm_states = Kconfig.msm_states
    msm_stride = Kconfig.msm_stride
    msm_lag = Kconfig.msm_lag
    cl = pyemma.coordinates.cluster_kmeans(data=y, k=msm_states, max_iter=10,
                                           stride=msm_stride)
    #np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_tica_cl.npy', cl)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_tica_dtrajs.npy', cl.dtrajs)
    #cl = pyemma.coordinates.cluster_mini_batch_kmeans(data=y, k=msm_states, max_iter=10, n_jobs=None)
    print('time kmeans finished', str(time.time() - time_start))
    m = pyemma.msm.estimate_markov_model(cl.dtrajs, msm_lag)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_tica_m.npy', m)
    print('time msm finished', str(time.time() - time_start))
    ########################################

    #print(tica_obj.eigenvectors)
    print("MSM eigenvalues", m.eigenvalues(10))
    #print(m.eigenvectors_left(10))
    #print(m.eigenvectors_right(10))
    print("MSM P connected", m.P)  # only connected
    #print("MSM clustercenters", cl.clustercenters)
    print("MSM timescales", m.timescales(10))
    #print("MSM stat", m.stationary_distribution)
    print("MSM active set", m.active_set)
    print('fraction of states used = ', m.active_state_fraction)
    print('fraction of counts used = ', m.active_count_fraction)

    c = m.count_matrix_full
    s = np.sum(c, axis=1)
    print("count matrix sums", s)
    if 0 not in s:
        q = 1.0 / s
    n_states = c.shape[0]
    dtrajs = [t for t in cl.dtrajs]
    #print("msm dtrajs", dtrajs)

    # get frame_list for each msm state
    frame_state_list = {n: [] for n in range(n_states)}
    for nn, dt in enumerate(dtrajs):
        for mm, state in enumerate(dt):
            frame_state_list[state].append((nn, mm))
    for k in range(n_states):
        if len(frame_state_list[k]) == 0:
            print('removing state ' + str(k) + ' no frames')
            q[k] = 0.0
    # and normalize the remaining ones
    q /= np.sum(q)

    n_pick = int(args.n_select)  # 100
    if Kconfig.strategy == 'cmicro':
        state_picks = np.random.choice(np.arange(len(q)), size=n_pick, p=q)
    elif Kconfig.strategy == 'cmacro':
        num_eigenvecs_to_compute = 10
        microstate_transitions_used = c
        #cache['too_small'] = 'False'
        num_visited_microstates = c.shape[0]
        states_unique = np.arange(num_visited_microstates)
        visited_microstates = states_unique
        largest_visited_set = msmtools.estimation.largest_connected_set(microstate_transitions_used)
        C_largest0 = microstate_transitions_used[largest_visited_set, :][:, largest_visited_set]
        rowsum = np.ravel(C_largest0.sum(axis=1))
        largest_visited_set2 = largest_visited_set[rowsum > 0]
        C_largest = microstate_transitions_used[largest_visited_set2, :][:, largest_visited_set2]
        rowsum = C_largest.sum(axis=1)
        #print("C_largest", C_largest.shape[0])
        if C_largest.shape[0] > 10:
            if np.min(rowsum) == 0.0:
                print("failed because rowsum", rowsum, C_largest)  # was "rowsoum", a typo
                cache['small'] = 'True'
                #raise ValueError("matrix C contains rows with sum zero.")
            #try:
            #print("try")
            T_largest = msmtools.estimation.transition_matrix(C_largest, reversible=True)
            #print(T_largest.shape)
            states_largest = largest_visited_set2
            print("largest_connected_set", states_largest.shape[0])
            #print(states_largest, states_unique)
            MSM_largest = pyemma.msm.markov_model(T_largest)
            current_eigenvecs = MSM_largest.eigenvectors_right(num_eigenvecs_to_compute)
            current_timescales = np.real(MSM_largest.timescales())
            current_eigenvals = np.real(MSM_largest.eigenvalues())
            not_connect = np.where(np.in1d(states_unique, states_largest, invert=True))[0]
            all_connect = np.where(np.in1d(states_unique, states_largest))[0]
            print("worked timescales", current_timescales[:10])
            print("not_connected states", not_connect)
            projected_microstate_coords_scaled = sklearn.preprocessing.MinMaxScaler(
                feature_range=(-1, 1)).fit_transform(current_eigenvecs[:, 1:])
            projected_microstate_coords_scaled *= np.sqrt(
                current_timescales[:num_eigenvecs_to_compute - 1] /
                current_timescales[0]).reshape(1, num_eigenvecs_to_compute - 1)

            select_n_macro_type = Kconfig.select_n_macro_type  # 'kin_content'
            if select_n_macro_type == 'const':  # 1_over_cmacro_estim
                par_num_macrostates = int(Kconfig.num_macrostates)  # 30
                num_macrostates = min(par_num_macrostates, num_visited_microstates)
            elif select_n_macro_type == 'kin_var':  # 1_over_cmacro_estim3
                frac_kin_var = 0.5
                kin_var = np.cumsum(current_eigenvals**2)
                cut = kin_var[kin_var < kin_var.max() * frac_kin_var]
                num_macrostates = min(max(cut.shape[0], 1), num_visited_microstates)
            elif select_n_macro_type == 'kin_content':  # 1_over_cmacro_estim4
                frac_kin_content = 0.5
                kin_cont = np.cumsum(-1. / np.log(np.abs(current_eigenvals[1:]))) / 2.
                cut = kin_cont[kin_cont < kin_cont.max() * frac_kin_content]
                num_macrostates = min(max(cut.shape[0], 1), num_visited_microstates)

            macrostate_method = 'pcca'
            #macrostate_method = 'kmeans'
            if macrostate_method == 'pcca':
                m.pcca(num_macrostates)
                macrostate_assignments = {k: v for k, v in enumerate(m.metastable_sets)}
                largest_assign = m.metastable_assignments
                print("macrostate assignments", macrostate_assignments)
                print("mismatch", "largest_assign", largest_assign.shape,
                      "num_visited_microstates", num_visited_microstates)
                #all_assign = largest_assign
                all_assign = np.zeros(num_visited_microstates)
                all_assign[all_connect] = largest_assign
                all_assign[not_connect] = np.arange(not_connect.shape[0]) + largest_assign.max() + 1
                print('time macrostate pcca finished', str(time.time() - time_start))
            else:
                kmeans_obj = pyemma.coordinates.cluster_kmeans(
                    data=projected_microstate_coords_scaled, k=num_macrostates, max_iter=10)
                largest_assign = kmeans_obj.assign()[0]
                print('time macrostate kmeans finished', str(time.time() - time_start))
                all_assign = np.zeros(num_visited_microstates)
                all_assign[all_connect] = largest_assign
                all_assign[not_connect] = np.arange(not_connect.shape[0]) + largest_assign.max() + 1
            macrostate_assignment_of_visited_microstates = all_assign.astype('int')
            np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_msm_macrostates.npy',
                    macrostate_assignment_of_visited_microstates)
            print("all_assign", all_assign)

            select_macro_type = 'sto_inv_linear'
            if select_macro_type == 'dmdmd':
                macrostate_counts = np.array([
                    np.sum(s[states_unique][macrostate_assignment_of_visited_microstates == macrostate_label])
                    for macrostate_label in range(macrostate_assignment_of_visited_microstates.max() + 1)])
                selected_macrostate = select_restart_state(
                    macrostate_counts[macrostate_counts > 0], 'rand',
                    np.arange(macrostate_counts.shape[0])[macrostate_counts > 0],
                    nparallel=nparallel)
                #print(macrostate_counts[macrostate_counts > 0], np.arange(num_macrostates)[macrostate_counts > 0], selected_macrostate)
            elif select_macro_type == 'sto_inv_linear':
                macrostate_counts = np.array([
                    np.sum(s[states_unique][macrostate_assignment_of_visited_microstates == macrostate_label])
                    for macrostate_label in range(macrostate_assignment_of_visited_microstates.max() + 1)])
                selected_macrostate = select_restart_state(
                    macrostate_counts[macrostate_counts > 0], 'sto_inv_linear',
                    np.arange(macrostate_counts.shape[0])[macrostate_counts > 0],
                    nparallel=n_pick)
            print("macrostate_counts", macrostate_counts)
            print("selected_macrostate", selected_macrostate)

            select_micro_within_macro_type = 'sto_inv_linear'
            restart_state = np.empty((0))
            for i in range(n_pick):
                selected_macrostate_mask = (macrostate_assignment_of_visited_microstates == selected_macrostate[i])
                #print(selected_macrostate, microstate_transitions_used[visited_microstates], macrostate_counts, counts[states_unique][selected_macrostate])
                counts_in_selected_macrostate = s[states_unique][selected_macrostate_mask]
                #print(parameters['select_micro_within_macro_type'])
                if select_micro_within_macro_type == 'sto_inv_linear':
                    # within a macrostate, select a microstate based on count
                    add_microstate = select_restart_state(
                        counts_in_selected_macrostate, 'sto_inv_linear',
                        visited_microstates[selected_macrostate_mask], nparallel=1)
                elif select_micro_within_macro_type == 'rand':
                    add_microstate = select_restart_state(
                        counts_in_selected_macrostate, 'rand',
                        visited_microstates[selected_macrostate_mask], nparallel=1)
                #restart_state = [np.random.choice(visited_microstates[selected_macrostate_mask])] * nparallel
                restart_state = np.append(restart_state, add_microstate)
                #print(i, selected_macrostate[i], add_microstate)
            state_picks = restart_state.astype('int')
            print("state_picks", state_picks)
            print("no exceptions")
            #except:
            #    state_picks = np.random.choice(np.arange(len(q)), size=n_pick, p=q)
            #    print("state_picks", state_picks)
            #    print("exception found")
    else:
        print("didn't recognize strategy")
    print("selected msm restarts", state_picks)

    picks = [frame_state_list[state][np.random.randint(0, len(frame_state_list[state]))]
             for state in state_picks]
    traj_select = [traj_files[pick[0]] for pick in picks]
    frame_select = [pick[1] * tica_stride * msm_stride for pick in picks]
    print('traj_select picks', picks)
    print('frame_select', traj_select)
    print('time frame selection finished', str(time.time() - time_start))

    text_file = open(args.path + "/traj_select.txt", "w")
    for idx in range(n_pick):
        text_file.write(traj_select[idx] + ' to iter ' + str(args.cur_iter) + ' idx ' + str(idx) + ' \n')
    text_file.close()

    # write new input files from frames
    for idx in range(n_pick):
        tmp = md.load(args.path + '/iter0_input0.pdb')
        files = md.load(traj_select[idx], top=args.path + '/iter0_input0.pdb')
        tmp.xyz[0, :, :] = files.xyz[frame_select[idx], :, :]
        tmp.save_pdb(args.path + '/iter' + str(args.cur_iter + 1) + '_input' + str(idx) + '.pdb')
    print('time writing new frames finished', str(time.time() - time_start))

    # rg rmsd
    original_file = md.load(args.path + '/' + args.ref)  # '/iter0_input0.pdb')
    out_files = glob.glob(args.path + '/iter*_out*.pdb')
    out_files.sort()
    #print(md.rmsd(md.load(out_files2[2]), original_file, atom_indices=heavy)[0])
    BETA_CONST = 50  # 1/nm
    LAMBDA_CONST = 1.8
    NATIVE_CUTOFF = 0.45  # nanometers
    heavy = original_file.topology.select_atom_indices('heavy')
    heavy_pairs = np.array(
        [(i, j) for (i, j) in combinations(heavy, 2)
         if abs(original_file.topology.atom(i).residue.index -
                original_file.topology.atom(j).residue.index) > 3])
    # compute the distances between these pairs in the native state
    heavy_pairs_distances = md.compute_distances(original_file[0], heavy_pairs)[0]
    # and get the pairs s.t. the distance is less than NATIVE_CUTOFF
    native_contacts = heavy_pairs[heavy_pairs_distances < NATIVE_CUTOFF]
    r0 = md.compute_distances(original_file[0], native_contacts)

    rg_arr = []
    rmsd_arr = []
    q_arr = []
    for file in out_files:
        file2 = md.load(file)
        rmsd_val = md.rmsd(file2, original_file, atom_indices=heavy)[0]
        rg_arr.append(md.compute_rg(file2)[0])
        rmsd_arr.append(rmsd_val)
        r = md.compute_distances(file2[0], native_contacts)
        q = np.mean(1.0 / (1 + np.exp(BETA_CONST * (r - LAMBDA_CONST * r0))), axis=1)[0]
        q_arr.append(q)
    rg_arr = np.array(rg_arr)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_rg_arr.npy', rg_arr)
    #print("rg values", rg_arr.min(), rg_arr.max(), rg_arr)
    rmsd_arr = np.array(rmsd_arr)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_rmsd_arr.npy', rmsd_arr)
    #print("rmsd values", rmsd_arr.min(), rmsd_arr.max(), rmsd_arr)
    q_arr = np.array(q_arr)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_q_arr.npy', q_arr)
    #print("Q values", q_arr.min(), q_arr.max(), q_arr)
    ########################################

    colornames = [name for name, color in matplotlib.colors.cnames.items()]
    tica0 = np.array([])
    tica1 = np.array([])
    for i in range(len(y)):
        tica0 = np.append(tica0, y[i][:, 0])
        tica1 = np.append(tica1, y[i][:, 1])

    clf()
    fig = figure()
    ax = fig.add_subplot(111)
    ax.scatter(np.arange(tica_obj.timescales.shape[0]), tica_obj.timescales)
    ax.set_ylabel('TICA Timescales (steps)')
    ax.set_xlabel('# TICA eigenvector')
    ax.set_yscale('log')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_timescales.png',
            bbox_inches='tight', dpi=200)

    cumvar = np.cumsum(tica_obj.timescales)
    cumvar /= cumvar[-1]
    clf()
    plot(cumvar, linewidth=2)
    for thres in [0.5, 0.8, 0.95]:
        threshold_index = np.argwhere(cumvar > thres)[0][0]
        print("tica thres, thres_idx", thres, threshold_index)
        vlines(threshold_index, 0.0, 1.0, linewidth=2)
        hlines(thres, 0, cumvar.shape[0], linewidth=2)
    xlabel('Eigenvalue Number', fontsize=16)
    ylabel('cumulative kinetic content', fontsize=16)
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_cumulative_kinetic_content.png',
            bbox_inches='tight', dpi=200)

    msm_timescales = m.timescales(100)
    clf()
    fig = figure()
    ax = fig.add_subplot(111)
    ax.scatter(np.arange(msm_timescales.shape[0]), msm_timescales * tica_stride)
    ax.set_ylabel('MSM Timescales (steps)')
    ax.set_xlabel('# MSM eigenvector')
    ax.set_yscale('log')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_timescales.png',
            bbox_inches='tight', dpi=200)

    cumvar = np.cumsum(m.timescales(100))
    cumvar /= cumvar[-1]
    clf()
    plot(cumvar, linewidth=2)
    for thres in [0.5, 0.8, 0.95]:
        threshold_index = np.argwhere(cumvar > thres)[0][0]
        print("msm thres, thres_idx", thres, threshold_index)
        vlines(threshold_index, 0.0, 1.0, linewidth=2)
        hlines(thres, 0, cumvar.shape[0], linewidth=2)
    xlabel('Eigenvalue Number', fontsize=16)
    ylabel('cumulative kinetic content', fontsize=16)
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_cumulative_kinetic_content.png',
            bbox_inches='tight', dpi=200)

    clf()
    xlabel("TICA ev0")
    ylabel("TICA ev1")
    cp = scatter(tica0, tica1, s=10, c='blue', marker='o', linewidth=0.,
                 cmap='jet', label='MSM states')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_evs.png',
            bbox_inches='tight', dpi=200)

    clf()
    fig, ax = plots.plot_free_energy(tica0, tica1, cmap='Spectral')
    xlabel("TICA ev0")
    ylabel("TICA ev1")
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_evs2.png',
            bbox_inches='tight', dpi=200)

    clf()
    fig, ax = plots.plot_free_energy(tica0, tica1, cmap='Spectral')
    cp = scatter(cl.clustercenters[:, 0], cl.clustercenters[:, 1], s=10, c='blue',
                 marker='o', linewidth=0., cmap='jet', label='MSM state centers')
    xlabel("TICA ev0")
    ylabel("TICA ev1")
    legend()
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_evs3_centers.png',
            bbox_inches='tight', dpi=200)

    # plot msm ev
    clf()
    xlabel("MSM ev1")
    ylabel("MSM ev2")
    cp = scatter(m.eigenvectors_right(10)[:, 1], m.eigenvectors_right(10)[:, 2],
                 s=10, c='blue', marker='o', linewidth=0., cmap='jet', label='MSM states')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_evs.png',
            bbox_inches='tight', dpi=200)

    # plot msm ev
    clf()
    fig, ax = plots.plot_free_energy(m.eigenvectors_right(10)[:, 1],
                                     m.eigenvectors_right(10)[:, 2],
                                     cmap='Spectral',
                                     weights=m.stationary_distribution, nbins=30)
    xlabel("MSM ev1")
    ylabel("MSM ev2")
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_evs2.png',
            bbox_inches='tight', dpi=200)

    clf()
    xlabel("RMSD")
    ylabel("Rg")
    cp = scatter(rmsd_arr, rg_arr, s=10, c='blue', marker='o', linewidth=0.,
                 cmap='jet', label='MSM states')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_rgrmsd.png',
            bbox_inches='tight', dpi=200)

    clf()
    fig, ax = plots.plot_free_energy(rmsd_arr, rg_arr, cmap='Spectral', nbins=30)
    xlabel("RMSD")
    ylabel("Rg")
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_rgrmsd2.png',
            bbox_inches='tight', dpi=200)

    clf()
    xlabel("Q")
    ylabel("Rg")
    cp = scatter(q_arr, rg_arr, s=10, c='blue', marker='o', linewidth=0.,
                 cmap='jet', label='MSM states')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_qrg.png',
            bbox_inches='tight', dpi=200)

    clf()
    fig, ax = plots.plot_free_energy(q_arr, rg_arr, cmap='Spectral', nbins=10)
    xlabel("Q")
    ylabel("Rg")
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_qrg_2.png',
            bbox_inches='tight', dpi=200)

    # Q 1d free energy
    clf()
    z, x = np.histogram(q_arr, bins=10)
    F = -np.log(z)
    F = F - F.min()
    plot(x[1:], F)
    scatter(x[1:], F)
    xlabel('Q', fontsize=15)
    ylabel('Free Energy [kT]', fontsize=15)
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_free_energy_q.png',
            bbox_inches='tight', dpi=200)

    # MSM 1d free energy
    clf()
    n_step = int(m.P.shape[0] / 10)
    bins = np.sort(m.eigenvectors_right(10)[:, 1])[::n_step]
    bins = np.append(bins, np.sort(m.eigenvectors_right(10)[:, 1])[-1])
    z, x = np.histogram(m.eigenvectors_right(10)[:, 1],
                        weights=m.stationary_distribution, density=True, bins=bins)
    F = -np.log(z)
    F = F - F.min()
    plot(x[1:], F)
    scatter(x[1:], F)
    xlabel('MSM ev1', fontsize=15)
    ylabel('Free Energy [kT]', fontsize=15)
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_free_energy.png',
            bbox_inches='tight', dpi=200)

    # which tica frames were selected
    tica0_sel = np.array([])
    tica1_sel = np.array([])
    for i in range(n_pick):
        tica0_sel = np.append(tica0_sel, y[picks[i][0]][frame_select[i], 0])
        tica1_sel = np.append(tica1_sel, y[picks[i][0]][frame_select[i], 1])
    clf()
    xlabel("TICA ev0")
    ylabel("TICA ev1")
    cp = scatter(tica0, tica1, s=10, c='blue', marker='o', linewidth=0.,
                 cmap='jet', label='all frames')
    cp = scatter(tica0_sel, tica1_sel, s=10, c='red', marker='o', linewidth=0.,
                 cmap='jet', label='selected')
    legend()
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_evs4_selected.png',
            bbox_inches='tight', dpi=200)

    # m.ck_test
    ck = m.cktest(2)
    clf()
    pyemma.plots.plot_cktest(ck, diag=True, figsize=(7, 7), layout=(2, 2),
                             padding_top=0.1, y01=False, padding_between=0.3,
                             dt=0.1, units='ns')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_cktest.png')

    #lags = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    #its = pyemma.msm.its(dtrajs, nits=10, lags=lags)
    #clf()
    #pyemma.plots.plot_implied_timescales(its, ylog=True, units='steps', linewidth=2)
    #xlim(0, 40); ylim(0, 120);
    #savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_its.png', bbox_inches='tight', dpi=200)

    its = pyemma.msm.its(dtrajs, errors='bayes', nits=10)
    clf()
    pyemma.plots.plot_implied_timescales(its, ylog=True, units='steps', linewidth=2)
    #xlim(0, 40); ylim(0, 120);
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_its2.png',
            bbox_inches='tight', dpi=200)

    #clf()
    #pyemma.plots.plot_implied_timescales(its, ylog=False, units='steps', linewidth=2, show_mle=False)
    ##xlim(0, 40); ylim(0, 120);
    #savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_its3.png', bbox_inches='tight', dpi=200)

    # which msm states were selected
    # warning: m covers only the connected set, c the full set
    #m.active_set
    #state_picks
    #msm_states
    p_picks_active = []
    for i in state_picks:
        if i in m.active_set:
            p_picks_active.append(np.argwhere(i == m.active_set)[0][0])
    p_picks_active = np.unique(np.array(p_picks_active)).astype(int)
    clf()
    xlabel("MSM ev1")
    ylabel("MSM ev2")
    cp = scatter(m.eigenvectors_right(10)[:, 1], m.eigenvectors_right(10)[:, 2],
                 s=10, c='blue', marker='o', linewidth=0., cmap='jet', label='MSM states')
    cp = scatter(m.eigenvectors_right(10)[p_picks_active, 1],
                 m.eigenvectors_right(10)[p_picks_active, 2],
                 s=10, c='red', marker='o', linewidth=0., cmap='jet', label='selected')
    legend(loc='center left', bbox_to_anchor=(1, 0.5))
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_evs_4_select.png',
            bbox_inches='tight', dpi=200)

    p_states = np.array([])
    p_unique = []
    for p_iter in range(p_iter_max + 1):
        p_arr = np.argwhere(iter_arr == p_iter)
        for i in p_arr:
            #print(i[0])
            p_states = np.append(p_states, dtrajs[i[0]])
        p_states = np.unique(p_states).astype(int)
        p_unique.append(p_states.shape[0])
    p_unique = np.array(p_unique)
    np.save(args.path + '/npy_iter' + str(args.cur_iter) + '_p_unique.npy', p_unique)
    clf()
    fig = figure()
    ax = fig.add_subplot(111)
    ax.scatter(np.arange(p_unique.shape[0]), p_unique)
    ax.set_ylabel('# of current msm states explored')
    ax.set_xlabel('iteration')
    #ax.set_yscale('log')
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_strategy.png',
            bbox_inches='tight', dpi=200)

    clf()
    xlabel("TICA ev0")
    ylabel("TICA ev1")
    for p_iter in range(p_iter_max, -1, -1):
        p_arr = np.argwhere(iter_arr == p_iter)
        tica0 = np.array([])
        tica1 = np.array([])
        for i in p_arr:
            #print(i[0])
            tica0 = np.append(tica0, y[i[0]][:, 0])
            tica1 = np.append(tica1, y[i[0]][:, 1])
        cp = scatter(tica0, tica1, s=10, marker='o', linewidth=0., cmap='jet',
                     c=colornames[p_iter], label='iter ' + str(p_iter))
    legend(loc='center left', bbox_to_anchor=(1, 0.5))
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_tica_evs5_iters.png',
            bbox_inches='tight', dpi=200)

    clf()
    xlabel("MSM ev1")
    ylabel("MSM ev2")
    for p_iter in range(p_iter_max, -1, -1):
        p_arr = np.argwhere(iter_arr == p_iter)
        p_states = np.array([])
        for i in p_arr:
            #print(i[0])
            p_states = np.append(p_states, dtrajs[i[0]])
        p_states = np.unique(p_states).astype(int)
        p_states_active = []
        for i in p_states:
            if i in m.active_set:
                p_states_active.append(np.argwhere(i == m.active_set)[0][0])
        p_states_active = np.unique(np.array(p_states_active)).astype(int)
        cp = scatter(m.eigenvectors_right(10)[p_states_active, 1],
                     m.eigenvectors_right(10)[p_states_active, 2],
                     s=10, marker='o', linewidth=0., cmap='spectral',
                     c=colornames[p_iter], label='iter ' + str(p_iter))
    legend(loc='center left', bbox_to_anchor=(1, 0.5))
    savefig(args.path + '/plot_iter' + str(args.cur_iter) + '_msm_evs_3_iter.png',
            bbox_inches='tight', dpi=200)

    print('time plotting finished', str(time.time() - time_start))
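The Q computed inline above is the soft fraction-of-native-contacts of Best and Hummer; a self-contained sketch of the same expression, with `traj` and `native` standing in for any mdtraj Trajectory and native-state reference (both hypothetical names), and parameters mirroring BETA_CONST, LAMBDA_CONST and NATIVE_CUTOFF:

import numpy as np
import mdtraj as md
from itertools import combinations

def best_hummer_q(traj, native, beta=50.0, lam=1.8, cutoff=0.45):
    # heavy-atom pairs more than 3 residues apart in the native structure
    heavy = native.topology.select_atom_indices('heavy')
    pairs = np.array([(i, j) for i, j in combinations(heavy, 2)
                      if abs(native.topology.atom(i).residue.index -
                             native.topology.atom(j).residue.index) > 3])
    d_native = md.compute_distances(native[0], pairs)[0]
    contacts = pairs[d_native < cutoff]          # native contacts
    r0 = md.compute_distances(native[0], contacts)
    r = md.compute_distances(traj, contacts)
    # smooth switching function, averaged over contacts -> per-frame Q
    return np.mean(1.0 / (1 + np.exp(beta * (r - lam * r0))), axis=1)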
def GetRgRee(traj, DOP, NP, NAtomsPerChain=None, plotDir='RgRee_plots',
             RgDatName='RgTimeSeries', ReeDatName='ReeTimeSeries',
             RgStatOutName='RgReeStats', Ext='.dat', res0Id=0,
             autowarmup=True, nwarmup=100, plot=False):
    """NAtomsPerChain: used if running a CG system; if provided, assumes there
    is one residue per chain.
    Multiply coordinates by 10 if the input traj was generated by LAMMPS and
    the unit is non-dimensional."""
    ElementDictionary = {
        "carbon": 12.01,
        "hydrogen": 1.008,
        "oxygen": 16.00,
        "nitrogen": 14.001,
        "virtual site": 1.0,
        "sodium": 23.0,
        "chloride": 35.5
    }

    if plot:
        try:
            os.mkdir(plotDir)
        except:
            pass
        print('...Rg and Ree plots will be saved in {}...\n'.format(plotDir))

    RgTimeseries = [range(traj.n_frames)]
    Rgheader = "Frame "
    RgSqStats = []
    RgSqTimeseries = [range(traj.n_frames)]
    RgSqheader = "Frame "
    RgSqList = []
    txtRg = ""

    ReeTimeseries = [range(traj.n_frames)]
    Reeheader = "Frame "
    ReeSqStats = []
    ReeSqTimeseries = [range(traj.n_frames)]
    ReeSqheader = "Frame "
    ReeSqList = []

    # get indices of residues in all chains
    MoleculeResidueList = []
    BlockResName = []
    if not NAtomsPerChain:
        # number of residues per chain = DOP (for AA systems)
        for j in range(NP):
            resId = range(res0Id + j * DOP, res0Id + (j + 1) * DOP)
            MoleculeResidueList.append(resId)
            resname = []
            for res in traj.topology.residues:
                if res.index in resId:
                    resname.append(res.name)
            # check if diblock
            if j == 0:
                resname1 = resname[0]
                resname2 = resname[-1]
                i1 = np.where(np.array(resname) == resname1)[0]
                i2 = np.where(np.array(resname) == resname2)[0]
                if np.min(i1) == np.min(resId) and int(np.min(i2) - np.max(i1)) == 1 \
                        and np.max(i2) == np.max(resId):
                    block = True
                    BlockResName = [resname1, resname2]
                    RgSqList_b = [[], []]
                    RgSqStats_b = [[], []]
                    print('Detect diblock:\n block 1: {} {}-mer, block 2: {} {}-mer'
                          .format(resname1, len(i1), resname2, len(i2)))
                else:
                    block = False
    else:
        # 1 residue per chain (for CG systems)
        a0Id = [atom.index for atom in traj.topology.residue(res0Id).atoms]
        a0Id = np.min(a0Id)
        for i in range(NP):
            atomId_per_chain = range(a0Id + i * NAtomsPerChain,
                                     a0Id + i * NAtomsPerChain + NAtomsPerChain)
            resId_tmp = [traj.topology.atom(aId).residue.index
                         for aId in atomId_per_chain]
            MoleculeResidueList.append(np.unique(resId_tmp))
        block = False

    for j, resId in enumerate(MoleculeResidueList):
        resIdLow = np.min(resId)
        resIdUp = np.max(resId)
        atom_indices = traj.topology.select('resid {} to {}'.format(resIdLow, resIdUp))
        if block:
            atom_indices_b = []
            mass_list_b = []
            for resname in BlockResName:
                ii = traj.topology.select(
                    "resid {} to {} and resname '{}'".format(resIdLow, resIdUp, resname))
                atom_indices_b.append(ii)
                tmp = []
                for index in ii:
                    element = str(traj.topology.atom(index).element)
                    try:
                        mass = ElementDictionary[element]
                    except:
                        mass = 1.
                    tmp.append(mass)
                tmp = np.array(tmp)
                mass_list_b.append(tmp)

        mass_list = []
        for index in atom_indices:
            element = str(traj.topology.atom(index).element)
            try:
                mass = ElementDictionary[element]
            except:
                mass = 1.
            mass_list.append(mass)
        mass_list = np.array(mass_list)

        if j == 0:
            print('Indices of atoms in chain {} \n{}'.format(j + 1, atom_indices))
            print('Mass of atoms in a chain {}'.format(mass_list))
        print('Evaluate Rg and Ree of chain {}/{}'.format(j + 1, len(MoleculeResidueList)))

        '''=== Compute Rg ==='''
        Rg = md.compute_rg(traj.atom_slice(atom_indices), masses=mass_list)
        RgTimeseries.append(Rg.tolist())
        Rgheader += 'Rg{} '.format(j + 1)
        np.savetxt(RgDatName + Ext, np.transpose(RgTimeseries), fmt='%5.5f', header=Rgheader)

        RgSq = Rg**2.
        RgSqTimeseries.append(RgSq.tolist())
        # was appended to Rgheader in the original, leaving RgSqheader empty
        RgSqheader += 'Rg{}^2 '.format(j + 1)
        np.savetxt('RgSqTimeSeries' + Ext, np.transpose(RgSqTimeseries),
                   fmt='%5.5f', header=RgSqheader)

        # do stats on Rg^2
        file = open('RgSqTimeSeries' + Ext, 'r')
        if autowarmup:
            warmup, Data, nwarmup = stats.autoWarmupMSER(file, j + 1)
            #print("Auto warmup detection with MSER-5 => ", nwarmup)
        else:
            warmup, Data = stats.extractData(file, j + 1, nwarmup)
        (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor) = stats.doStats(
            warmup, Data, False, False, '_{0}_mol{1}'.format(file.name, j + 1))
        Data = Data[::int(np.max([1., kappa]))]  # get decorrelated samples
        RgSqList.extend(Data)

        lines = ""
        lines += '\n==== Rg^2 for molecule {} ===='.format(j + 1)
        lines += "\n  - Mean                    = {} +/- {}".format(mean, semcc)
        lines += "\n  - Equilibrated samples    = {}".format(nsamples)
        lines += "\n  - Correlation time        = {}".format(kappa)
        lines += "\n  - Effective # samples     = {}".format(nsamples / kappa)
        lines += "\n  - Reduced-bias variance   = {}".format(unbiasedvar)
        # note that there is no unbiased estimator for the population standard
        # deviation; we can use sqrt(var) as an indicative estimator.
        lines += "\n  - S.D. (unbiased, biased) = {} {}".format(
            np.sqrt(unbiasedvar),
            np.std(Data, ddof=0))  # ddof is the correction to 1/N; ddof=1 gives the regular reduced-bias estimator
        lines += "\n  - Min, Max                = {} {}\n".format(min, max)
        txtRg += lines

        Avg = mean
        Std = np.sqrt(unbiasedvar)
        Err = semcc
        CorrTime = kappa
        NUncorrSamples = nsamples / kappa
        RgSqStats.append([Avg, Std, CorrTime, Err, NUncorrSamples])

        ''' Plot Rg '''
        if plot:
            plt.axvspan(0, nwarmup, alpha=0.5, color='#6495ED')
            plt.plot(Rg, "k-")
            plt.xlim(0)
            plt.xlabel('timestep')
            plt.ylabel('Radius-of-gyration')
            plt.savefig("{}/Rg{}.png".format(plotDir, j + 1), bbox_inches='tight')
            plt.close()

        ''' Rg of blocks '''
        if block:
            Rg_b = []
            for i, ai in enumerate(atom_indices_b):
                Rg_tmp = md.compute_rg(traj.atom_slice(ai), masses=mass_list_b[i])
                Rg_b.append(Rg_tmp)
            Rg_b = np.array(Rg_b)
            RgSq_b = Rg_b**2.
            for i, RgSq in enumerate(RgSq_b):
                data = [range(0, len(RgSq))]
                data.append(RgSq.tolist())
                np.savetxt('tmp.dat', np.transpose(data), fmt='%5.5f')
                # do stats on Rg^2
                file = open('tmp.dat', 'r')
                if autowarmup:
                    warmup, Data, nwarmup = stats.autoWarmupMSER(file, 1)
                else:
                    warmup, Data = stats.extractData(file, 1, nwarmup)
                (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor) = stats.doStats(
                    warmup, Data, False, False, '_{0}_mol{1}'.format(file.name, 1))
                Data = Data[::int(np.max([1., kappa]))]  # get decorrelated samples
                RgSqList_b[i].extend(Data)
                Avg = mean
                Std = np.sqrt(unbiasedvar)
                Err = semcc
                CorrTime = kappa
                NUncorrSamples = nsamples / kappa
                RgSqStats_b[i].append([Avg, Std, CorrTime, Err, NUncorrSamples])
                os.remove("tmp.dat")

        '''=== Compute Ree ==='''
        atom_pairs = [np.min(atom_indices), np.max(atom_indices)]
        Ree = md.compute_distances(traj, atom_pairs=[atom_pairs], periodic=False, opt=True)
        Ree = Ree.tolist()
        Ree = [a[0] for a in Ree]
        ReeTimeseries.append(Ree)
        Reeheader += 'Ree{} '.format(j + 1)
        np.savetxt(ReeDatName + Ext, np.transpose(ReeTimeseries), fmt='%5.5f', header=Reeheader)

        ReeSq = np.array(Ree)**2.
        ReeSqTimeseries.append(ReeSq.tolist())
        # was appended to Reeheader in the original, leaving ReeSqheader empty
        ReeSqheader += 'Ree{}^2 '.format(j + 1)
        np.savetxt('ReeSqTimeSeries' + Ext, np.transpose(ReeSqTimeseries),
                   fmt='%5.5f', header=ReeSqheader)

        # do stats on Ree^2
        file = open('ReeSqTimeSeries' + Ext, 'r')
        if autowarmup:
            warmup, Data, nwarmup = stats.autoWarmupMSER(file, j + 1)
            #print("Auto warmup detection with MSER-5 => ", nwarmup)
        else:
            warmup, Data = stats.extractData(file, j + 1, nwarmup)
        (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor) = stats.doStats(
            warmup, Data, False, False, '_{0}_mol{1}'.format(file.name, j + 1))
        Data = Data[::int(np.max([1., kappa]))]
        ReeSqList.extend(Data)

        lines = ""
        lines += '\n==== Ree^2 for molecule {} ===='.format(j + 1)
        lines += "\n  - Mean                    = {} +/- {}".format(mean, semcc)
        lines += "\n  - Equilibrated samples    = {}".format(nsamples)
        lines += "\n  - Correlation time        = {}".format(kappa)
        lines += "\n  - Effective # samples     = {}".format(nsamples / kappa)
        lines += "\n  - Reduced-bias variance   = {}".format(unbiasedvar)
        lines += "\n  - S.D. (unbiased, biased) = {} {}".format(
            np.sqrt(unbiasedvar),
            np.std(Data, ddof=0))
        lines += "\n  - Min, Max                = {} {}\n".format(min, max)
        txtRg += lines

        Avg = mean
        Std = np.sqrt(unbiasedvar)
        Err = semcc
        CorrTime = kappa
        NUncorrSamples = nsamples / kappa
        ReeSqStats.append([Avg, Std, CorrTime, Err, NUncorrSamples])

        ''' Plot Ree '''
        if plot:
            plt.axvspan(0, nwarmup, alpha=0.5, color='#6495ED')
            plt.plot(Ree, "k-")
            plt.xlim(0)
            plt.xlabel('timestep')
            plt.ylabel('End-to-end distance')
            plt.savefig("{}/Ree{}.png".format(plotDir, j + 1), bbox_inches='tight')
            plt.close()

    # get RMS Rg and Ree
    RgSqList = np.array(RgSqList)
    RgRMS = np.sqrt(np.mean(RgSqList))
    RgSqErr = scipy.stats.sem(RgSqList)
    RgRMSErr = 1. / 2. * RgSqErr / RgRMS  # propagate SEM of Rg^2 to Rg
    RgSqStd = np.std(RgSqList, ddof=1)
    RgRMSStd = 1. / 2. * RgSqStd / RgRMS  # propagate Std of Rg^2 to Rg
    RgSqStats = np.array(RgSqStats)
    RgRMSCorrTime = np.mean(RgSqStats[:, 2])
    RgRMSCorrTimeErr = np.sqrt(np.var(RgSqStats[:, 2]) / len(RgSqStats[:, 2]))
    RgRMSNUncorrSamples = np.mean(RgSqStats[:, 4])

    # Rg of blocks
    RgRMS_b = []
    RgRMSErr_b = []
    RgRMSStd_b = []
    RgRMSCorrTime_b = []
    RgRMSCorrTimeErr_b = []
    RgRMSNUncorrSamples_b = []
    if block:
        for i, resname in enumerate(BlockResName):
            RgSqList = np.array(RgSqList_b[i])
            RgRMS_b.append(np.sqrt(np.mean(RgSqList)))
            Err = scipy.stats.sem(RgSqList)
            RgRMSErr_b.append(1. / 2. * Err / RgRMS_b[i])
            Std = np.std(RgSqList, ddof=1)
            RgRMSStd_b.append(1. / 2. * Std / RgRMS_b[i])
            RgSqStats = np.array(RgSqStats_b[i])
            RgRMSCorrTime_b.append(np.mean(RgSqStats[:, 2]))
            RgRMSCorrTimeErr_b.append(np.sqrt(np.var(RgSqStats[:, 2]) / len(RgSqStats[:, 2])))
            RgRMSNUncorrSamples_b.append(np.mean(RgSqStats[:, 4]))

    # Ree
    ReeSqList = np.array(ReeSqList)
    ReeRMS = np.sqrt(np.mean(ReeSqList))
    ReeSqErr = scipy.stats.sem(ReeSqList)
    ReeRMSErr = 1. / 2. * ReeSqErr / ReeRMS
    ReeSqStd = np.std(ReeSqList, ddof=1)
    ReeRMSStd = 1. / 2. * ReeSqStd / ReeRMS
    ReeSqStats = np.array(ReeSqStats)
    ReeRMSCorrTime = np.mean(ReeSqStats[:, 2])
    ReeRMSCorrTimeErr = np.sqrt(np.var(ReeSqStats[:, 2]) / len(ReeSqStats[:, 2]))
    ReeRMSNUncorrSamples = np.mean(ReeSqStats[:, 4])

    lines = ""
    lines += '\n\n====================='
    lines += '\n\nRMS of Rg is: {0:2.4f} +/- {1:2.5f}'.format(RgRMS, RgRMSErr)
    lines += '\nRMS Rg correlation time: {0:5.4f} +/- {1:5.6f}'.format(
        RgRMSCorrTime, RgRMSCorrTimeErr)
    lines += '\n\nRMS of Ree is: {0:2.4f} +/- {1:2.5f}'.format(ReeRMS, ReeRMSErr)
    lines += '\nRMS Ree correlation time: {0:5.4f} +/- {1:5.6f}'.format(
        ReeRMSCorrTime, ReeRMSCorrTimeErr)
    if block:
        for i, resname in enumerate(BlockResName):
            lines += '\n\nRMS of Rg for block %i-%s is: %2.4f +/- %2.5f' % (
                i + 1, resname, RgRMS_b[i], RgRMSErr_b[i])
            lines += '\nRMS Rg correlation time: {0:5.4f} +/- {1:5.6f}'.format(
                RgRMSCorrTime_b[i], RgRMSCorrTimeErr_b[i])
    print(lines + '\n')
    txtRg += lines

    f = open(RgStatOutName + Ext, 'w')
    f.write(txtRg)

    return (RgRMS, ReeRMS, RgRMSErr, ReeRMSErr, RgRMSCorrTime, RgRMSCorrTimeErr,
            RgRMSNUncorrSamples, ReeRMSCorrTime, ReeRMSCorrTimeErr,
            ReeRMSNUncorrSamples, RgRMSStd, ReeRMSStd, RgRMS_b, RgRMSErr_b,
            RgRMSStd_b, RgRMSCorrTime_b, RgRMSCorrTimeErr_b,
            RgRMSNUncorrSamples_b, BlockResName)
def rg(self):
    trj = md.Trajectory(self.xyz,
                        self.ADP.mdtraj_topology,
                        unitcell_lengths=self.dims)
    rg = md.compute_rg(trj)[0]
    return rg
import os, sys

from msmbuilder import Project
import mdtraj as md
from mdtraj import io
import numpy as np

project = Project.load_from("ProjectInfo-RRR.yaml")

# -1 marks padding, since trajectories have different lengths
Rgs = -1 * np.ones((project.n_trajs, max(project.traj_lengths)))
for i in range(project.n_trajs):
    t = project.load_traj(i)
    rg = md.compute_rg(t)
    Rgs[i][:len(rg)] = rg

io.saveh('Rgs-RRR.h5', Rgs)
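A hedged read-back sketch for the file saved above. Entries were initialised to -1 as padding for ragged trajectory lengths, so they should be masked out; the 'arr_0' key is an assumption based on saveh's numpy-savez-style naming of unnamed arrays:

from mdtraj import io
import numpy as np

Rgs = io.loadh('Rgs-RRR.h5', 'arr_0')   # 'arr_0' assumed as the default key
valid = Rgs >= 0                        # drop the -1 padding
print("mean Rg over all frames: %.3f nm" % Rgs[valid].mean())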
def RG(self):
    for seedi in range(len(self.FNSeeds)):
        self.RGs[seedi] = md.compute_rg(self.trajectories[seedi], masses=None)
        print("Radius of gyration:")
        print((self.RGs[seedi]).shape)
def get_rg(trj):
    # func isn't really needed at the moment, but maybe in future(?)
    return md.compute_rg(trj)
def getRgRee(trajFile, top, DOP, NP, NAtomsPerChain=None,
             RgDatName='RgTimeSeries', ReeDatName='ReeTimeSeries',
             RgStatOutName='RgReeStats', Ext='.dat', res0Id=0, stride=1,
             autowarmup=True, warmup=100, plot=False):
    """NAtomsPerChain: used if running a CG system; if provided, assumes there
    is one residue per chain."""
    ElementDictionary = {
        "carbon": 12.01,
        "hydrogen": 1.008,
        "oxygen": 16.00,
        "nitrogen": 14.001,
        "virtual site": 1.0,
        "virtual_site": 1.0,
        "sodium": 22.99,  # was the string "na+" in the original, which would break the numeric mass array
    }

    traj = md.load(trajFile, top=top, stride=stride)
    # traj.make_molecules_whole(inplace=True, sorted_bonds=None)  # automatically finds the bonds from the topology file

    RgStats = []
    RgTimeseries = [range(traj.n_frames)]
    Rgheader = "Frame "
    txtRg = ""

    ReeStats = []
    ReeTimeseries = [range(traj.n_frames)]
    Reeheader = "Frame "

    # get indices of residues in all chains
    MoleculeResidueList = []
    if not NAtomsPerChain:
        # number of residues per chain = DOP (for AA systems)
        for j in range(NP):
            resId = range(res0Id + j * DOP, res0Id + (j + 1) * DOP)
            MoleculeResidueList.append(resId)
    else:
        # 1 residue per chain (for CG systems)
        x = range(res0Id, res0Id + NP)
        MoleculeResidueList = [[a] for a in x]

    for j, resId in enumerate(MoleculeResidueList):
        resIdLow = np.min(resId)
        resIdUp = np.max(resId)
        atom_indices = traj.topology.select('resid {} to {}'.format(resIdLow, resIdUp))
        print('Indices of atoms in chain {} \n{}'.format(j + 1, atom_indices))
        mass_list = []
        for index in atom_indices:
            element = str(traj.topology.atom(index).element)
            try:
                mass = ElementDictionary[element]
            except:
                mass = 1.
            mass_list.append(mass)
        mass_list = np.array(mass_list)

        '''=== Compute Rg ==='''
        Rg = md.compute_rg(traj.atom_slice(atom_indices), masses=mass_list)
        RgTimeseries.append(Rg.tolist())
        Rgheader += 'Rg{} '.format(j + 1)
        np.savetxt(RgDatName + Ext, np.transpose(RgTimeseries), fmt='%5.5f', header=Rgheader)

        # do stats
        file = open(RgDatName + Ext, 'r')
        if autowarmup:
            warmup, Data, nwarmup = stats.autoWarmupMSER(file, j + 1)
            print("Auto warmup detection with MSER-5 => ", nwarmup)
        else:
            warmup, Data = stats.extractData(file, j + 1, warmup)
        (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor) = stats.doStats(
            warmup, Data, False, False, '_{0}_mol{1}'.format(file.name, j + 1))

        lines = ""
        lines += '\n==== Rg for molecule {} ===='.format(j + 1)
        lines += "\n  - Mean                    = {} +/- {}".format(mean, semcc)
        lines += "\n  - Equilibrated samples    = {}".format(nsamples)
        lines += "\n  - Correlation time        = {}".format(kappa)
        lines += "\n  - Effective # samples     = {}".format(nsamples / kappa)
        lines += "\n  - Reduced-bias variance   = {}".format(unbiasedvar)
        # note that there is no unbiased estimator for the population standard
        # deviation; we can use sqrt(var) as an indicative estimator.
        lines += "\n  - S.D. (unbiased, biased) = {} {}".format(
            np.sqrt(unbiasedvar),
            np.std(Data, ddof=0))  # ddof is the correction to 1/N; ddof=1 gives the regular reduced-bias estimator
        lines += "\n  - Min, Max                = {} {}\n".format(min, max)
        print(lines)
        txtRg += lines

        RgAvg = mean
        RgStd = np.sqrt(unbiasedvar)
        RgErr = semcc
        CorrTime = kappa
        NUncorrSamples = nsamples / kappa
        RgStats.append([RgAvg, RgStd, CorrTime, RgErr, NUncorrSamples])
        # print('The Rg for molecule {} (mean, error, std)'.format(j))
        # print('\t{0:2.4f}\t{1:2.5f}\t{2:2.5f}'.format(RgAvg, RgErr, RgStd))

        ''' Plot Rg '''
        if plot:
            plt.plot(Rg, "k-")
            plt.xlabel('timestep')
            plt.ylabel('Radius-of-gyration')
            plt.savefig("Rg{}.png".format(j + 1), bbox_inches='tight')
            plt.close()

        '''=== Compute Ree ==='''
        atom_pairs = [np.min(atom_indices), np.max(atom_indices)]
        Ree = md.compute_distances(traj, atom_pairs=[atom_pairs], periodic=False, opt=True)
        Ree = Ree.tolist()
        Ree = [a[0] for a in Ree]
        ReeTimeseries.append(Ree)
        Reeheader += 'Ree{} '.format(j + 1)
        np.savetxt(ReeDatName + Ext, np.transpose(ReeTimeseries), fmt='%5.5f', header=Reeheader)

        # do stats
        file = open(ReeDatName + Ext, 'r')
        if autowarmup:
            warmup, Data, nwarmup = stats.autoWarmupMSER(file, j + 1)
            print("Auto warmup detection with MSER-5 => ", nwarmup)
        else:
            warmup, Data = stats.extractData(file, j + 1, warmup)
        (nsamples, (min, max), mean, semcc, kappa, unbiasedvar, autocor) = stats.doStats(
            warmup, Data, False, False, '_{0}_mol{1}'.format(file.name, j + 1))

        lines = ""
        lines += '\n==== Ree for molecule {} ===='.format(j + 1)
        lines += "\n  - Mean                    = {} +/- {}".format(mean, semcc)
        lines += "\n  - Equilibrated samples    = {}".format(nsamples)
        lines += "\n  - Correlation time        = {}".format(kappa)
        lines += "\n  - Effective # samples     = {}".format(nsamples / kappa)
        lines += "\n  - Reduced-bias variance   = {}".format(unbiasedvar)
        lines += "\n  - S.D. (unbiased, biased) = {} {}".format(
            np.sqrt(unbiasedvar),
            np.std(Data, ddof=0))
        lines += "\n  - Min, Max                = {} {}\n".format(min, max)
        print(lines)
        txtRg += lines

        ReeAvg = mean
        ReeStd = np.sqrt(unbiasedvar)
        ReeErr = semcc
        CorrTime = kappa
        NUncorrSamples = nsamples / kappa
        ReeStats.append([ReeAvg, ReeStd, CorrTime, ReeErr, NUncorrSamples])

        ''' Plot Ree '''
        if plot:
            plt.plot(Ree, "k-")
            plt.xlabel('timestep')
            plt.ylabel('End-to-end distance')
            plt.savefig("Ree{}.png".format(j + 1), bbox_inches='tight')
            plt.close()

    # get averages of stats
    RgStats = np.array(RgStats)
    RgAvg = np.mean(RgStats[:, 0])
    RgStd = np.mean(RgStats[:, 1])
    RgCorrTime = np.mean(RgStats[:, 2])
    RgErr = np.mean(RgStats[:, 3])
    RgErr_Prop = np.sqrt(np.sum(RgStats[:, 3]**2)) / NP
    RgCorrTimeErr = np.sqrt(np.var(RgStats[:, 2]) / len(RgStats[:, 2]))
    RgNUncorrSamples = np.mean(RgStats[:, 4])

    ReeStats = np.array(ReeStats)
    ReeAvg = np.mean(ReeStats[:, 0])
    ReeStd = np.mean(ReeStats[:, 1])
    ReeCorrTime = np.mean(ReeStats[:, 2])
    ReeErr = np.mean(ReeStats[:, 3])
    ReeErr_Prop = np.sqrt(np.sum(ReeStats[:, 3]**2)) / NP
    ReeCorrTimeErr = np.sqrt(np.var(ReeStats[:, 2]) / len(ReeStats[:, 2]))
    ReeNUncorrSamples = np.mean(ReeStats[:, 4])

    lines = ""
    lines += '\n\n=====================\nTotal Rg average is: {0:2.3f} +/- {1:2.5f}'.format(RgAvg, RgErr)
    lines += '\nTotal Rg avg. correlation time: {0:5.4f} +/- {1:5.6f}'.format(RgCorrTime, RgCorrTimeErr)
    lines += '\n\nTotal Ree average is: {0:2.3f} +/- {1:2.5f}'.format(ReeAvg, ReeErr)
    lines += '\nTotal Ree avg. correlation time: {0:5.4f} +/- {1:5.6f}'.format(ReeCorrTime, ReeCorrTimeErr)
    print(lines)
    txtRg += lines

    f = open(RgStatOutName + Ext, 'w')
    f.write(txtRg)

    return (RgAvg, RgStd, RgErr, RgCorrTime, RgCorrTimeErr, RgNUncorrSamples,
            ReeAvg, ReeStd, ReeErr, ReeCorrTime, ReeCorrTimeErr, ReeNUncorrSamples)
traj.save_xtc(os.path.join(DATA_FOLDER, 'test.xtc'))

# rmsd = [md.rmsd(traj, traj[0]) for traj in traj_list]
# print(type(rmsd))

for traj in traj_list:
    x_axis = np.arange(0.0, 1.0 * len(traj), 1.0) / 4.0
    y_axis = md.rmsd(traj, traj[0])
    plt.plot(x_axis, y_axis, 'k')
plt.show()

for traj in traj_list:
    x_axis = np.arange(0.0, 1.0 * len(traj), 1.0) / 4.0
    traj = traj.superpose(traj[0], atom_indices=heme_indices)
    y_axis = md.rmsd(traj, traj[0], atom_indices=heme_indices)
    plt.plot(x_axis, y_axis, 'k')
plt.show()

for traj in traj_list:
    x_axis = np.arange(0.0, 1.0 * len(traj), 1.0) / 4.0
    traj = traj.superpose(traj[0])
    y_axis = md.compute_rg(traj)
    plt.plot(x_axis, y_axis, 'k')
plt.show()
def rg_feature(traj):
    return md.compute_rg(traj).astype(np.float32).reshape(-1, 1)
def compute_rg(fname, topname, step=1):
    rg = []
    for chunk in md.iterload(fname, top=topname, stride=step):
        rg.append(md.compute_rg(chunk))
    rg = np.concatenate(rg)
    return rg
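A quick usage sketch of the chunked helper above (file names are placeholders). md.iterload streams the trajectory chunk by chunk, so memory stays bounded even for long trajectories:

rg = compute_rg("production.xtc", "system.pdb", step=10)
print(rg.shape, rg.mean())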
def sample(MD_trajectories, MD_top, projected_trajectories, atom_selection = None, proj_idxs=[0,1], n_points=100, n_geom_samples=1, keep_all_samples = False, proj_stride=1, verbose=False, return_data=False ): r""" Returns a sample of molecular geometries and their positions in the projected space Parameters ---------- MD_trajectories : list of strings Filenames (any extension that :py:obj:`mdtraj` can read is accepted) containing the trajectory data. There is an untested input mode where the user parses directly :obj:`mdtraj.Trajectory` objects MD_top : str to topology filename or directly :obj:`mdtraj.Topology` object projected_trajectories : (list of) strings or (list of) numpy ndarrays of shape (n_frames, n_dims) Time-series with the projection(s) that want to be explored. You can provide .npy-filenames or readable asciis (.dat, .txt etc). Alternatively, you can feed in your own PyEMMA-clustering object NOTE: molpx assumes that there is no time column. atom_selection : string or iterable of integers, default is None The geometries of the original trajectory files will be filtered down to these atoms. It can be any DSL string that mdtraj.Topology.select could understand or directly the iterable of integers. If :py:obj`MD_trajectories` is already a (list of) md.Trajectory objects, the atom-slicing can take place before calling this method. proj_idxs: int, default is None Selection of projection idxs (zero-idxd) to visualize. The default behaviour is that proj_idxs = range(n_projs). However, if proj_idxs != None, then n_projs is ignored and proj_dim is set automatically n_points : int, default is 100 Number of points along the projection path. The higher this number, the higher the projected coordinate is resolved, at the cost of more computational effort. It's a trade-off parameter n_geom_samples : int, default is 1 For each of the :obj:`n_points` along the projection path, :obj:`n_geom_samples` will be retrieved from the trajectory files. The higher this number, the *smoother* the minRMSD projection path. Also, the longer it takes for the path to be computed. This is a trade-off parameter between how smooth the transitons between geometries can be and how long it takes to generate the sample keep_all_samples : boolean, default is False In principle, once the closest-to-ref geometry has been kept, the other geometries are discarded, and the output sample contains only n_point geometries. There are, still, special cases where the user might want to keep all sampled geometries. Typical use-case is when the n_points is low and many representatives per clustercenters will be much more informative than the other way around. This is an advanced feature that other methods of molPX use internally for generating overlays, be awere that it changes the return type of :obj:`geom_smpl` from the default (an :obj:`mdtraj.Trajectory` with :obj:`n_points`-frames) to a list list of length :obj:`n_geom_samples`, each element is an :obj:`mdtraj.Trajectory` object of :obj:`n_points`-frames proj_stride : int, default is 1 Stride value that was used in the :obj:`projected_trajectories` relative to the :obj:`MD_trajectories` If the original :obj:`MD_trajectories` were stored every 5 ps but the projected trajectories were stored every 50 ps, :obj:`proj_stride` = 10 has to be provided, otherwise an exception will be thrown informing the user that the :obj:`MD_trajectories` and the :obj:`projected_trajectories` have different number of frames. 
    Returns
    -------

    pos : ndarray with the positions of the sample

    geom_smpl : sampled geometries. Can be of two types:

        * default: :obj:`mdtraj.Trajectory` with :obj:`n_points` frames
        * if keep_all_samples = True: list of length :obj:`n_geom_samples`. Each
          element is an :obj:`mdtraj.Trajectory` object of :obj:`n_points` frames.
    """

    MD_trajectories = _bmutils.listify_if_not_list(MD_trajectories)
    if isinstance(MD_trajectories[0], _md.Trajectory):
        src = MD_trajectories
    else:
        src = _source(MD_trajectories, top=MD_top)

    # Find out if we already have a clustering object
    idata = None
    try:
        projected_trajectories.dtrajs
        cl = projected_trajectories
    except AttributeError:
        idata = _bmutils.data_from_input(projected_trajectories)
        cl = _bmutils.regspace_cluster_to_target([dd[:, proj_idxs] for dd in idata], n_points,
                                                 n_try_max=10, verbose=verbose)

    pos = cl.clustercenters
    cat_smpl = cl.sample_indexes_by_cluster(_np.arange(cl.n_clusters), n_geom_samples)

    geom_smpl = _bmutils.save_traj_wrapper(src, _np.vstack(cat_smpl), None, top=MD_top, stride=proj_stride)

    atom_slice = _bmutils.parse_atom_sel(atom_selection, geom_smpl.top)
    if atom_slice is not None:
        geom_smpl = geom_smpl.atom_slice(atom_slice)

    if n_geom_samples > 1:
        geom_smpl = _bmutils.re_warp(geom_smpl, [n_geom_samples] * cl.n_clusters)
        if not keep_all_samples:
            # Of the most populated cluster, get the most compact geometry...
            most_pop = _np.bincount(_np.hstack(cl.dtrajs)).argmax()
            geom_most_pop = geom_smpl[most_pop][_md.compute_rg(geom_smpl[most_pop]).argmin()]
            # ...and keep, for every cluster, the sampled geometry closest to it
            geom_smpl = _bmutils.slice_list_of_geoms_to_closest_to_ref(geom_smpl, geom_most_pop)
        else:
            geom_smpl = _bmutils.transpose_geom_list(geom_smpl)

    if not return_data:
        return pos, geom_smpl
    else:
        # idata is None if an already-built clustering object was passed in
        return pos, geom_smpl, idata
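# A minimal usage sketch, assuming hypothetical placeholder files "traj.xtc", "top.pdb"
# and a pre-computed 2D projection "Y.npy" (none of these ship with this collection):
if __name__ == "__main__":
    pos, geom_smpl = sample(["traj.xtc"], "top.pdb", ["Y.npy"],
                            proj_idxs=[0, 1], n_points=50, n_geom_samples=5)
    # pos holds the clustercenters in the projected space; geom_smpl holds, for each
    # of them, the sampled geometry that best represents the path (see docstring above)
    print(pos.shape, geom_smpl)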
def get_good_starting_point(cl, geom_samples, cl_order=None, strategy='smallest_Rgyr'):
    r"""
    Provided a pyemma clustering object and a list of geometries, return the index of
    the clustercenter that is best suited to start a minimally diffusing path.

    Parameters
    ----------

    cl : :obj:`pyemma.coordinates` clustering object

    geom_samples : list of :obj:`mdtraj.Trajectory` objects corresponding to each clustercenter in :obj:`cl`

    cl_order : None or iterable of integers
        The order of the list :obj:`geom_samples` may or may not correspond to the
        order of :obj:`cl`. Very often, :obj:`geom_samples` is sorted in ascending
        order of a given coordinate while the clustercenters in :obj:`cl` are not.
        :obj:`cl_order` represents this reordering, so that :obj:`geom_samples[cl_order]`
        reproduces the order of the clustercenters, and finally
        :obj:`geom_samples[cl_order][i]` contains the geometries sampled for the
        :obj:`i`-th clustercenter.

    strategy : str, default is 'smallest_Rgyr'
        Which property gets optimized

        * *smallest_Rgyr*: look for the geometries with the smallest radius of
          gyration (:obj:`mdtraj.compute_rg`), regardless of the population
        * *most_pop*: look for the clustercenter that is most populated, regardless
          of the associated geometries
        * *most_pop_x_smallest_Rgyr*: mix both criteria. Weight Rgyr values with the
          population to avoid highly compact but rarely populated structures
        * *bimodal_compact*: assume the distribution of clustercenters is bimodal,
          then locate its centers and choose the one with the smaller Rgyr
        * *bimodal_open*: assume the distribution of clustercenters is bimodal,
          then locate its centers and choose the one with the larger Rgyr

    Returns
    -------

    start_idx : int, index into the list :obj:`geom_samples`
        The :obj:`mdtraj.Trajectory` in :obj:`geom_samples[start_idx]` best satisfies
        the :obj:`strategy` criterion
    """
    if cl_order is None:
        cl_order = _np.arange(cl.n_clusters)
    if strategy == 'smallest_Rgyr':
        start_idx = _np.argmin([_md.compute_rg(igeoms).mean() for igeoms in geom_samples])
    elif strategy == 'most_pop':
        start_idx = (_np.bincount(_np.hstack(cl.dtrajs))[cl_order]).argmax()
    elif strategy == 'most_pop_x_smallest_Rgyr':
        rgyr = _np.array([_md.compute_rg(igeoms).mean() for igeoms in geom_samples])
        pop = (_np.bincount(_np.hstack(cl.dtrajs))[cl_order]).astype('float')
        # Normalize: invert rgyr so that the most compact geometries get the largest weight
        rgyr -= rgyr.min()
        rgyr = -rgyr + rgyr.max()
        rgyr /= rgyr.sum()
        pop /= pop.sum()
        start_idx = _np.argmax(rgyr * pop)
    elif strategy in ['bimodal_compact', 'bimodal_open']:
        # Assume bimodality in the coordinate of interest (usually the case, at least for TIC_0).
        # find_centers_gmm is a helper defined elsewhere in the same module.
        (left_idx, right_idx), igmm = find_centers_gmm(_np.vstack(cl.data_producer.data).reshape(-1, 1),
                                                       cl.clustercenters[cl_order].squeeze(),
                                                       n_components=2)
        # Bias towards starting points with compact structures (small radius of gyration)
        left_value, right_value = _md.compute_rg(geom_samples[left_idx]).mean(), \
                                  _md.compute_rg(geom_samples[right_idx]).mean()
        if strategy == 'bimodal_compact':
            start_idx = [left_idx, right_idx][_np.argmin([left_value, right_value])]
        else:
            start_idx = [left_idx, right_idx][_np.argmax([left_value, right_value])]
    else:
        raise NotImplementedError("Unknown starting-point strategy: %s" % strategy)
    return start_idx
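# A hedged usage sketch: "cl" and "geom_samples" below are placeholders; "cl" would be
# a pyemma clustering object (e.g. the one built inside sample() above) and
# "geom_samples" one mdtraj.Trajectory of candidate frames per clustercenter:
#
#   start_idx = get_good_starting_point(cl, geom_samples, strategy='most_pop_x_smallest_Rgyr')
#   start_geoms = geom_samples[start_idx]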