def test_rmsd_khybrid_mpi_subsample():
    """Run khybrid clustering with subsampling on five copies of one trajectory.

    Verifies that the center structures returned by ``runhelper`` have the
    same coordinates as the source-trajectory frames addressed by the
    returned center indices.
    """
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    TRJFILE = os.path.join(data_dir, 'frame0.xtc')
    TOPFILE = os.path.join(data_dir, 'native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'
    SUBSAMPLE_FACTOR = 3

    n_copies = 5
    expected_size = (n_copies, (np.ceil(501 / 3),) * n_copies)

    with tempfile.TemporaryDirectory() as tdname:
        # Replace the local directory name with the one broadcast from root=0.
        tdname = mpi.comm.bcast(tdname, root=0)

        for copy_i in range(n_copies):
            shutil.copy(TRJFILE, os.path.join(tdname, 'frame%s.xtc' % copy_i))

        cli_args = [
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-radius', '0.1',
            '--subsample', str(SUBSAMPLE_FACTOR),
            '--atoms', SELECTION,
            '--algorithm', 'khybrid',
        ]

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            _, _, center_inds, centers = runhelper(
                cli_args,
                expected_size=expected_size,
                expect_reassignment=False)

        # Each index pair is (trajectory, frame); all copies are identical,
        # so only the frame number is needed to look up the expected center.
        source = md.load(TRJFILE, top=TOPFILE)
        expected_centers = md.join([source[pair[1]] for pair in center_inds])

        assert_array_equal(expected_centers.xyz, md.join(centers).xyz)
def test_rmsd_cluster_mpi_subsample():
    """Run RMSD clustering with subsampling on five copies of one trajectory.

    Checks the returned center structures against the source frames and the
    returned assignments/distances against a serial
    ``assign_to_nearest_center`` reference, strided by the subsample factor.
    """
    TRJFILE = get_fn('frame0.xtc')
    TOPFILE = get_fn('native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'
    SUBSAMPLE_FACTOR = 3

    n_copies = 5
    expected_size = (n_copies, (np.ceil(501 / 3),) * n_copies)

    with tempfile.TemporaryDirectory() as tdname:
        # Replace the local directory name with the one broadcast from root=0.
        tdname = MPI.COMM_WORLD.bcast(tdname, root=0)

        for copy_i in range(n_copies):
            shutil.copy(TRJFILE, os.path.join(tdname, 'frame%s.xtc' % copy_i))

        cli_args = [
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-radii', '0.1',
            '--subsample', str(SUBSAMPLE_FACTOR),
            '--selection', SELECTION,
            '--random-state', '2',
            '--kmedoids-iters', '1',
        ]

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            assigs, dists, center_inds, centers = runhelper(
                cli_args, expected_size=expected_size)

        assigs = assigs.flatten()
        dists = dists.flatten()

        source = md.load(TRJFILE, top=TOPFILE)
        source_sele = source.atom_slice(source.top.select(SELECTION))

        # Center structures must be the full-atom frames named by the indices.
        expected_centers = md.join([source[pair[1]] for pair in center_inds])
        assert_array_equal(expected_centers.xyz, md.join(centers).xyz)

        # Serial reference: assign every frame of every copy, then keep every
        # SUBSAMPLE_FACTOR-th result to mirror the subsampled run.
        all_frames = md.join([source_sele] * n_copies)
        center_frames = md.join([source_sele[pair[1]] for pair in center_inds])
        expect_a, expect_d = assign_to_nearest_center(
            all_frames, center_frames, md.rmsd)

        assert_array_equal(expect_a[::SUBSAMPLE_FACTOR], assigs)
        assert_allclose(expect_d[::SUBSAMPLE_FACTOR], dists, atol=1e-4)
def test_rmsd_cluster_mpi_basic():
    """Run RMSD clustering on two copies of one trajectory, no kmedoids sweeps.

    Checks the returned center indices against a fixed expected list, the
    center structures against the source frames, and the assignments and
    distances against a serial ``assign_to_nearest_center`` reference.
    """
    expected_size = (2, 501)
    TRJFILE = get_fn('frame0.xtc')
    TOPFILE = get_fn('native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'

    with tempfile.TemporaryDirectory() as tdname:
        for copy_name in ('frame0.xtc', 'frame1.xtc'):
            shutil.copy(TRJFILE, os.path.join(tdname, copy_name))

        # Replace the local directory name with the one broadcast from root=0.
        comm = MPI.COMM_WORLD
        tdname = comm.bcast(tdname, root=0)
        print('rank', comm.Get_rank())
        comm.Barrier()

        cli_args = [
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-radii', '0.1',
            '--selection', SELECTION,
            '--kmedoids-iters', 0,
        ]
        assigs, dists, center_inds, centers = runhelper(
            cli_args, expected_size=expected_size)

        assigs = assigs.flatten()
        dists = dists.flatten()

        source = md.load(TRJFILE, top=TOPFILE)
        source_sele = source.atom_slice(source.top.select(SELECTION))

        expected_i = [[0, 0], [0, 42], [0, 430], [0, 319]]
        assert_array_equal(center_inds, expected_i)

        expected_centers = md.join([source[pair[1]] for pair in expected_i])
        assert_array_equal(expected_centers.xyz, md.join(centers).xyz)

        # Serial reference over both (identical) copies of the trajectory.
        all_frames = md.join([source_sele] * 2)
        center_frames = md.join([source_sele[pair[1]] for pair in expected_i])
        expect_a, expect_d = assign_to_nearest_center(
            all_frames, center_frames, md.rmsd)

        assert_array_equal(expect_a, assigs)
        assert_allclose(expect_d, dists, atol=1e-4)
def test_rmsd_khybrid_mpi_basic():
    """Run khybrid clustering on two copies of one trajectory.

    Checks the returned center indices against a fixed expected list, the
    center structures against the source frames, and the assignments and
    distances against a serial ``util.assign_to_nearest_center`` reference.
    """
    expected_size = (2, 501)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    TRJFILE = os.path.join(data_dir, 'frame0.xtc')
    TOPFILE = os.path.join(data_dir, 'native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'

    with tempfile.TemporaryDirectory() as tdname:
        shutil.copy(TRJFILE, os.path.join(tdname, 'frame0.xtc'))
        shutil.copy(TRJFILE, os.path.join(tdname, 'frame1.xtc'))

        # Replace the local directory name with the one broadcast from root=0.
        tdname = mpi.comm.bcast(tdname, root=0)
        mpi.comm.Barrier()

        a, d, idx, s = runhelper([
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-radius', '0.095',
            '--atoms', SELECTION,
            '--algorithm', 'khybrid'],
            expected_size=expected_size)

        a = a.flatten()
        d = d.flatten()

        trj = md.load(TRJFILE, top=TOPFILE)
        trj_sele = trj.atom_slice(trj.top.select(SELECTION))

        # Validate the indices first so a wrong result fails with a clear
        # comparison rather than while building the expected structures.
        expected_i = [[0, 0], [0, 55], [1, 102], [1, 196]]
        assert_array_equal(idx, expected_i)

        # Built once; the original computed this identical value twice.
        expected_s = md.join([trj[i[1]] for i in idx])
        assert_array_equal(expected_s.xyz, md.join(s).xyz)

        expect_a, expect_d = util.assign_to_nearest_center(
            md.join([trj_sele] * 2),
            md.join([trj_sele[i[1]] for i in idx]),
            md.rmsd)

        assert_array_equal(expect_a, a)
        assert_allclose(expect_d, d, atol=1e-4)
def test_exposons_pipeline_weighting(self):
    """Repeating frames must be equivalent to weighting them.

    A trajectory in which frames 0-2 appear twice is compared against the
    non-repeated trajectory with weight 2 on those frames and weight 1 on
    frames 3-5; both the MI result and the exposons must agree.
    """
    # Frames 0-2 twice, then frames 3-5 once.
    repeat_trj = md.join([self.trj[0:3], self.trj[0:3], self.trj[3:6]])
    # The same frames without duplication; the weights encode the repeat.
    norepeat_trj = md.join([self.trj[0:3], self.trj[3:6]])
    frame_weights = [2, 2, 2, 1, 1, 1]

    unweighted_mi, unweighted_exp = exposons(
        repeat_trj, 0.9, threshold=1.0)
    weighted_mi, weighted_exp = exposons(
        norepeat_trj, 0.9, threshold=1.0, weights=frame_weights)

    assert_allclose(unweighted_mi, weighted_mi, rtol=1e-14)
    assert_array_equal(unweighted_exp, weighted_exp)
def generate_traj_from_stateinds(inds, meta, atom_selection='all'):
    """
    Concatenate several frames from different trajectories to create a new one.

    Parameters
    ----------
    inds: list of tuples,
        Each element of the list has to be a 2-tuple of ints
        (traj_index, frame_index)
    meta: a metadata object
        Indexed with ``meta.loc[traj_index]``; each row must provide the
        ``'top_fn'`` and ``'traj_fn'`` entries.
    atom_selection: str,
        Which atoms to load

    Returns
    -------
    traj: mdtraj.Trajectory
        The selected frames joined in the order given by ``inds``, centered
        and superposed onto the first frame.
    """
    # Parsing a prmtop and running the selection is the same work for every
    # frame that shares a topology file, so do it once per file.
    atoms_by_top = {}
    frame_list = []
    for traj_i, frame_i in inds:
        row = meta.loc[traj_i]
        top_fn = row['top_fn']
        if top_fn not in atoms_by_top:
            atoms_by_top[top_fn] = mdtraj.load_prmtop(top_fn).select(
                atom_selection)
        frame_list.append(
            mdtraj.load_frame(row['traj_fn'],
                              atom_indices=atoms_by_top[top_fn],
                              index=frame_i,
                              top=top_fn)
        )
    traj = mdtraj.join(frame_list, check_topology=False)
    traj.center_coordinates()
    traj.superpose(traj, 0)
    return traj
def _execute(self, directory, available_resources):
    """Concatenate the input DCD trajectories into one file in ``directory``.

    Sets ``self.output_coordinate_path`` to the first truthy input coordinate
    path and ``self.output_trajectory_path`` to the written
    ``output_trajectory.dcd``.

    Raises
    ------
    ValueError
        If the numbers of coordinate and trajectory paths differ, or if no
        trajectories were given.
    """
    import mdtraj

    coordinate_paths = self.input_coordinate_paths
    trajectory_paths = self.input_trajectory_paths

    if len(coordinate_paths) != len(trajectory_paths):
        raise ValueError(
            "There should be the same number of coordinate and trajectory paths."
        )
    if len(trajectory_paths) == 0:
        raise ValueError("No trajectories were given to concatenate.")

    loaded = []
    first_coordinate_path = None
    for coordinate_path, trajectory_path in zip(coordinate_paths,
                                                trajectory_paths):
        # Keep the first truthy coordinate path seen.
        if not first_coordinate_path:
            first_coordinate_path = coordinate_path
        loaded.append(mdtraj.load_dcd(trajectory_path, coordinate_path))

    self.output_coordinate_path = first_coordinate_path

    if len(loaded) == 1:
        output_trajectory = loaded[0]
    else:
        output_trajectory = mdtraj.join(
            loaded, check_topology=False, discard_overlapping_frames=False)

    self.output_trajectory_path = path.join(directory,
                                            "output_trajectory.dcd")
    output_trajectory.save_dcd(self.output_trajectory_path)
def execute(self, directory, available_resources):
    """Concatenate the input DCD trajectories into one file in ``directory``.

    On success, sets ``self.output_coordinate_path`` (first truthy input
    coordinate path) and ``self.output_trajectory_path`` and returns
    ``self._get_output_dictionary()``. On invalid input a
    ``PropertyEstimatorException`` is *returned* (not raised).
    """
    import mdtraj

    coordinate_paths = self.input_coordinate_paths
    trajectory_paths = self.input_trajectory_paths

    if len(coordinate_paths) != len(trajectory_paths):
        return PropertyEstimatorException(
            directory=directory,
            message='There should be the same number of coordinate and trajectory paths.')

    if len(trajectory_paths) == 0:
        return PropertyEstimatorException(
            directory=directory,
            message='No trajectories were given to concatenate.')

    loaded = []
    first_coordinate_path = None
    for coordinate_path, trajectory_path in zip(coordinate_paths, trajectory_paths):
        # Keep the first truthy coordinate path seen.
        if not first_coordinate_path:
            first_coordinate_path = coordinate_path
        loaded.append(mdtraj.load_dcd(trajectory_path, coordinate_path))

    self.output_coordinate_path = first_coordinate_path

    if len(loaded) == 1:
        output_trajectory = loaded[0]
    else:
        output_trajectory = mdtraj.join(
            loaded, check_topology=False, discard_overlapping_frames=False)

    self.output_trajectory_path = path.join(directory, 'output_trajectory.dcd')
    output_trajectory.save_dcd(self.output_trajectory_path)

    return self._get_output_dictionary()
def sampling_along_tIC(resultdir, opath, tica_trajs, xtc_traj_folder,
                       traj_list_array, pdb_name, tIC_a):
    """Sample 200 conformations along one tIC, save them, and plot them.

    Parameters
    ----------
    resultdir : str
        Directory the sampled trajectory and the figure are written to.
    opath : str
        File name of the figure, relative to ``resultdir``.
    tica_trajs : sequence of arrays
        tICA-transformed trajectories; columns 0 and 1 are used for plotting.
    xtc_traj_folder : str
        Prefix prepended (by plain string concatenation) to every trajectory
        file name and to ``pdb_name``.
    traj_list_array : sequence of str
        Trajectory file names, indexed by trajectory number.
    pdb_name : str
        Topology file name.
    tIC_a : int
        1-based number of the tIC to sample along.
    """
    transformed = np.concatenate(tica_trajs)
    draw_tica_histogram_core(transformed[:, 0], transformed[:, 1], '1', '2')

    # sample_dimension is called with a dict keyed by trajectory index.
    tica_trajs = {i: tica_trajs[i] for i in range(len(tica_trajs))}
    inds = sample_dimension(tica_trajs,
                            dimension=tIC_a - 1,
                            n_frames=200,
                            scheme='random')

    # Build and save a trajectory from the sampled (trajectory, frame) pairs.
    traj = md.join(
        md.load_frame(xtc_traj_folder + traj_list_array[i],
                      index=frame_i,
                      top=xtc_traj_folder + pdb_name)
        for i, frame_i in inds)
    traj.save("%s/tica-dimension-tIC%s.xtc" % (resultdir, tIC_a - 1))

    # Show the samples on the projection onto the first two tICs.
    samples_coord = np.array([
        [tica_trajs[i][frame_i][0], tica_trajs[i][frame_i][1]]
        for i, frame_i in inds
    ])
    print(samples_coord.shape)

    # Label the line itself: plt.legend('sample') iterates the string and
    # would use 's' as the label, possibly on an artist drawn earlier.
    plt.plot(samples_coord[:, 0], samples_coord[:, 1], 'o-', label='sample')
    plt.legend()
    plt.savefig(resultdir + '/' + opath)
def _analyse_results_using_mdtraj(
    self,
    env: str,
    snapshots: list,
    unitcell: list,
    save_results: bool,
    engine: str,
):
    """Evaluate all snapshots at every lambda state and run the MBAR step.

    Parameters
    ----------
    env : str
        Environment key; used to look up ``self.traj_files[env]`` and passed
        through to the evaluation helpers.
    snapshots : list
        Coordinates handed to the openMM evaluation (unused for CHARMM).
    unitcell : list
        Unit cell data handed to the openMM evaluation (unused for CHARMM).
    save_results : bool
        If true, pickle ``{"u_kn": ..., "N_k": ...}`` into
        ``self.save_results_to_path``.
    engine : str
        ``"openMM"`` or ``"CHARMM"``.

    Returns
    -------
    The return value of ``self.calculate_dG_using_mbar(u_kn, self.N_k, env)``.

    Raises
    ------
    RuntimeError
        If ``engine`` is neither ``"openMM"`` nor ``"CHARMM"``.
    """
    logger.debug(f"Evaluating with {engine}")

    if engine == "openMM":
        lambda_states = list(range(1, self.nr_of_states + 1))
        xyz_array = snapshots

        # Decide if we want to use the multiprocessing library
        r = starmap(
            self._evaluate_e_on_all_snapshots_openMM,
            zip(repeat(xyz_array), repeat(unitcell), lambda_states,
                repeat(env)),
        )
        u_kn = np.stack(list(r))

    elif engine == "CHARMM":
        confs = []
        for (dcd, psf) in self.traj_files[env]:
            traj = mdtraj.load(
                f"{dcd}",
                top=f"{psf}",
            )
            # keep only the thinned trajectory returned by _thinning
            traj, _, _ = self._thinning(traj)
            confs.append(traj)

        # write out the merged traj in self.base_path
        joined_trajs = mdtraj.join(confs, check_topology=True)
        merged_path = f"{self.base_path}/traj.dcd"
        joined_trajs.save_dcd(merged_path)
        try:
            u_kn = np.stack([
                self._evaluate_e_on_all_snapshots_CHARMM(
                    joined_trajs, lambda_state, env)
                for lambda_state in range(1, self.nr_of_states + 1)
            ])
        finally:
            # remove merged traj, also when the evaluation fails
            os.remove(merged_path)

    else:
        raise RuntimeError(f"Either openMM or CHARMM engine, not {engine}")

    if save_results:
        file = f"{self.save_results_to_path}/mbar_data_for_{self.structure_name}_in_{env}.pickle"
        logger.info(f"Saving results: {file}")
        results = {"u_kn": u_kn, "N_k": self.N_k}
        # Context manager so the handle is flushed and closed deterministically.
        with open(file, "wb+") as fh:
            pickle.dump(results, fh)

    return self.calculate_dG_using_mbar(u_kn, self.N_k, env)
def test_rmsd_kcenters_mpi():
    """Compare MPI kcenters (4 clusters, five trajectory copies) to serial.

    The center frame indices, the first trajectory's distances and
    assignments, and the center structures must match a serial
    ``kcenters.kcenters`` run on a single copy of the trajectory.
    """
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    TRJFILE = os.path.join(data_dir, 'frame0.xtc')
    TOPFILE = os.path.join(data_dir, 'native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'

    n_copies = 5
    expected_size = (n_copies, 501)

    with tempfile.TemporaryDirectory() as tdname:
        # Replace the local directory name with the one broadcast from root=0.
        tdname = MPI.COMM_WORLD.bcast(tdname, root=0)

        for copy_i in range(n_copies):
            shutil.copy(TRJFILE, os.path.join(tdname, 'frame%s.xtc' % copy_i))

        cli_args = [
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-number', '4',
            '--atoms', SELECTION,
            '--algorithm', 'kcenters',
        ]

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            assigs, dists, center_inds, centers = runhelper(
                cli_args,
                expected_size=expected_size,
                expect_reassignment=True)

        source = md.load(TRJFILE, top=TOPFILE)
        source_sele = source.atom_slice(source.top.select(SELECTION))

        # Serial reference on a single copy of the trajectory.
        serial = kcenters.kcenters(
            source_sele, 'rmsd', n_clusters=4, mpi_mode=False)

        assert_array_equal(serial.center_indices, center_inds[:, 1])
        assert_array_equal(serial.distances, dists[0])
        assert_array_equal(serial.assignments, assigs[0])

        expected_centers = md.join([source[pair[1]] for pair in center_inds])
        assert_array_equal(expected_centers.xyz, md.join(centers).xyz)
def test_md_join():
    """Joining ``md.iterload`` chunks must reproduce a plain ``md.load``.

    Both reads use stride 2; the chunked read uses chunks of 6 frames.
    """
    # First 20 frames of the reference file serve as the topology source.
    t_ref = md.load(get_fn('frame0.h5'))[:20]

    loaded = md.load(fn, top=t_ref, stride=2)
    chunks = md.iterload(fn, top=t_ref, stride=2, chunk=6)
    iterloaded = md.join(chunks)

    for attr in ('xyz', 'time', 'unitcell_angles', 'unitcell_lengths'):
        eq(getattr(loaded, attr), getattr(iterloaded, attr))
def fit(self, traj_list, y=None):
    """Identify hydrogen bonds over all trajectories and store their indices.

    Parameters
    ----------
    traj_list : iterable of trajectories
        Joined into a single trajectory before the analysis.
    y : ignored
        Not used by this method.

    Notes
    -----
    Sets ``self.indices`` to the ``baker_hubbard`` result without its first
    column.
    """
    combined = join(traj_list)
    hbonds = baker_hubbard(
        combined,
        freq=self.freq,
        exclude_water=self.exclude_water,
        periodic=self.periodic,
        sidechain_only=self.sidechain_only,
    )
    # Drop column 0 and keep the remaining columns of every row.
    self.indices = hbonds[:, 1:]
def sampling_representative_structures_for_MSM(resultdir, msm_traj, xtc_traj_folder, traj_list_array, pdb_name):
    """Save up to 50 randomly chosen frames for every MSM state.

    Parameters
    ----------
    resultdir : str
        Directory the per-state ``.xtc`` files are written to.
    msm_traj : sequence of sequences
        State assignment of every frame of every trajectory.
    xtc_traj_folder : str
        Prefix prepended (by plain string concatenation) to every trajectory
        file name and to ``pdb_name``.
    traj_list_array : sequence of str
        Trajectory file names, indexed by trajectory number.
    pdb_name : str
        Topology file name.

    Notes
    -----
    States with fewer than 50 frames contribute all of their frames, and
    state ids without any frame are skipped; previously both cases made
    ``random.sample`` raise ``ValueError``.
    """
    n_samples = 50

    # state id -> [(traj_id, frame_id), ...] in trajectory/frame order
    frames_by_state = {}
    for traj_id, assignments in enumerate(msm_traj):
        for frame_id, state in enumerate(assignments):
            frames_by_state.setdefault(int(state), []).append(
                (traj_id, frame_id))

    for state in sorted(frames_by_state):
        candidates = frames_by_state[state]
        samples = random.sample(candidates, min(n_samples, len(candidates)))
        traj = md.join(
            md.load_frame(xtc_traj_folder + traj_list_array[traj_id],
                          index=frame_id,
                          top=xtc_traj_folder + pdb_name)
            for traj_id, frame_id in samples)
        traj.save("%s/representative_structure_in_state%d.xtc" % (resultdir, state))
def sampling_along_msm_eigenmode(resultdir, msm_eigen_trajs, assignment_list, xtc_traj_folder, traj_list_array, pdb_name, mode_num):
    """Save randomly sampled frames ordered along one MSM eigenmode.

    Parameters
    ----------
    resultdir : str
        Directory the output ``.xtc`` file is written to.
    msm_eigen_trajs : sequence of sequences
        Per-frame eigenvector values; ``msm_eigen_trajs[j][k][mode_num - 1]``
        is read for frame ``k`` of trajectory ``j``.
    assignment_list : sequence of sequences
        Per-frame state assignments; its lengths define the frames to sample.
    xtc_traj_folder : str
        Prefix prepended (by plain string concatenation) to every trajectory
        file name and to ``pdb_name``.
    traj_list_array : sequence of str
        Trajectory file names, indexed by trajectory number.
    pdb_name : str
        Topology file name.
    mode_num : int
        1-based mode number (1 is the slowest dynamic mode, i.e. the 2nd
        eigenvector of the TPM, per the original author's comment).

    Raises
    ------
    ValueError
        If there are no frames to sample from.
    """
    n_samples = 200  # can play with this number

    # Records are (traj_id, frame_id, eigenvector_value); index starts from 0.
    info_list = [
        (traj_id, frame_id, msm_eigen_trajs[traj_id][frame_id][mode_num - 1])
        for traj_id, assignments in enumerate(assignment_list)
        for frame_id in range(len(assignments))
    ]
    if not info_list:
        raise ValueError("No frames available to sample from.")

    # Never ask for more frames than exist; random.sample would raise.
    samples = random.sample(info_list, min(n_samples, len(info_list)))
    sorted_samples = sorted(samples, key=lambda record: record[2])

    # Write the sampled conformations in order of increasing eigenvector value.
    traj = md.join(
        md.load_frame(xtc_traj_folder + traj_list_array[traj_id],
                      index=frame_id,
                      top=xtc_traj_folder + pdb_name)
        for traj_id, frame_id, _ in sorted_samples)
    traj.save("%s/msm-%s-dynamic-mode.xtc" % (resultdir, mode_num))
def load_pdb(self, stride=4):
    """Load ``self.traj_fn`` in chunks and keep only the non-water atoms.

    Parameters
    ----------
    stride : int
        Stride passed to ``mt.iterload``.

    Returns
    -------
    The water-stripped trajectory, also stored as ``self.traj``;
    ``self.n_frames`` is set to its number of frames.

    Raises
    ------
    ValueError
        If the file yields no frames.
    """
    # The keyword is ``chunk``; the original spelled it ``chunck``.
    chunks = list(mt.iterload(self.traj_fn, stride=stride, chunk=100))
    if not chunks:
        raise ValueError("No frames could be read from %s" % self.traj_fn)

    # Join once instead of re-joining the growing trajectory per chunk.
    traj = chunks[0] if len(chunks) == 1 else mt.join(chunks)

    self.traj = traj.atom_slice(traj.topology.select("not water"))
    self.n_frames = traj.n_frames
    return self.traj
def itertrajs(meta, stride=1):
    """Yield ``(index, trajectory)`` for every row of ``meta``, one at a time.

    MDTraj does striding badly: it reads in the whole trajectory and then
    performs the stride. Joining the chunks from ``iterload`` conserves
    memory.
    """
    tops = preload_tops(meta)
    for row_index, row in meta.iterrows():
        chunks = md.iterload(row['traj_fn'],
                             top=tops[row['top_fn']],
                             stride=stride)
        trajectory = md.join(chunks,
                             discard_overlapping_frames=False,
                             check_topology=False)
        yield row_index, trajectory
def sample_ev(n_ev: int,
              n_cut: int,
              ptrajs_df,
              top_path: str,
              threshold: float = 1e-6) -> md.Trajectory:
    """Pick one frame per quantile bin of a column, preferring low 'mixing'.

    The column named ``str(n_ev)`` is cut into ``n_cut`` quantile bins
    (duplicate edges dropped). Within each bin only rows whose ``'mixing'``
    is within ``threshold`` of the bin minimum are kept, one of them is drawn
    at random, and the drawn frames are loaded in bin order and joined.

    The index entries of ``ptrajs_df`` are unpacked as
    ``(trajectory file, frame index)`` pairs.
    """
    ev_col = str(n_ev)

    df = ptrajs_df.loc[:, [ev_col, 'mixing']].copy(deep=True)
    df['cat'] = pd.qcut(df[ev_col], q=n_cut, duplicates='drop')
    df['min'] = df.groupby('cat')['mixing'].transform('min')

    # Keep only the rows that are (numerically) at their bin's minimum.
    at_bin_minimum = np.abs(df['mixing'] - df['min']) < threshold
    df = df.loc[at_bin_minimum, :]

    sample = df.groupby('cat').sample(n=1).sort_values(by='cat')

    frames = [
        md.load_frame(traj_fn, top=top_path, index=frame_i)
        for traj_fn, frame_i in list(sample.index)
    ]
    return md.join(frames)
def main():
    """Write one DCD per replica index from replica-exchange output.

    Reads the exchange history for the given prefixes, keeps the history
    columns that correspond to saved DCD frames, loads every
    ``replica.<k>/<prefix>.dcd``, and writes ``<output>.<k>.dcd`` for each row
    of the history. Prints the help text and returns when called without
    command-line arguments.
    """
    parser = argparse.ArgumentParser(prog='rex.track_traj')
    parser.add_argument('-t', '--top', dest='top_fn', required=True)
    parser.add_argument('-p', '--prefix', dest='prefix_s', required=True,
                        nargs='+')
    parser.add_argument('-o', '--output', dest='output_prefix',
                        default='traj')
    parser.add_argument('--dcd_step', dest='dcd_step', default=25000)
    parser.add_argument('--exchange_rate', dest='exchange_rate', default=5000)

    if len(sys.argv) == 1:
        parser.print_help()
        return
    args = parser.parse_args()

    # Number of history entries per saved DCD frame.
    history_step = float(args.dcd_step) / float(args.exchange_rate)

    history = read_history(args.prefix_s)
    n_dcd_frames = int(history.shape[1] / history_step)
    dcd_frames = history_step * (1 + np.arange(n_dcd_frames))
    dcd_frames = np.ceil(dcd_frames).astype(int) - 1
    history = history[:, dcd_frames]
    top = mdtraj.load(args.top_fn)

    n_replica, n_frame = history.shape[0], history.shape[1]

    # Load every replica's segments and concatenate them replica by replica.
    segments = [
        mdtraj.load('replica.%d/%s.dcd' % (k, prefix), top=top)
        for k in range(n_replica)
        for prefix in args.prefix_s
    ]
    replica = mdtraj.join(segments, check_topology=False)

    for k in range(n_replica):
        # Frame i of output k is taken from block history[k][i] of the
        # concatenated trajectory.
        frame = np.array(
            [n_frame * j + i for i, j in enumerate(history[k])])
        traj = replica.slice(frame)
        traj.save("%s.%d.dcd" % (args.output_prefix, k))
def main():
    """Split one or more trajectories into equally sized DCD blocks.

    Command line: ``<topology> <trajectory> [<trajectory> ...]``. The output
    prefix, number of blocks, frames per block, and number of initial
    (equilibration) frames to discard are read interactively. When several
    trajectories are given they are joined in order, with topology checking
    and with overlapping frames discarded.
    """
    # Validate the arguments before prompting, so a bad invocation does not
    # fail only after the user has answered every question.
    if len(sys.argv) < 3:
        sys.exit('usage: {} topology trajectory [trajectory ...]'.format(
            sys.argv[0] if sys.argv else 'script'))
    topfile = sys.argv[1]
    trajfiles = sys.argv[2:]  # number of dcd files can vary

    # settings for env variables.
    outstr = input(
        "output dcd file prefix? ex) DES_npt_b (becomes DES_npt_b${i}.dcd) \n")
    nb = int(input("how many blocks? ex) 5 \n"))
    bstep = int(input("frames per block? ex) 2000 \n"))
    neq = int(
        input(
            "How many initial frames do you want to cut as equilibration? ex) 500 \n"
        ))

    # input 1 : load traj. (big file); merge when several were given.
    trajs = [md.load(fn, top=topfile) for fn in trajfiles]
    if len(trajs) == 1:
        trajtotal = trajs[0]
    else:
        labels = ', '.join(
            'nframe{}'.format(i) for i in range(1, len(trajs) + 1))
        counts = ' '.join(str(t.n_frames) for t in trajs)
        print('{} : {}'.format(labels, counts))
        trajtotal = md.join(trajs,
                            check_topology=True,
                            discard_overlapping_frames=True)
    nframetotal = trajtotal.n_frames
    print('nframetotal : {}'.format(nframetotal))

    # traj splitting: block i covers frames [neq + i*bstep, neq + (i+1)*bstep)
    for i in range(nb):
        outfile = outstr + str(i) + '.dcd'
        iframe, fframe = neq + i * bstep, neq + (i + 1) * bstep
        trajfrag = trajtotal[iframe:fframe]
        trajfrag.save_dcd(outfile)
def test_iterload_chunk_dcd(get_fn):
    """Chunked, strided iterload of a DCD file must reproduce a strided load.

    Reads the file once with ``md.load`` and once in 2-frame chunks with
    ``md.iterload`` (both with stride 3) and compares frame count and
    coordinates of the joined chunks with the full load.
    """
    traj_fn = get_fn("alanine-dipeptide-explicit.dcd")
    top = get_fn("alanine-dipeptide-explicit.pdb")

    skip_frames = 3
    frames_chunk = 2

    full = md.load(traj_fn, top=top, stride=skip_frames)

    chunks = list(md.iterload(traj_fn,
                              top=top,
                              stride=skip_frames,
                              chunk=frames_chunk))
    joined = md.join(chunks)

    assert len(full) == len(joined)
    assert eq(full.xyz, joined.xyz)
def sample_clusters(meta, trajs, df):
    """Write 1000 randomly sampled frames for every value of df['Trajectory'].

    Parameters
    ----------
    meta : metadata object
        Indexed with ``meta.loc[key]``; rows provide ``'traj_fn'`` and
        ``'top_fn'``.
    trajs : unused
        Kept for interface compatibility.
    df : pandas.DataFrame
        Must contain the columns ``'Trajectory'``, ``'Key'`` and ``'Frame'``.

    Notes
    -----
    ``DataFrame.sample(1000)`` raises if a group has fewer than 1000 rows.
    The output path is hard-coded.
    """
    clust_id = df['Trajectory'].unique()

    for i in clust_id:
        print(i)
        # .loc replaces the .ix indexer, which was removed in pandas 1.0;
        # with a boolean mask and column labels the two are equivalent.
        df_smp = df.loc[df['Trajectory'] == i, ['Key', 'Frame']].sample(1000)
        inds = zip(df_smp['Key'], df_smp['Frame'])

        # Use loc because sample_dimension is nice
        traj = md.join(
            md.load_frame(meta.loc[traj_i]['traj_fn'],
                          index=int(frame_i),
                          top=meta.loc[traj_i]['top_fn'])
            for traj_i, frame_i in inds)

        # Save
        traj_fn = "/Users/robert_arbon/Code/AADH/Analysis/KR_Comparison/pcca_cluster-{}.dcd".format(
            i)
        backup(traj_fn)
        traj.save(traj_fn)
def make_traj(prefix, index):
    """Join the structures listed in ``<prefix>-listfiles`` into one trajectory.

    The joined trajectory is written to ``<prefix>-traj.mdcrd``, re-read from
    that file, used to compute the dihedrals given by ``index``, and finally
    written to ``<prefix>-traj.nc``. All files are loaded with the topology
    ``input.prmtop`` from the current directory.

    Parameters
    ----------
    prefix : str
        Path prefix of the list file and of the output files.
    index : array-like
        Atom indices passed to ``mdtraj.compute_dihedrals``.

    Returns
    -------
    ``False`` if the list file names no structures, otherwise the tuple
    ``(n_frames, dihedrals)``.
    """
    # Close the list file deterministically; strip any trailing whitespace
    # (the original only handled lines ending in ' \n') and skip blank lines.
    with open(prefix + '-listfiles') as list_pdb:
        filenames = [line.rstrip() for line in list_pdb]
    filenames = [name for name in filenames if name]

    if not filenames:
        return False

    # Load everything, then join once instead of re-joining per file.
    parts = [mdtraj.load(name, top='input.prmtop') for name in filenames]
    if len(parts) == 1:
        t = parts[0]
    else:
        t = mdtraj.join(parts,
                        check_topology=True,
                        discard_overlapping_frames=False)

    # Round-trip through the mdcrd file, as in the original implementation.
    t.save_mdcrd(prefix + '-traj.mdcrd')
    t = mdtraj.load(prefix + '-traj.mdcrd', top='input.prmtop')

    dih = mdtraj.compute_dihedrals(t, index)
    t.save_netcdf(prefix + '-traj.nc')
    return t.n_frames, dih
def sample_clusters():
    """Write 100 randomly sampled, superposed frames for every cluster.

    Relies on the module-level names ``df`` (a DataFrame with the columns
    ``'Trajectory'``, ``'Key'`` and ``'Time_ps'``) and ``num_clusters``.
    Frames are superposed onto ``topology.pdb`` and written to
    ``clusters/rmsd_cluster-<i>.dcd``.

    Notes
    -----
    ``DataFrame.sample(100)`` raises if a cluster has fewer than 100 rows.
    """
    meta = load_meta()
    # NOTE(review): ``tops`` is not used below; the call is kept as in the
    # original in case preload_tops is relied on for its effects - confirm.
    tops = preload_tops(meta)
    print('Sampling trajectories')
    ref = md.load('topology.pdb')

    for i in range(int(num_clusters)):
        print(i)
        # .loc replaces the .ix indexer, which was removed in pandas 1.0;
        # with a boolean mask and column labels the two are equivalent.
        df_smp = df.loc[df['Trajectory'] == i, ['Key', 'Time_ps']].sample(100)
        inds = zip(df_smp['Key'], df_smp['Time_ps'])

        # Use loc because sample_dimension is nice
        traj = md.join(
            md.load_frame(meta.loc[traj_i]['traj_fn'],
                          index=frame_i,
                          top=meta.loc[traj_i]['top_fn'])
            for traj_i, frame_i in inds
        )

        # Original trajectories include both BT1 and BT2 so need to superpose
        traj.superpose(reference=ref)

        # Save
        traj_fn = "clusters/rmsd_cluster-{}.dcd".format(i)
        backup(traj_fn)
        traj.save(traj_fn)
def sample_tica_dim(dim=0, n_frames=200, meta=None, ttrajs=None):
    """Sample frames along one tICA dimension and save them as a trajectory.

    Parameters
    ----------
    dim : int
        0-based tICA dimension to sample along; output names use ``dim + 1``.
    n_frames : int
        Number of frames requested from ``sample_dimension``.
    meta : metadata object
        Indexed with ``meta.loc[traj_i]``; rows provide ``'traj_fn'``.
    ttrajs : mapping
        tICA-transformed trajectories, indexed as ``ttrajs[traj_i][frame_i]``.

    Raises
    ------
    ValueError
        If ``meta`` or ``ttrajs`` is not given.
    """
    if meta is None or ttrajs is None:
        raise ValueError('Specify meta data and trajectory objects')

    ## Sample
    # These are apparently ordered according tica value
    inds = sample_dimension(ttrajs,
                            dimension=dim,
                            n_frames=n_frames,
                            scheme='random')
    save_generic(inds, "tica-dimension-{}-inds.pickl".format(dim + 1))

    ## Get tica components, rescaled to the range [0, 10]
    tica_values = np.array(
        [ttrajs[traj_i][frame_i][dim] for traj_i, frame_i in inds])
    lowest = tica_values.min()
    highest = tica_values.max()
    tica_values = (tica_values - lowest) / (highest - lowest)
    tica_values *= 10

    ## Make trajectory
    top = preload_top(meta)

    # Use loc because sample_dimension is nice
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in inds)

    ## Save
    traj_fn = "tica-dimension-{}.dcd".format(dim + 1)
    backup(traj_fn)
    traj.save(traj_fn)
## Try to limit RAM usage
def guestimate_stride():
    """Return a stride that leaves roughly 10 frames per cluster for fitting.

    Uses the module-level ``meta`` (its ``'nframes'`` column) and ``kmed``;
    the stride is never smaller than 1.
    """
    n_clusters = kmed.n_clusters
    total_data = meta['nframes'].sum()
    stride = max(1, total_data // (n_clusters * 10))
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          n_clusters, "clusters")
    return stride


## Fit
kmed.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kmed.summarize())

## Save
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load a single frame of the ``traj_i``-th trajectory listed in ``meta``."""
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    meta_row = meta.iloc[traj_i]
    return md.load_frame(meta_row['traj_fn'], frame_i, top=meta_row['top_fn'])


centroids = md.join((frame(ti, fi) for ti, fi in kmed.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids.xtc'
backup(centroids_fn)
centroids.save(centroids_fn)
def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
    '''Compute tICA coordinates, cluster them, and write plots and center PDBs.

    Steps, in order:

    1. Load the feature data (from ``feats``, from a hard-coded stride file,
       or from ``self.data``) and build one feature array per run/clone.
    2. Fit tICA (lag time 10, 8 components) and save the coordinates under
       ``self.tICA_dir``.
    3. Cluster the tICA coordinates into 100 clusters; save the assignment
       sequences, the cluster centers and the normalized populations
       (``cluster_centers/populations.dat``).
    4. Save a hexbin plot with cluster centers for every pair of components.
    5. Remove old ``.pdb`` files from ``cluster_centers`` and write one PDB
       per cluster-center frame.

    Parameters
    ----------
    cluster_method : str
        NOTE(review): this argument is overwritten with ``'kcenters'`` at the
        top of the body, so the value passed by the caller has no effect.
    calculate_strides : bool
        If true (and ``feats`` is None), call
        ``self.calculate_stride_distances`` and load features from a
        hard-coded path; strided/equilibration trimming is then skipped.
    feats : str or None
        Path given to ``np.load``; when None the features come from the
        sources above.

    NOTE(review): the source indentation of this function was lost; the
    nesting below was reconstructed from the syntax. In particular the
    placement of ``y += 1`` / ``z = 0`` and of the final ``for j`` loop is an
    assumption - confirm against the original file.
    '''
    # tICA parameters
    tica_lagtime = 10 # determine from implied timescales
    tica_components = 8 # how many tICs to compute
    n_clusters = 100 # denotes number of microstates
    n_timescales = tica_components # plot all eigenvalues --> timescales
    md_time_step = 0.02 # ns
    subsampled_time_step = 1. # ns multiplier of timescales and lagtimes in implied timescale plot
    stride = int(subsampled_time_step / md_time_step) #time step stride for sub-sampling
    equil_time = 1. # ns
    equil_steps = 1 #int(equil_time / md_time_step) time steps to be removed from start
    lagtimes = np.array([1,2,4,8,16,32,64,128,256,512,1024])
    # NOTE(review): overrides the ``cluster_method`` parameter.
    cluster_method = 'kcenters' # 'kcenters/kmeans'
    all_ticas = list(itertools.permutations(range(1,tica_components+1), 2)) # all combinations
    all_ticas = [[1,2]] # override: just show analysis for first two components
    cluster_percentage_cutoff = 5 # clusters with a relative population less than this
                                  # number will not be labeled on plot i.e. 0 : all clusters labeled
    verbose = False

    print("\nCalculating tICA components...")

    # Load in feature files THIS WILL NEED TO BE CHANGED
    if feats == None:
        if calculate_strides:
            self.calculate_stride_distances(stride, equil_steps)
            data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
        else:
            data = self.data
    else:
        data = np.load(feats)

    # Concatenate the usable gens of every clone into one feature array.
    features = []
    for run in data:
        for clone in run:
            gen_seq = []
            for gen in clone:
                if gen is not None and gen[0] is not None:
                    # Data loaded from file is used as is; self.data is strided here.
                    if calculate_strides or feats is not None:
                        gen_seq.append(gen)
                    else:
                        gen_seq.append(gen[::stride])
            if len(gen_seq) > 0:
                gen_cat = np.concatenate(gen_seq)
                if calculate_strides:
                    features.append(gen_cat)
                else:
                    # Drop the first equil_steps entries.
                    features.append(gen_cat[equil_steps:])
    features = np.asarray(features)
    print(features.shape)
    print(features[0].shape)

    tica_coordinates = tICA(lag_time=tica_lagtime, n_components=int(tica_components)).fit_transform(features)
    np.save('%s/lag_%d_coord_%d.npy' %(self.tICA_dir, tica_lagtime, tica_components), tica_coordinates)

    # Initiate and populate an array for each component
    # NOTE(review): the lists tica_1..tica_N exist only through exec/eval on
    # the function's locals; this depends on interpreter behaviour for
    # locals() inside functions (changed by PEP 667 in Python 3.13) - verify.
    for i in range(tica_components):
        exec('tica_' + str(i+1) + ' = []')
    for i in tqdm.tqdm(range(len(features))):
        for j in range(len(tica_coordinates[i])):
            for k in range(tica_components):
                exec('tica_' + str(k+1) + '.append(tica_coordinates[i][j][k])')

    # Perform clustering based on the cluster_method parameter.
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    # Determine cluster assignment for each frame.
    sequences = clusters.fit_transform(tica_coordinates)
    np.save('%s/lag_%d_clusters_%d_sequences.npy' %(self.tICA_dir, tica_lagtime, n_clusters), sequences)
    np.save('%s/lag_%d_clusters_%d_center.npy' %(self.tICA_dir, tica_lagtime, n_clusters), clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    print("\nDetermining cluster populations...")
    if not os.path.exists('%s/cluster_centers' % self.tICA_dir):
        os.makedirs('%s/cluster_centers' % self.tICA_dir)
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)])
    normalized_counts = counts/float(counts.sum())
    percentages = [ i*100 for i in normalized_counts ]
    population_labels = [ [i,"%.2f"%percentages[i]] for i in range(len(percentages)) if percentages[i] > cluster_percentage_cutoff ]
    np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

    # Plot all unique combinations of tICA components
    print("\nPlotting tICA components with cluster centers...")
    # NOTE(review): this replaces the [[1,2]] override set above.
    all_ticas = list(itertools.permutations(range(1,tica_components+1), 2))
    for j in tqdm.tqdm(range(len(all_ticas))): # For each pair
        # Permutations contain both orders; plot each unordered pair once.
        if all_ticas[j][0] < all_ticas[j][1]:
            plt.figure(j, figsize=(20,16))
            plt.hexbin(eval("tica_"+str(all_ticas[j][0])), eval("tica_"+str(all_ticas[j][1])), bins='log')
            x_centers = [clusters.cluster_centers_[i][all_ticas[j][0]-1] for i in range(len(clusters.cluster_centers_))]
            y_centers = [clusters.cluster_centers_[i][all_ticas[j][1]-1] for i in range(len(clusters.cluster_centers_))]
            high_pop_x_centers = [ x_centers[i] for i in range(len(x_centers)) if percentages[i] > cluster_percentage_cutoff ]
            high_pop_y_centers = [ y_centers[i] for i in range(len(y_centers)) if percentages[i] > cluster_percentage_cutoff ]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            # Mark the very first sampled point with a star.
            plt.plot(eval("tica_"+str(all_ticas[j][0])+'[0]'), eval("tica_"+str(all_ticas[j][1])+'[0]'), color='k', marker='*',markersize=24)
            plt.xlabel('tic'+str(all_ticas[j][0]))
            plt.ylabel('tic'+str(all_ticas[j][1]))
            plt.title(self.proj_num)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                plt.annotate(
                    label,
                    xy = (x, y), xytext = (-15, 15),
                    textcoords = 'offset points', ha = 'right', va = 'bottom',
                    bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
                    arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
            plt.savefig('%s/tica_' % (self.tICA_dir) +str(all_ticas[j][0])+'_'+str(all_ticas[j][1])+'.png')
            plt.close()

    ###########################################################################
    # Remove PDBs left over from a previous run.
    for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
        if filename.endswith('.pdb'):
            os.remove(self.tICA_dir + '/cluster_centers/' + filename)

    # Write out PDBs for each cluster center
    print("Performing cluster analytics and saving center PDBs...\n")
    runs, clones, gens = data.shape[0], data.shape[1], data.shape[2]
    # x, y, z are used as the RUN, CLONE and results numbers in the path below.
    x, y, z = 0, 0, 0
    for i in range(len(features)):
        if i % clones == 0 and i != 0:
            x += 1
        # NOTE(review): y is reset on multiples of ``gens`` while x advances
        # on multiples of ``clones`` - confirm this is intended.
        if i % gens == 0:
            y = 0
        n_snapshots = len(clusters.distances_[i])

        # Determine frames that are cluster centers
        cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ]
        # Determine number of each cluster, correlates to populations.dat
        cluster_labels = sequences[i][cluster_indices]
        # Save each cluster center as a pdb
        if list(cluster_indices):
            # load center-containing xtcs to check length
            traj_cat = []
            print('x: %d, y: %d, z: %d' % (x, y, z))
            # Load results0, results1, ... until a load fails.
            # NOTE(review): ``base_dir`` is not defined in this function, and
            # the bare ``except`` ends the loop on any error - confirm.
            while True:
                try:
                    traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (self.proj_num, x, y, z)
                    traj_cat.append(md.load(traj, top=self.gro_file))
                    z += 1
                except:
                    break
            if len(traj_cat) > 0:
                trajectory_file = md.join(traj_cat)
                xtc_len = len(trajectory_file)
        # NOTE(review): nesting of the next two statements and of the loop
        # below was reconstructed (see docstring) - confirm.
        y += 1
        z = 0
        for j in range(len(cluster_indices)):
            frames = range(xtc_len)
            # map the strided frame number back to xtc frame number
            strided_frames = frames[equil_steps:][::stride]
            xtc_frame = frames.index(strided_frames[cluster_indices[j]])
            cluster_traj = trajectory_file[xtc_frame]
            cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb'%(self.tICA_dir, cluster_labels[j],percentages[cluster_labels[j]]))
            if verbose:
                print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)'%(cluster_labels[j],percentages[cluster_labels[j]]))
                print('traj_file: %s (%d/%d)'%(trajectory_file,i,len(features)))
                print('frame: %d (%d/%d centers from this trajectory)'%(cluster_indices[j],j,len(cluster_indices)))
                print('strided: npy_frame/npy_len = %d/%d = %f'%(cluster_indices[j],n_snapshots,cluster_indices[j]/n_snapshots))
                print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'%(xtc_frame,xtc_len,xtc_frame/xtc_len))
please cite msmbuilder in any publications
"""

import mdtraj as md
import os

from msmbuilder.io.sampling import sample_states
from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic

## Load
# Metadata plus the trajectories stored under 'ttrajs', and the saved
# clustering model.
meta, ttrajs = load_trajs('ttrajs')
kmeans = load_generic("kmeans.pickl")

## Sample
# One list of (trajectory, frame) pairs per cluster center, with k=10.
# NOTE(review): presumably these are the 10 frames nearest each center -
# confirm against the sample_states documentation.
inds = sample_states(ttrajs,
                     kmeans.cluster_centers_,
                     k=10)

save_generic(inds, "cluster-sample-inds.pickl")

## Make trajectories
top = preload_top(meta)
out_folder = "cluster_samples"
# backup() is called before the folder is created.
# NOTE(review): presumably it moves an existing folder out of the way - confirm.
backup(out_folder)
os.mkdir(out_folder)

# Write one .xtc per state, named by the state's position in ``inds``.
for state_i, state_inds in enumerate(inds):
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in state_inds
    )
    traj.save("{}/{}.xtc".format(out_folder, state_i))
def _read_traj_list(list_file):
    """Return the trajectory paths listed in *list_file*.

    One path per line. Trailing whitespace is stripped; blank lines and
    lines starting with '#' are skipped.
    """
    with open(list_file, 'r') as fi:
        stripped = (line.rstrip() for line in fi if not line.startswith('#'))
        return [line for line in stripped if line]


def main():
    """Run the pipeline: reference -> trajectory -> compare -> classify.

    Steps, in order:
      1. Resolve the library directory and check the reference PDB exists.
      2. Get reference descriptors, from the reference pickle if it exists,
         otherwise from the reference PDB.
      3. Get trajectory descriptors, from ``args.pkl`` if it exists,
         otherwise by loading (and optionally superposing) the trajectory.
      4. Compare trajectory descriptors to the reference and write
         ``<outpref>.csv``.
      5. Classify the frames with the selected SK model.

    Exits via ``sys.exit`` when the reference structure is missing or the
    trajectory list file contains no usable entry.
    """
    Vars = KinfoVariables()
    lib_dir = Vars['lib_dir']
    print(
        '\033[34m## need to change the hard-coded library/repository directory ##\033[0m'
    )
    print('\033[34m current lib_dir:\033[0m ' + lib_dir + '\n')

    args = UserInput()
    # Fall back to the default model when the requested one is unknown.
    # NOTE(review): the original comment calls the default "RandomForest",
    # but the fallback key is 'et' -- confirm which model 'et' selects.
    if args.use_sk not in sk_ml:
        args.use_sk = 'et'

    ### These are hard-coded test cases with known key residue positions.
    ### This was used for examining the code by skipping some steps
    # wrk_dir = '.'
    # args.tmpl_file = wrk_dir+'examples/strada_cido.prot.1atp.pdb'
    # args.traj_file = wrk_dir+'examples/strada_cidi.2.200ps.dcd'
    # args.outpref = 'test'
    # args.b3k = 39
    # args.dfg = 152
    # args.c_glu = 57

    ## user-defined library path
    if args.lib_dir:
        lib_dir = args.lib_dir

    ## reference structure must start at resid 1. Modified ref is hardcoded here
    if not os.path.isfile('{0}/{1}'.format(lib_dir, Vars['ref_pdb'])):
        sys.exit(
            '\n \033[31mFATAL: Reference structure \033[0m"{0}"\033[31m not found in database\033[0m'
            .format(Vars['ref_pdb']))
    ref_file = lib_dir + '/' + Vars['ref_pdb']
    ref_pkl = lib_dir + '/' + Vars['ref_pkl']
    ref_dfg = Vars['ref_dfg']
    ref_b3k = Vars['ref_b3k']
    ref_c_glu = Vars['ref_c_glu']

    ######################
    ## get reference PDB structure 1ATP.pdb coordinates dataframe
    print('\033[34m# Reading in reference file:\033[0m ' + ref_file)
    if not ref_pkl or not os.path.isfile(ref_pkl):
        ref = md.load_pdb(ref_file)
        ref_cd = ExtractCoords(dfg=ref_dfg,
                               b3k=ref_b3k,
                               c_glu=ref_c_glu,
                               pkl=ref_pkl)
        ref_df = CalculateMetrics(ref_cd(ref))
    ## skip calculation if data is already stored in pickle
    else:
        print(
            ' \033[34m## INFO: Read structural residue coords from:\033[0m {0}\n'
            .format(ref_pkl))
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point this at pickles you produced yourself.
        with bz2.open(ref_pkl, 'rb') as fi:
            ref = pickle.load(fi)
        ref_df = CalculateMetrics(ref)

    ######################
    ## load trajectory file(s) with MDtraj, can be multiple traj files at once
    traj = []
    print('\033[34m# Reading in trajectory file(s)...\033[0m')
    start = time.perf_counter()
    if not args.pkl or not os.path.isfile(args.pkl):
        start2 = time.perf_counter()
        TrjIn = ReadTraj(top=args.tmpl_file)
        if re.search(r'dcd$|nc$|crd$|xtc$', args.traj_file):
            # A single trajectory file, recognised by its extension.
            traj = TrjIn(args.traj_file)
        else:
            # Otherwise the argument is a text file listing trajectories.
            traj_list = _read_traj_list(args.traj_file)
            if not traj_list:
                sys.exit(
                    '\n \033[31mFATAL: No trajectory listed in \033[0m"{0}"'
                    .format(args.traj_file))
            # md.join drains the imap iterator before the pool is torn
            # down; the context manager also cleans up if loading raises.
            with multiprocessing.Pool() as pool:
                traj = md.join(pool.imap(TrjIn, traj_list, 2))
        end2 = time.perf_counter()
        print(
            ' ## Time to load trajectory: \033[31m{0:.1f}\033[0m ms for \033[34m{1}\033[0m frames\n'
            .format((end2 - start2) * 1000, len(traj)))

        ## superpose all frames to template structure pre-superposed to ref 1ATP.pdb
        if args.superp:
            print(
                '\033[34m# Applying superposition to trajectory with:\033[0m '
                + args.superp)
            tmpl = md.load_pdb(args.tmpl_file)
            sele = tmpl.topology.select(args.superp)
            traj = traj.superpose(tmpl, atom_indices=sele, parallel=True)

        ## get trajectory coordinates dataframe
        print(
            '\033[34m# Extracting structural matrics from trajectory...\033[0m'
        )
        # Restart the timer so the total below covers extraction only.
        start = time.perf_counter()
        trj_cd = ExtractCoords(dfg=args.dfg,
                               b3k=args.b3k,
                               c_glu=args.c_glu,
                               pkl=args.pkl)
        trj_df = CalculateMetrics(trj_cd(traj))
    ## skip calculation if data is already stored in pickle
    else:
        print(
            ' \033[34m## INFO: Read structural residue coords from:\033[0m {0}\n'
            .format(args.pkl))
        # NOTE(review): same pickle caveat as for the reference pickle.
        with bz2.open(args.pkl, 'rb') as fi:
            trj_df = CalculateMetrics(pickle.load(fi))
    end = time.perf_counter()
    print('## Total time to get traj descriptors: {0:.1f} ms for {1} frames'.
          format((end - start) * 1000, len(trj_df)))
    del traj  # save memory
    print('\n#########################################\n')

    ######################
    ######################
    ## calculate structural metrics from coordinates, then print out raw output
    print(
        '\033[34m# Calculating structural matrics from coordinates...\033[0m')
    start = time.perf_counter()
    mat_df = CompareMetrics(trj_df, ref_df)
    mat_df.to_csv(args.outpref + '.csv', sep=',')
    end = time.perf_counter()
    print(
        '## Total time to compare descriptors: \033[31m{0:.1f}\033[0m ms for \033[34m{1}\033[0m frames'
        .format((end - start) * 1000, len(mat_df)))
    print('\n#########################################\n')

    #####################
    ## use Kinformation Random Forest Classifier to assign conformation/confidence
    start = time.perf_counter()
    KinfoClassify(mat_df, lib_dir, args.outpref, args.use_sk)
    end = time.perf_counter()
    print(
        '\n## Total time to SK \033[31m{0}\033[0m Classification: \033[31m{1:.3f}\033[0m ms for \033[34m{2}\033[0m frames'
        .format(args.use_sk, (end - start) * 1000, len(mat_df)))
    print('\n#########################################\n')
# Number of rows/frames the combined outputs should contain. The restarted
# run only contributes what the original run is missing to reach this count.
N_TARGET_FRAMES = 1000

## JOIN CSVs
print("\tJoining CSVs")
df_orig = pd.read_csv(original_data_file)
last_time = df_orig.iloc[-1]['Time (ps)']
df_re = pd.read_csv(restarted_data_file)
# Shift the restarted run's times so they continue from the last original time.
df_re['Time (ps)'] = df_re['Time (ps)'] + last_time
n_orig_rows = df_orig.shape[0]
# Clamp at 0: a negative stop would turn "take the first k rows" into
# "drop the last k rows" when the original already has enough rows.
n_rows_needed = max(0, N_TARGET_FRAMES - n_orig_rows)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_orig = pd.concat([df_orig, df_re.iloc[:n_rows_needed]])
df_orig.to_csv(file_prefix + file + '.50ns.combined.csv', index=False)

### JOIN TRAJECTORIES
print("\tLoading topology PDB")
# Load topology PDB
original_ref = md.load_pdb(original_ref_file)
print("\tRemoving solvent from topology PDB")
original_ref_solute = original_ref.remove_solvent()

# Load original trajectory
print("\tLoading original trajectory")
original_traj = md.load_dcd(original_traj_file, top=original_ref_solute)

# Load restarted trajectory, slicing to keep only the number of additional frames needed
print("\tLoading restarted trajectory")
restarted_traj = md.load(restarted_traj_file, top=original_ref_solute)
# Same clamp as for the CSV rows above.
n_frames_needed = max(0, N_TARGET_FRAMES - original_traj.n_frames)
restarted_traj = restarted_traj[:n_frames_needed]

# Join the trajectories
print("\tJoining trajectories and saving them")
joined_traj = md.join([original_traj, restarted_traj])
joined_traj.save(file_prefix + file + ".50ns.combined.solute.dcd")
def guestimate_stride():
    """Choose a fitting stride that leaves about 10 frames per cluster.

    Reads the module-level ``meta`` (its 'nframes' column) and ``kmed``
    (its ``n_clusters``), prints the decision, and returns a stride that
    is never below 1.
    """
    n_frames = meta['nframes'].sum()
    n_wanted = kmed.n_clusters * 10
    chosen = max(1, n_frames // n_wanted)
    print("Since we have", n_frames, "frames, we're going to stride by",
          chosen, "during fitting, because this is probably adequate for",
          kmed.n_clusters, "clusters")
    return chosen


## Fit
# Pick the stride first (this prints the message), then gather the strided
# trajectories; the keys yielded alongside them are not needed for fitting.
fit_stride = guestimate_stride()
fit_trajs = []
for _, strided_traj in itertrajs(meta, stride=fit_stride):
    fit_trajs.append(strided_traj)
kmed.fit(fit_trajs)
print(kmed.summarize())

## Save
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    """Load frame ``frame_i`` of the ``traj_i``-th row of ``meta``."""
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    meta_row = meta.iloc[traj_i]
    traj_fn = meta_row['traj_fn']
    top_fn = meta_row['top_fn']
    return md.load_frame(traj_fn, frame_i, top=top_fn)


centroid_frames = (frame(ti, fi) for ti, fi in kmed.cluster_ids_)
centroids = md.join(centroid_frames, check_topology=False)
centroids_fn = 'centroids.xtc'
# Move any existing centroid file aside before writing the new one.
backup(centroids_fn)
centroids.save(centroids_fn)
- trajs """ import mdtraj as md from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic from msmbuilder.io.sampling import sample_msm ## Load meta, ttrajs = load_trajs('ttrajs') msm = load_generic('msm.pickl') kmeans = load_generic('kmeans.pickl') ## Sample # Warning: make sure ttrajs and kmeans centers have # the same number of dimensions inds = sample_msm(ttrajs, kmeans.cluster_centers_, msm, n_steps=200, stride=1) save_generic(inds, "msm-traj-inds.pickl") ## Make trajectory top = preload_top(meta) traj = md.join( md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top) for traj_i, frame_i in inds ) ## Save traj_fn = "msm-traj.xtc" backup(traj_fn) traj.save(traj_fn)