def load_trajchunks(traj, parm, start=1, stride=1, standard_names=True, **kwargs):
    """Lazily load a trajectory file as a generator of MDTraj trajectory chunks.

    Useful for large / memory-intensive trajectory files.

    Usage: load_trajchunks(traj, parm, [start=1, stride=1, **kwargs])

    Parameters
    ----------
    traj
        Trajectory file handed to ``md.iterload``.
    parm
        Topology file handed to ``md.load_topology``.
    start : int
        1-indexed first frame; ``start - 1`` is passed to ``md.iterload``
        as ``skip``.
    stride : int
        Passed to ``md.iterload`` unchanged.
    standard_names : bool
        Forwarded to ``md.load_topology``. ``standard_names=False`` (not the
        default here, or in MDTraj) may be useful for PDB topologies,
        otherwise amide H might be renamed from the atom names provided to
        the standard PDB identifiers (e.g. 'H', 'H2', 'H3' for the terminal
        NH3 group).
    **kwargs
        Forwarded to ``md.iterload``. Standard kwargs include ``chunk`` (size
        of the trajectory chunks to load per iteration) and ``atom_indices``
        (an array of 0-indexed atoms to keep).

    Returns
    -------
    A generator object with trajectory iterations.
    """
    topology_options = {'standard_names': standard_names}
    try:
        topology = md.load_topology(parm, **topology_options)
    except TypeError:
        # MDTraj only has the standard_names kwarg for certain filetypes.
        topology = md.load_topology(parm)

    frames_to_skip = start - 1  # `start` is 1-indexed, `skip` counts frames
    return md.iterload(traj, top=topology, skip=frames_to_skip,
                       stride=stride, **kwargs)
def load_fulltraj(traj, parm, start=1, stop=None, stride=1, standard_names=True, **kwargs):
    """Load a complete MDTraj trajectory and return the requested frame window.

    Usage: load_fulltraj(traj, parm, [start=1, stop=None, stride=1, **kwargs])

    The whole file is read by ``md.load`` first and sliced afterwards, which
    may be memory intensive. See also load_trajchunks for an iterative load
    of large trajectories.

    Parameters
    ----------
    traj
        Trajectory file handed to ``md.load``.
    parm
        Topology file handed to ``md.load_topology``.
    start : int
        1-indexed first frame of the returned window.
    stop : int, optional
        End of the slice; defaults to the number of loaded frames.
    stride : int
        Step of the slice applied to the loaded trajectory.
    standard_names : bool
        Forwarded to ``md.load_topology``. ``standard_names=False`` (not the
        default here, or in MDTraj) may be useful for PDB topologies,
        otherwise amide H might be renamed from the atom names provided to
        the standard PDB identifiers (e.g. 'H', 'H2', 'H3' for the terminal
        NH3 group).
    **kwargs
        Forwarded to ``md.load``; standard kwargs include ``atom_indices``
        (an array of 0-indexed atoms to keep).

    Returns
    -------
    The sliced trajectory.
    """
    try:
        topology = md.load_topology(parm, standard_names=standard_names)
    except TypeError:
        # MDTraj only has the standard_names kwarg for certain filetypes.
        topology = md.load_topology(parm)

    everything = md.load(traj, top=topology, **kwargs)
    last = everything.n_frames if stop is None else stop
    window = slice(start - 1, last, stride)  # `start` is 1-indexed
    return everything[window]
def plot_rmsd(trajectories, topology=None, subset=None, output='rmsd.dat',
              chunksize=100, reimage=False):
    """Compute the RMSD of every frame against the first frame, save and plot it.

    Parameters
    ----------
    trajectories : list of str
        Trajectory file paths. They are processed in the order produced by
        sorting with ``sort_key_for_numeric_suffixes``.
    topology : str, optional
        Topology file; loaded with ``mdtraj.load_topology`` when given.
    subset : str, optional
        MDTraj selection string restricting the atoms used for the RMSD.
        It is resolved against the loaded topology, or against the topology
        of the first frame when no topology file was given.
    output : str
        Text file that receives one RMSD value per line.
    chunksize : int
        Number of frames requested from ``mdtraj.iterload`` per chunk.
    reimage : bool
        When true, ``image_molecules`` is applied in place to the reference
        frame and to every chunk before the RMSD is computed.
    """
    import mdtraj
    import numpy as np
    from tqdm import tqdm

    topology_given = bool(topology)
    if topology_given:
        topology = mdtraj.load_topology(topology)

    trajectories = sorted(trajectories, key=sort_key_for_numeric_suffixes)
    first_frame = mdtraj.load_frame(trajectories[0], 0, top=topology)

    if subset:
        # Without an explicit topology file the selection used to fail on
        # `None.select`; the first frame always carries a usable topology.
        selection_topology = topology if topology_given else first_frame.topology
        subset = selection_topology.select(subset)

    frame_size = first_frame.xyz[0].nbytes
    if reimage:
        first_frame.image_molecules(inplace=True)

    rmsds = []
    for trajectory in tqdm(trajectories, unit='file'):
        _, ext = os.path.splitext(trajectory)
        total, unit_scale = None, None
        if ext.lower() == '.dcd':
            # Progress-bar estimate only: frame count guessed from the file
            # size and rounded before being converted to a chunk count.
            n_frames = round(
                os.path.getsize(trajectory) / frame_size,
                -1 * len(str(chunksize)[1:]))
            total = int(n_frames / chunksize)
            unit_scale = chunksize
        itertraj = mdtraj.iterload(trajectory, top=topology, chunk=chunksize)
        tqdm_kwargs = {
            'total': total,
            'unit': 'frames',
            'unit_scale': unit_scale,
            'postfix': {'traj': trajectory},
        }
        for chunk in tqdm(itertraj, **tqdm_kwargs):
            if reimage:
                chunk.image_molecules(inplace=True)
            rmsd = mdtraj.rmsd(chunk, first_frame, atom_indices=subset) * 10.0  # nm->A
            rmsds.append(rmsd)

    rmsds = np.concatenate(rmsds)
    with open(output, 'w') as f:
        f.write('\n'.join(map(str, rmsds)))
    print('\nWrote RMSD values to', output)

    print('Plotting results...')
    plt.plot(rmsds)
    ax = plt.gca()
    # Only mention additional files when there are any; the previous
    # formatting produced "<name> and  more" for a single trajectory.
    title = '{}'.format(trajectories[0])
    n_more = len(trajectories) - 1
    if n_more > 0:
        title += ' and {} more'.format(n_more)
    ax.set_title(title)
    ax.set_xlabel('Frames')
    ax.set_ylabel('RMSD (A)')
    plt.show()
def test_0():
    """Yield checks that 'native2.xml' and 'native2.pdb' load to the same data.

    Both trajectories are centered first; the yielded callables compare the
    coordinates and the unit cell vectors with ``eq``.
    """
    pdb_topology = load_topology(get_fn('native2.pdb'), no_boxchk=True)
    from_xml = load(get_fn('native2.xml'), top=pdb_topology)
    from_pdb = load(get_fn('native2.pdb'), no_boxchk=True)

    for trajectory in (from_xml, from_pdb):
        trajectory.center_coordinates()

    yield lambda: eq(from_xml.xyz, from_pdb.xyz)
    yield lambda: eq(from_xml.unitcell_vectors, from_pdb.unitcell_vectors)
def load_partial(netcdf, prmtop, start, stop, stride=1):
    """Read part of a trajectory file as an MDTraj trajectory.

    The file is opened with ``md.open``, positioned at frame ``start`` and
    read with ``read_as_traj`` using ``n_frames = int((stop - start) / stride)``
    and the given ``stride``.

    Parameters
    ----------
    netcdf
        Trajectory file to open.
    prmtop
        Topology file loaded with ``md.load_topology``.
    start, stop : int
        Frame positions delimiting the requested window.
    stride : int
        Stride forwarded to ``read_as_traj``.
    """
    top = md.load_topology(prmtop)
    with md.open(netcdf) as handle:
        handle.seek(start)
        frame_count = int((stop - start) / stride)
        partial = handle.read_as_traj(top, n_frames=frame_count, stride=stride)
    return partial
def test_xml(get_fn):
    """Check that 'native2.xml' and 'native2.pdb' load to the same data.

    Both trajectories are centered, then coordinates and unit cell vectors
    are compared with ``eq``.
    """
    pdb_topology = load_topology(get_fn('native2.pdb'), no_boxchk=True)
    from_xml = load(get_fn('native2.xml'), top=pdb_topology)
    from_pdb = load(get_fn('native2.pdb'), no_boxchk=True)

    for trajectory in (from_xml, from_pdb):
        trajectory.center_coordinates()

    for attribute in ('xyz', 'unitcell_vectors'):
        assert eq(getattr(from_xml, attribute), getattr(from_pdb, attribute))
def get_positions(
    topology: PathLike,
    trajectory: List[str],
    *,
    mask: str = "all",
    stride: Optional[int] = None,
) -> NDArray[(Any, ...), Float]:
    """Read a molecular dynamics trajectory and retrieve the coordinates.

    Parameters
    ----------
    topology : PathLike
        Topology file
    trajectory : list of str
        Trajectory files, or a single glob pattern containing ``*``
    mask : str
        Selection criterion for coordinates; ``"all"`` keeps every atom
    stride : int, optional
        Stride forwarded to ``md.iterload``

    Returns
    -------
    NDArray
        The coordinates of all files concatenated along the frame axis

    Raises
    ------
    ValueError
        If no trajectory file is given or the glob pattern matches nothing
    """
    top = md.load_topology(topology)
    selection = top.select(mask) if mask != "all" else None

    # Materialise the file list. `glob.iglob` returns a one-shot iterator
    # which was exhausted by the loading loop, so the file-extension check
    # further down always saw an empty string for globbed input.
    if len(trajectory) == 1 and "*" in "".join(trajectory):
        filenames = glob.glob(*trajectory)
    else:
        filenames = list(trajectory)
    if not filenames:
        raise ValueError("no trajectory files to read")

    positions = np.concatenate(
        [
            frames.xyz
            for filename in filenames
            for frames in md.iterload(
                filename, top=top, atom_indices=selection, stride=stride
            )
        ],
        axis=0,
    )

    # Coordinates are rescaled from nanometers to Ångstroms unless one of
    # the listed extensions appears anywhere in the file names.
    joined_names = "".join(filenames)
    unscaled_extensions = (".gro", ".xtc", ".trj", ".tng")
    if not any(extension in joined_names for extension in unscaled_extensions):
        in_units_of(positions, "nanometer", "angstroms", inplace=True)
    return positions
def mobile(self) -> NDArray[(Any, ...), Float]:
    """Load coordinates from a trajectory file.

    Only the atoms matching "protein and name CA" in the TOPWW topology are
    read from TRJWW.

    Returns
    -------
    NDArray
        Trajectory
    """
    reference = md.load_topology(TOPWW)
    ca_only = md.load(
        TRJWW,
        top=reference,
        atom_indices=reference.select("protein and name CA"),
    )
    return ca_only.xyz
def mobile(self) -> NDArray[(Any, ...), Float]:
    """Load coordinates from a trajectory file.

    TRJWW is loaded with the TOPWW topology and then sliced down to the
    atoms matching "protein and name CA".

    Returns
    -------
    NDArray
        Trajectory
    """
    top = md.load_topology(TOPWW)
    ca_atoms = top.select("protein and name CA")
    everything = md.load(TRJWW, top=top)
    return everything.atom_slice(ca_atoms).xyz
def calculate_grid_quantities(self, energy=True, entropy=True, hbonds=True,
                              start_frame=None, num_frames=None):
    """Process trajectory frames one by one and normalize per-voxel data.

    Each frame index in ``[start_frame, start_frame + num_frames)`` is handed
    to ``self.process_chunk``; afterwards several columns of
    ``self.voxeldata`` are normalized and, if requested, the entropy
    calculation is run.

    Parameters
    ----------
    energy, hbonds, entropy : bool, optional
        Forwarded to ``self.process_chunk``. ``entropy`` additionally
        triggers ``self.calculate_entropy`` at the end.
    start_frame : int, optional
        First frame index; defaults to ``self.start_frame``.
    num_frames : int, optional
        Number of frames; defaults to ``self.num_frames``.
    """
    if start_frame is None:
        start_frame = self.start_frame
    if num_frames is None:
        num_frames = self.num_frames

    chunk_size = 1
    if (start_frame + num_frames) <= chunk_size:
        # Never let the chunk size drop to zero: it is used as a divisor.
        chunk_size = max(start_frame + num_frames, 1)
    chunk_iter = int(num_frames / chunk_size)
    chunk_counter = 0
    print_progress_bar(chunk_counter, chunk_iter)

    topology = md.load_topology(self.topology_file)
    # `range` instead of the Python-2-only `xrange`; iteration is identical.
    for frame_i in range(start_frame, start_frame + num_frames):
        chunk_counter += 1
        self.process_chunk(frame_i, chunk_size, topology, energy, hbonds, entropy)
        print_progress_bar(chunk_counter, chunk_iter)
        if chunk_counter == chunk_iter:
            break

    # Per-voxel normalization. Column 4 is used as the per-voxel divisor
    # (presumably the accumulated water count -- TODO confirm), so voxels
    # with a value below 1.0 in that column are left untouched.
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # the trailing loop divides by column 4 and is therefore kept inside the
    # `>= 1.0` guard.
    for voxel in range(self.voxeldata.shape[0]):
        if self.voxeldata[voxel, 4] >= 1.0:
            self.voxeldata[voxel, 12] = (
                self.voxeldata[voxel, 11] / self.voxeldata[voxel, 4])
            self.voxeldata[voxel, 11] /= (num_frames * self.voxel_vol)
            self.voxeldata[voxel, 14] = self.voxeldata[voxel, 13] / (
                self.voxeldata[voxel, 4] * 2.0)
            self.voxeldata[voxel, 13] /= (num_frames * self.voxel_vol * 2.0)
            if self.voxeldata[voxel, 17] > 0.0:
                self.voxeldata[voxel, 16] = self.voxeldata[voxel, 15] / (
                    self.voxeldata[voxel, 17] * 2.0)
                self.voxeldata[voxel, 15] /= (
                    num_frames * self.voxel_vol * self.voxeldata[voxel, 17] * 2.0)
            for i in range(17, 35, 2):
                self.voxeldata[voxel, i + 1] = (
                    self.voxeldata[voxel, i] / self.voxeldata[voxel, 4])
                self.voxeldata[voxel, i] /= (num_frames * self.voxel_vol)

    if entropy:
        self.calculate_entropy(num_frames=num_frames)
def test_box_load_save(write_traj_with_box, get_fn):
    """Check that box information survives a save/load cycle.

    'native2.pdb' is saved to the file provided by the ``write_traj_with_box``
    fixture and loaded back with the 'native.pdb' topology; coordinates and
    all unit cell attributes must match.
    """
    original = md.load(get_fn('native2.pdb'), no_boxchk=True)
    topology = md.load_topology(get_fn('native.pdb'), no_boxchk=True)

    # make sure that through a load/save cycle the box information is
    # preserved
    original.save(write_traj_with_box.fn)
    reloaded = md.load(write_traj_with_box.fn, top=topology)

    assert original.unitcell_vectors is not None
    assert eq(original.xyz, reloaded.xyz, decimal=3)
    for attribute in ('unitcell_vectors', 'unitcell_angles', 'unitcell_lengths'):
        assert eq(getattr(original, attribute), getattr(reloaded, attribute))
def get_average_structure(
    topology: PathLike,
    trajectory: List[str],
    *,
    mask: str = "all",
    stride: Optional[int] = None,
) -> md.Trajectory:
    """Compute the average structure of a trajectory.

    Parameters
    ----------
    topology : PathLike
        Topology file
    trajectory : list of str
        List of trajectory files, or a single glob pattern containing ``*``
    mask : str
        Atom selection; ``"all"`` keeps every atom
    stride : int, optional
        Stride forwarded to ``md.iterload``

    Returns
    -------
    Trajectory
        The last chunk that was read, with its coordinates replaced by the
        average positions and its unit cell reduced to that of its first
        frame (when unit cell information is present)

    Raises
    ------
    ValueError
        If no frame could be read from the given files
    """
    n_frames = 0
    chunk_sums = []

    # When a selection is requested the topology has to be parsed anyway;
    # reuse the parsed object so it is not re-read for every file.
    top = topology
    indices = None
    if mask != "all":
        top = md.load_topology(topology)
        indices = top.select(mask)

    filenames = (
        glob.iglob(*trajectory)
        if len(trajectory) == 1 and "*" in "".join(trajectory)
        else trajectory
    )

    frames = None
    for filename in filenames:
        for frames in md.iterload(
            filename, top=top, atom_indices=indices, stride=stride
        ):
            n_frames += frames.n_frames
            chunk_sums.append(frames.xyz.sum(axis=0))

    if frames is None or n_frames == 0:
        raise ValueError("no frames could be read from the given trajectory files")

    # `np.asfarray` was removed in NumPy 2.0; this is the equivalent call.
    # No unit conversion happens here: values stay in the units of `xyz`.
    positions = np.asarray(chunk_sums, dtype=np.float64)
    frames.xyz = positions.sum(axis=0) / n_frames

    # Trajectories without box information report None for these attributes.
    if frames.unitcell_angles is not None:
        frames.unitcell_angles = frames.unitcell_angles[0, :]
    if frames.unitcell_lengths is not None:
        frames.unitcell_lengths = frames.unitcell_lengths[0, :]
    return frames
def to_mdtraj_Topology(item, atom_indices='all', check=True):
    """Load an item with ``mdtraj.load_topology`` and extract the requested atoms.

    Parameters
    ----------
    item
        Item passed to ``mdtraj.load_topology``; validated as 'file:pdb'
        when ``check`` is true.
    atom_indices
        Atoms to keep; normalized by ``digest_atom_indices`` when ``check``
        is true and forwarded to the mdtraj_Topology ``extract`` function.
    check : bool
        Whether the inputs are digested before use.

    Returns
    -------
    The result of the mdtraj_Topology ``extract`` call.
    """
    if check:
        digest_item(item, 'file:pdb')
        atom_indices = digest_atom_indices(atom_indices)

    from mdtraj import load_topology
    from ..mdtraj_Topology import extract as extract_mdtraj_Topology

    whole_topology = load_topology(item)
    return extract_mdtraj_Topology(
        whole_topology, atom_indices=atom_indices, check=False)
def trajs_from_irrows(self, irow):
    """
    Load each trajectory in the rows of an msmbuilder.metadata object

    The topology file is parsed once; the parsed topology is used both to
    resolve ``self.atoms_to_load`` and to load the trajectory.

    :param irow: (index, row) pair coming from the pd.DataFrame.iterrows
                 method; the row provides 'traj_fn' and 'top_fn'
    :return i, traj: The traj id (starting at 0) and the mdtraj.Trajectory
                     object
    """
    i, row = irow
    logger.info('Loading {}'.format(row['traj_fn']))
    # Reuse the parsed topology instead of handing the file name to
    # mdtraj.load, which would parse the same file a second time.
    topology = mdtraj.load_topology(row['top_fn'])
    atom_ids = topology.select(self.atoms_to_load)
    logger.debug('Will load {} atoms'.format(len(atom_ids)))
    traj = mdtraj.load(row['traj_fn'], top=topology, stride=self.stride,
                       atom_indices=atom_ids)
    return i, traj
def test_box_load_save():
    """Yield checks that box information survives a save/load cycle.

    'native2.pdb' is saved to each temporary file and loaded back; the
    yielded callables compare coordinates and unit cell attributes.
    """
    t = md.load(get_fn('native2.pdb'), no_boxchk=True)

    # these four tempfile have extensions (dcd, xtc, trr, h5) that
    # should store the box information. lets make sure than through a load/save
    # cycle, the box information is preserved:
    top = md.load_topology(get_fn('native.pdb'), no_boxchk=True)
    for temp_fn in [tmpfns['xtc'], tmpfns['dcd'], tmpfns['trr'], tmpfns['h5']]:
        t.save(temp_fn)
        if temp_fn.endswith('.h5'):
            t2 = md.load(temp_fn)
        else:
            t2 = md.load(temp_fn, top=top)

        assert t.unitcell_vectors is not None
        # Bind the freshly loaded trajectory as a default argument: a plain
        # closure would look `t2` up when called, so a runner that collects
        # all yielded checks first would only ever test the last file.
        yield lambda t2=t2: eq(t.xyz, t2.xyz, decimal=3)
        yield lambda t2=t2: eq(t.unitcell_vectors, t2.unitcell_vectors)
        yield lambda t2=t2: eq(t.unitcell_angles, t2.unitcell_angles)
        yield lambda t2=t2: eq(t.unitcell_lengths, t2.unitcell_lengths)
def preload_top(meta):
    """Load one topology file into memory.

    This function checks to make sure there's only one topology file
    in play. When sampling frames, you have to have all the same
    topology to concatenate.

    Parameters
    ----------
    meta : pd.DataFrame
        The DataFrame of metadata with a column named 'top_fn'

    Returns
    -------
    top : md.Topology
        The one topology file that can be used for all trajectories.

    Raises
    ------
    ValueError
        If the metadata lists no topology at all, or more than one.
    """
    top_fns = set(meta['top_fn'])
    if not top_fns:
        # Previously reported as "More than one topology", which is wrong
        # for empty metadata.
        raise ValueError("No topology is listed in this project's metadata!")
    if len(top_fns) > 1:
        raise ValueError("More than one topology is used in this project!")
    return md.load_topology(top_fns.pop())
def __init__(self, topo_path, bins, strc_path_ref=None, pdb_path_ref=None):
    """Set up topology, solvent selection and grid bookkeeping.

    Parameters
    ----------
    topo_path
        Topology file loaded with ``md.load_topology``.
    bins
        Stored unchanged as ``self.bins``.
    strc_path_ref, pdb_path_ref : optional
        Reference structure file and the topology used to load it. Fitting
        is only enabled when BOTH are given; the first frame of the
        reference is loaded and backbone atoms (CA, N, C) are selected in
        the reference and in the main topology.
    """
    self.topo = md.load_topology(topo_path)
    self.solvent_O_idxs = self.topo.select("water and name O")
    self.bins = bins

    self.grid_resolution = np.array([1., 1., 1.])
    self.xx = np.zeros(3, dtype=float)
    self.yy = np.zeros(3, dtype=float)
    self.zz = np.zeros(3, dtype=float)
    self.center = np.zeros(3, dtype=float)
    self.f2r = np.eye(3, 3)
    self.f = None

    self.fitting = False
    self.ref = None
    self.ref_sele = None
    self.sele = None
    # Identity comparison is the correct test for "argument was given".
    if strc_path_ref is not None and pdb_path_ref is not None:
        self.ref = md.load_frame(strc_path_ref, 0, top=pdb_path_ref)
        self.ref_sele = self.ref.topology.select(
            "name CA or name N or name C")
        self.sele = self.topo.select("name CA or name N or name C")
        self.fitting = True
def preload_tops(meta):
    """Load all topology files into memory.

    Every distinct 'top_fn' value is parsed exactly once, which might save
    some performance compared to re-parsing the topology file for each
    trajectory you try to load in. Typically, you have far fewer (possibly 1)
    topologies than trajectories.

    Parameters
    ----------
    meta : pd.DataFrame
        The DataFrame of metadata with a column named 'top_fn'

    Returns
    -------
    tops : dict
        Dictionary of ``md.Topology`` objects, keyed by "top_fn"
        values.
    """
    return {
        top_fn: md.load_topology(top_fn)
        for top_fn in set(meta['top_fn'])
    }
def calculate_site_quantities(self, energy=True, entropy=True, hbonds=True, energy_lr_breakdown=False,
                              angular_structure=False, shell_radii=None, r_theta_cutoff=6.0):
    """
    Performs site-based solvation thermodynamics and structure calculations by iterating over frames in
    the trajectory. If water molecules in hydration sites are already determined (the case when clustering
    is already done), then the list of hydration site waters in each frame is used to iterate over each
    water and calculate its properties. If externally determined hydration sites are provided (when
    self.clustercenter_file is set to a pdb file of hydration sites) then for each site, corresponding
    water is found in each frame and is used for calculations.

    Frames ``self.start_frame`` .. ``self.start_frame + self.num_frames - 1`` are read one at a time
    and handed to ``self._process_frame``. If the trajectory ends early, ``self.num_frames`` is reset
    to the number of frames actually read.

    Parameters
    ----------
    energy : bool, optional
        Forwarded to ``self._process_frame``.
    hbonds : bool, optional
        Forwarded to ``self._process_frame``.
    entropy : bool, optional
        Forwarded to ``self._process_frame``; when true, ``generate_data_for_entropycalcs`` and
        ``run_entropy_scripts`` are also run after the frame loop.
    energy_lr_breakdown : bool, optional
        When true, ``shell_radii`` is prepared (see below), ``self.energy_ww_lr_breakdown`` is
        initialised with one list of zeros per row of ``self.hsa_data``, and the flag is forwarded
        to ``self._process_frame``.
    angular_structure : bool, optional
        When true, ``self.angular_st_distribution`` is initialised with one empty list per row of
        ``self.hsa_data`` and the flag is forwarded to ``self._process_frame``.
    shell_radii : list, optional
        Three radii; defaults to [3.5, 5.5, 8.5] when ``energy_lr_breakdown`` is set. The values are
        squared and a leading 0.0 is inserted before they are forwarded.
    r_theta_cutoff : float, optional
        Forwarded to ``self._process_frame``; capped at 8.0 when ``angular_structure`` is set.

    Returns
    -------
    None : NoneType
        This function updates hydration site data structures to
        store the results of calculations.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.
    print_progress_bar(0, self.num_frames)
    topology = md.load_topology(self.topology_file)
    read_num_frames = 0
    if energy_lr_breakdown:
        if shell_radii is None:
            shell_radii = [3.5, 5.5, 8.5]
        else:
            # NOTE(review): the two message parts are concatenated without a
            # separating space.
            assert len(shell_radii) == 3, "Water-water energy decomposition supported only upto 3 solvation shells." \
                                          "Please provide outer radii for three shells."
        # A new list is built here, so the caller's list is not modified.
        shell_radii = [i**2 for i in shell_radii]
        shell_radii.insert(0, 0.0)
        self.energy_ww_lr_breakdown = [[
            0.0 for s in shell_radii
        ] for i in range(self.hsa_data.shape[0])]

    if angular_structure:
        if r_theta_cutoff > 8.0:
            print(
                "Warning: r_theta_cutoff > 8.0 can take a long time."
                "Resetting angular structure distance cutoff to 8.0 Angstrom"
            )
            r_theta_cutoff = 8.0
        self.angular_st_distribution = [
            [] for i in range(self.hsa_data.shape[0])
        ]

    with md.open(self.trajectory) as f:
        for frame_i in range(self.start_frame,
                             self.start_frame + self.num_frames):
            print_progress_bar(frame_i - self.start_frame, self.num_frames)
            # Position the reader explicitly and read exactly one frame.
            f.seek(frame_i)
            trj = f.read_as_traj(topology, n_frames=1, stride=1)
            if trj.n_frames == 0:
                # An empty read is treated as the end of the trajectory.
                print("No more frames to read.")
                break
            else:
                self._process_frame(trj, frame_i, energy, hbonds, entropy,
                                    energy_lr_breakdown, angular_structure,
                                    shell_radii, r_theta_cutoff)
                read_num_frames += 1
        if read_num_frames < self.num_frames:
            print((
                "{0:d} frames found in the trajectory, resetting self.num_frames."
                .format(read_num_frames)))
            self.num_frames = read_num_frames

    if entropy:
        self.generate_data_for_entropycalcs(self.start_frame,
                                            self.num_frames)
        self.run_entropy_scripts()

    self.normalize_site_quantities(self.num_frames)
def _dist_atom_selection(topology, atom1, atom2, verbose, unpythonize):
    """
    Resolve a topology and two atom specifications into an atom pair.

    Input:

        topology: mdtraj.Topology or string
            Either mdtraj.Topology object or path to a topology file to be
            loaded

        atom1, atom2: string or integer
            A string is passed to the MDTraj atom selection language. An
            integer is reduced by one before use (i.e. it is treated as a
            1-based atom number).

        verbose: integer
            Progress messages are printed when greater than 0

        unpythonize: boolean
            If true, a label for the atom pair is built in which the digits
            found in the first six characters of each atom's string
            representation are incremented (presumably to turn 0-based
            residue numbers into 1-based ones -- confirm)

    Output:

        md_topology
            mdtraj.core.topology.Topology object of protein

        pair numpy.array
            array of shape (1, 2) holding the two atom indices

        final_resid string
            label joining the renamed atoms with " - "

    The process is terminated with sys.exit on invalid input.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.

    ## First have to load all the inputs if they are defined by a path
    if isinstance(topology, str):
        # NOTE(review): there is no branch for a path that does not exist;
        # md_topology then stays unassigned -- confirm intended handling.
        if os.path.exists(topology):
            try:
                md_topology = md.load_topology(topology)
            except:
                sys.exit('Make sure you have provided a valid path to topology file!')
            else:
                if verbose > 0:
                    print 'The following topology file was succesfully loaded: \n %s \n' %(md_topology)
    elif isinstance(topology, md.core.topology.Topology):
        md_topology = topology
        if verbose > 0:
            print 'The following topology file was succesfully loaded: \n %s \n' %(md_topology)
    else:
        sys.exit('Invalid input! Must be a valid path to topology file or mdtraj.Topology object')

    ## check chosen atoms
    if isinstance(atom1, str):
        try:
            atom_subset1 = md_topology.select(atom1)
        except:
            sys.exit('Invalid atom selection!')
        else:
            if verbose > 0:
                print 'Atom1 number: %s' %(atom_subset1[0] + 1)
    elif isinstance(atom1, int):
        # Shift the given atom number down by one to get the index.
        atom1 = atom1 - 1
        try:
            atom_subset1 = []
            # An out-of-range index leaves the list empty.
            if atom1 < md_topology.n_atoms:
                atom_subset1.append(atom1)
        except:
            sys.exit('Atom selection invalid for given topology!')
        else:
            if verbose > 0:
                print 'Atom1 number: %s' %(atom_subset1[0] + 1)
    else:
        sys.exit('Invalid atom selection, you need to provide a string or an integer')

    if isinstance(atom2, str):
        try:
            atom_subset2 = md_topology.select(atom2)
        except:
            sys.exit('Invalid atom selection!')
        else:
            if verbose > 0:
                print 'Atom2 number: %s' %(atom_subset2[0] + 1)
    elif isinstance(atom2, int):
        # Shift the given atom number down by one to get the index.
        atom2 = atom2 - 1
        try:
            atom_subset2 = []
            # An out-of-range index leaves the list empty.
            if atom2 < md_topology.n_atoms:
                atom_subset2.append(atom2)
        except:
            sys.exit('Invalid atom selection!')
        else:
            if verbose > 0:
                print 'Atom2 number: %s' %(atom_subset2[0] + 1)
    else:
        sys.exit('Invalid atom selection, you need to provide a string or an integer')

    # The reshape requires exactly two indices in total.
    pair = np.append(atom_subset1, atom_subset2).reshape(1,2)

    # should work until 1109
    if unpythonize:
        # rename residues (still testing)
        final_resid_name = []
        for i in md_topology.subset(pair).atoms:
            residues = str(i)
            # position to substitute
            pos = []
            num = []
            # NOTE(review): this inner loop reuses the name `i` of the outer
            # loop variable; the atom was already converted to a string above.
            for i in range(len(residues)):
                # skip first three letters
                # Only digits within the first six characters are collected.
                if i<6 and residues[i].isdigit():
                    pos.append(i)
                    num.append(int(residues[i]))
            # Add one to the collected digits, with a hand-written carry for
            # one-, two- and three-digit cases.
            if len(pos) == 1:
                if num[-1] < 9:
                    num[-1] += 1
                else:
                    num[-1] = 10
            if len(pos) == 2:
                if num[-1] < 9:
                    num[-1] += 1
                else:
                    num[-1] = 0
                    num[0] += 1
            if len(pos) == 3:
                if num[-1] < 9:
                    num[-1] += 1
                else:
                    # NOTE(review): the carry stops at the middle digit
                    # (e.g. 199) -- see the "should work until 1109" remark.
                    num[1] += 1
                    num[-1] = 0
            # Write the updated digits back into their original positions.
            z = 0
            residues = list(residues)
            for j in pos:
                residues[j] = str(num[z])
                z+=1
            final_resid_name.append("".join(residues))
        final_resid = " - ".join(final_resid_name)
    # NOTE(review): final_resid is only assigned when `unpythonize` is true
    # but is used below regardless -- confirm against the original layout.
    if verbose > 0:
        print 'Calculating distance between following atoms: %s \n' %(final_resid)
    return md_topology, pair, final_resid
def Search_atom_index(df_noes, gro_file):
    '''
    Assign the topology index to each atom

    For every NOE row the two atoms are looked up in the topology by
    0-based residue index (residue id minus one) and atom name; an atom that
    cannot be found gets index -1. Rows whose first atom index is larger than
    the second are swapped so that AtomID1 <= AtomID2; swapped rows are moved
    to the end of the table and the index is renumbered.

    Input:
     - df_noes (pandas dataframe): NOE distances dataframe with columns
       ResID1, ResType1, Atom1, ResID2, ResType2, Atom2 in the first six
       positions. The columns 'AtomID1' and 'AtomID2' are added to it
       in place.
     - gro_file (str): topology file

    Output:
     - df_noes (pandas dataframe) : NOE distances modified dataframe
    '''
    import pandas as pd

    top = md.load_topology(gro_file)

    AtomID1 = []
    AtomID2 = []
    for noe in df_noes.itertuples():
        # Position 0 of the tuple is the dataframe index.
        ai = top.select("resid " + str(int(noe[1]) - 1) + " and(name " + noe[3] + ")")
        aj = top.select("resid " + str(int(noe[4]) - 1) + " and(name " + noe[6] + ")")
        AtomID1.append(int(ai[0]) if len(ai) else -1)
        AtomID2.append(int(aj[0]) if len(aj) else -1)
    df_noes['AtomID1'] = AtomID1
    df_noes['AtomID2'] = AtomID2

    swapped_columns = (
        ('ResID1', 'ResID2'),
        ('ResType1', 'ResType2'),
        ('Atom1', 'Atom2'),
        ('AtomID1', 'AtomID2'),
    )
    todel = []
    toadd = []
    for i, row in df_noes.iterrows():
        if int(row.AtomID1) > int(row.AtomID2):
            newrow = row.copy()
            for first, second in swapped_columns:
                newrow[first], newrow[second] = row[second], row[first]
            todel.append(i)
            toadd.append(newrow)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement and yields the same layout (kept rows first, swapped rows
    # last, index renumbered).
    pieces = [df_noes.drop(todel)]
    if toadd:
        pieces.append(pd.DataFrame(toadd, columns=df_noes.columns))
    return pd.concat(pieces, ignore_index=True)
nframe = Config.getint('makeVectors', 'nframes') structFile = Config.get('makeVectors', 'structFile') trajFile = Config.get('makeVectors', 'trajFile') projection = Config.getboolean('makeVectors', 'projection') nProcess = Config.getint('makeVectors', 'nProcess') atoms = Config.get('makeVectors', 'atoms') pool = Pool(processes=nProcess) ref = True descriptorsList = [] eigenVectorsList = [] eigenValuesList = [] meansList = [] top = md.load_topology(structFile) traj = md.load(glob(trajFile), top=top, atom_indices=top.select(atoms)) print('Loaded topology: \n%s \n' % top) print('Loaded trajectory: \n%s \n' % traj) mask = numpy.ones((traj.xyz.shape[1]), dtype="bool") assert nframe <= traj.n_frames, "More frames selected (%s) than provided (%s)!" % ( nframe, traj.n_frames) print('Starting PCA...') trajIndex = 0 pbar = tqdm(total=nframe, unit='Frame') while trajIndex < nframe:
import matplotlib.cm as cm #}}} n = 4 # Ligand Number N = n-1 # Index for Ligand get_data = True fname = 'ligand%s'%n # Filename for outputs gfiles = [8690,8693,8696,8699] # list of grofile names # List of all the Karplus coefficient Models Models = ["Ruterjans1999","Bax2007","Bax1997","Habeck" ,"Vuister","Pardi"] # Set paths data = '/Users/tuc41004/Desktop/nmr-biceps/BICePs_2.0/test_J_coupling/%s/'%fname gro = data+'%s.gro'%gfiles[N] top = md.load_topology(gro) #trajs = [data + 'traj%s.xtc'%i for i in range(len(glob.glob(data+'traj*.xtc')))] trajs = sorted(glob.glob(data+'traj*.xtc')) # Experimental Data from Erdelyi et al - Table S5. exp_1 = np.array([7.9,7.3,0,7.7,8.4,0,0,0,0]) exp_2 = np.array([0,7.4,0,6.2,7.4,0,7.8,0,0]) exp_3 = np.array([7.3,8.6,0,8.0,8.2,7.3,7.5,0,0]) exp_4 = np.array([6.6,7.3,0,6.8,0,7.4,0,0,0]) # Get Theoretical:{{{ if get_data == True: J = {} # Dictionary of all J3_HN_HA values for each model J_val = {} # J_3_exp = [] #
def _atom_selection(topology, residue_selection, selection, verbose): """ Function takes a topology file and residue selection and verifies if the lattter is possible. It also returns the C-alpha atom selection Input: topology: mdtraj.Topology or string Either mdtraj.Topology object or path to trajectory to be loaded residue_selection: string or list with integers String will be interpreted with the Mdtraj atom selection language. The list will be treated as atom number selection (by default True): boolean if true the function will try to return the residue/ atom selection Output: atom_subset numpy.array array with all the atom numbers corresponding to selection md_topology mdtraj.core.topology.Topology object of protein """ ## First have to load all the inputs if they are defined by a path if isinstance(topology, str): if os.path.exists(topology): try: md_topology = md.load_topology(topology) except: sys.exit('Make sure you have provided a valid path to topology file!') else: if verbose > 0: print 'The following topology file was succesfully loaded: \n %s \n' %(md_topology) elif isinstance(topology, md.core.topology.Topology): md_topology = topology if verbose > 0: print 'The following topology file was succesfully loaded: \n %s \n' %(md_topology) else: sys.exit('Invalid input! 
Must be a valid path to topology file or mdtraj.Topology object') ## if selection is True the function will try to obtain the specified atoms/residues ## if residue name is specified it will by default look for C-alpha atoms if selection: if isinstance(residue_selection, list): try: atom_subset = md_topology.select(residue_selection) except: sys.exit('Invalid atom selection in list!') else: if verbose > 1: print 'Your selection includes the following atom(s): \n %s \n' %(atom_subset) print 'Your selection includes the following residues: \n' for residue in md_topology.subset(atom_subset).residues: print residue elif isinstance(residue_selection, str): try: atom_subset = md_topology.select('name CA and ' + residue_selection) except: sys.exit('Check if your atom selection command is recognized by the Mdtraj atom selection language!') else: if verbose > 1: print 'Your selection includes the following atom(s): \n %s \n' %(atom_subset) print 'Your selection includes the following residues: \n' for residue in md_topology.subset(atom_subset).residues: print residue else: sys.exit('Make sure you provided a valid residue selection!') else: atom_subset=md_topology.select('name CA and ' + 'all') # now that we are sure that both topology and selection are valid we can return atom_subset # and use the loaded topology file return atom_subset, md_topology
def grid_solvent(option_dict, _start, _stop):
    """Collect per-frame solvent-field data for frames _start .. _stop - 1.

    Parameters
    ----------
    option_dict : mapping
        Options read here: 'trajin' and 'parm' (first element of each is
        used as trajectory / topology path), 'zz', 'xx', 'center_sele',
        'zz_ref', 'xx_ref' and 'water' (each joined into one selection
        string; a value starting with "None" disables 'zz_ref', 'xx_ref' or
        the explicit center selection), 'dims', 'noimage' and 'verbose'.
    _start, _stop : int
        Half-open range of frame indices to process.

    Returns
    -------
    OrderedDict
        Keys 'uc_data', 'pop_data', 'center_data', 'origin_data',
        'O_idxs_data', 'theta_data', 'phi_data', 'psi_data', 'xx1_wat_data',
        'xx2_wat_data', 'yy_wat_data', 'zz_wat_data', 'O_frac_data',
        'H1_frac_data', 'H2_frac_data' and 'frame_data'. The entries from
        'O_idxs_data' onwards stay None if no frame had any water inside
        the field.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm against the original file.
    _results = OrderedDict()
    traj_path = option_dict['trajin'][0]
    parm_path = option_dict['parm'][0]
    zz_sele = "".join(option_dict['zz'])
    xx_sele = "".join(option_dict['xx'])
    cntr_sele = "".join(option_dict['center_sele'])
    dims = np.array(option_dict['dims'])
    zz_ref_sele = "".join(option_dict['zz_ref'])
    xx_ref_sele = "".join(option_dict['xx_ref'])
    water_str = "".join(option_dict['water'])
    image = not option_dict['noimage']
    verbose = option_dict['verbose']

    if verbose:
        print "Preparing selection masks..."

    # A reference selection whose text starts with "None" is treated as
    # "no reference given".
    no_zz_ref = False
    zz_ref_crds = None
    if zz_ref_sele.rstrip().lstrip().startswith("None"):
        no_zz_ref = True
    no_xx_ref = False
    xx_ref_crds = None
    if xx_ref_sele.rstrip().lstrip().startswith("None"):
        no_xx_ref = True

    topo = md.load_topology(parm_path)
    ### Z axis
    zz_indxs = topo.select(zz_sele)
    if not no_zz_ref:
        zz_ref_indxs = topo.select(zz_ref_sele)
    ### X axis
    xx_indxs = topo.select(xx_sele)
    if not no_xx_ref:
        xx_ref_indxs = topo.select(xx_ref_sele)
    ### Center
    if cntr_sele.rstrip().lstrip().startswith("None"):
        # Without an explicit center selection the union of the axis atoms
        # is used; solvent excludes the axis atoms.
        center_idxs = np.unique(np.concatenate((xx_indxs, zz_indxs)))
        solvent = topo.select("%s and not (%s or %s)" %
                              (water_str, zz_sele, xx_sele))
        solvent_O = topo.select("(%s and name O) and not (%s or %s)" %
                                (water_str, zz_sele, xx_sele))
    else:
        center_idxs = topo.select(cntr_sele)
        solvent = topo.select("%s and not (%s or %s or %s)" %
                              (water_str, zz_sele, xx_sele, cntr_sele))
        solvent_O = topo.select("(%s and name O) and not (%s or %s or %s)" %
                                (water_str, zz_sele, xx_sele, cntr_sele))

    # Index distance between two consecutive water oxygens (presumably the
    # number of sites per water molecule -- confirm). Requires at least
    # three selected oxygens.
    sites = solvent_O[2] - solvent_O[1]
    solv_field = solvent_field(solvent_O.shape[0], sites, dims)

    frame_range = range(_start, _stop)
    N_frames = _stop - _start

    # NOTE(review): `np.float` / `np.int` are deprecated aliases that newer
    # NumPy releases no longer provide -- confirm the supported NumPy version.
    uc_data = np.zeros((N_frames * 3, 3), dtype=np.float)
    pop_data = np.zeros(N_frames, dtype=np.int)
    center_data = np.zeros((N_frames, 3), dtype=np.float)
    origin_data = np.zeros((N_frames, 3), dtype=np.float)

    # Per-water arrays; filled on the first frame with waters inside the
    # field and extended by concatenation afterwards.
    O_idxs_data = None
    theta_data = None
    phi_data = None
    psi_data = None
    xx1_wat_data = None
    xx2_wat_data = None
    yy_wat_data = None
    zz_wat_data = None
    O_frac_data = None
    H1_frac_data = None
    H2_frac_data = None
    frame_data = None

    if image:
        uc = np.eye(3, 3)
    else:
        uc = None

    zz_crds = np.zeros((zz_indxs.shape[0], 3), dtype=np.float)
    xx_crds = np.zeros((xx_indxs.shape[0], 3), dtype=np.float)
    solv_crds = np.zeros((solvent.shape[0], 3), dtype=np.float)
    cntr_crds = np.zeros((center_idxs.shape[0], 3), dtype=np.float)

    if verbose:
        print "Start processing trajectory..."

    with md.open(traj_path) as md_traj:
        started_fill = False
        for i in range(N_frames):
            frame_i = frame_range[i]
            if verbose and i % 100 == 0:
                print "Frame %d..." % frame_i

            # PDB files are read frame-wise via load_frame; every other
            # format is read through the open file handle.
            if traj_path.endswith(".pdb"):
                frame = md.load_frame(traj_path, index=frame_i, top=topo)
            else:
                md_traj.seek(frame_i)
                frame = md_traj.read_as_traj(topo, n_frames=1, stride=1)

            # All coordinates are scaled by 10 (presumably nm -> Angstrom,
            # confirm).
            if not no_zz_ref:
                zz_ref_crds = np.mean(frame.xyz[0][zz_ref_indxs] * 10.,
                                      axis=0)
            if not no_xx_ref:
                xx_ref_crds = np.mean(frame.xyz[0][xx_ref_indxs] * 10.,
                                      axis=0)
            zz_crds = frame.xyz[0][zz_indxs] * 10.
            xx_crds = frame.xyz[0][xx_indxs] * 10.
            solv_crds = frame.xyz[0][solvent] * 10.
            cntr_crds = frame.xyz[0][center_idxs] * 10.
            if image:
                uc[:] = frame.unitcell_vectors[0] * 10.

            solv_field.set_axis(xx_crds, zz_crds, xx_ref_crds, zz_ref_crds)
            solv_field.set_center(cntr_crds.mean(axis=0))
            solv_field.update_field(solv_crds, uc)

            # Three rows of uc_data are written per frame.
            j = i * 3
            uc_data[j:j + 3, :] = solv_field.get_nice_frac2real() / solv_field.delta
            pop_data[i] = solv_field.N_inside
            center_data[i] = solv_field.center
            origin_data[i] = solv_field.origin

            if solv_field.N_inside > 0:
                if not started_fill:
                    # First frame with waters inside: start the arrays from
                    # copies of the field's current data.
                    O_idxs_data = np.copy(solv_field.inside_idxs)
                    theta_data = np.copy(solv_field.theta)
                    phi_data = np.copy(solv_field.phi)
                    psi_data = np.copy(solv_field.psi)
                    xx1_wat_data = np.copy(solv_field.xx1_wat)
                    xx2_wat_data = np.copy(solv_field.xx2_wat)
                    yy_wat_data = np.copy(solv_field.yy_wat)
                    zz_wat_data = np.copy(solv_field.zz_wat)
                    O_frac_data = np.copy(solv_field.O_crds_frac)
                    H1_frac_data = np.copy(solv_field.H1_crds_frac)
                    H2_frac_data = np.copy(solv_field.H2_crds_frac)
                    frame_data = np.empty(solv_field.N_inside, dtype=np.int)
                    frame_data.fill(frame_i)
                    started_fill = True
                else:
                    # Later frames: append to the arrays collected so far.
                    O_idxs_data = np.concatenate(
                        (O_idxs_data, solv_field.inside_idxs))
                    theta_data = np.concatenate((theta_data, solv_field.theta))
                    phi_data = np.concatenate((phi_data, solv_field.phi))
                    psi_data = np.concatenate((psi_data, solv_field.psi))
                    xx1_wat_data = np.concatenate(
                        (xx1_wat_data, solv_field.xx1_wat))
                    xx2_wat_data = np.concatenate(
                        (xx2_wat_data, solv_field.xx2_wat))
                    yy_wat_data = np.concatenate(
                        (yy_wat_data, solv_field.yy_wat))
                    zz_wat_data = np.concatenate(
                        (zz_wat_data, solv_field.zz_wat))
                    O_frac_data = np.concatenate(
                        (O_frac_data, solv_field.O_crds_frac))
                    H1_frac_data = np.concatenate(
                        (H1_frac_data, solv_field.H1_crds_frac))
                    H2_frac_data = np.concatenate(
                        (H2_frac_data, solv_field.H2_crds_frac))
                    # One frame index per water found inside in this frame.
                    __frame_data = np.empty(solv_field.N_inside, dtype=np.int)
                    __frame_data.fill(frame_i)
                    frame_data = np.concatenate((frame_data, __frame_data))

            # NOTE(review): placement of this block at loop level (rather
            # than inside the N_inside check) is a reconstruction -- confirm.
            if verbose and i % 100 == 0:
                write_files(XYZ=solv_field.O_crds_frac, Format='PDB',
                            Filename='frac_frame%d.pdb' % frame_i)
                write_files(XYZ=solv_field.O_crds, Format='PDB',
                            Filename='real_frame%d.pdb' % frame_i)

    _results['uc_data'] = uc_data
    _results['pop_data'] = pop_data
    _results['center_data'] = center_data
    _results['origin_data'] = origin_data
    _results['O_idxs_data'] = O_idxs_data
    _results['theta_data'] = theta_data
    _results['phi_data'] = phi_data
    _results['psi_data'] = psi_data
    _results['xx1_wat_data'] = xx1_wat_data
    _results['xx2_wat_data'] = xx2_wat_data
    _results['yy_wat_data'] = yy_wat_data
    _results['zz_wat_data'] = zz_wat_data
    _results['O_frac_data'] = O_frac_data
    _results['H1_frac_data'] = H1_frac_data
    _results['H2_frac_data'] = H2_frac_data
    _results['frame_data'] = frame_data

    return _results
def calculate_grid_quantities(self, energy=True, entropy=True, hbonds=True):
    """
    Performs grid-based solvation thermodynamics and structure calculations
    by iterating over frames in the trajectory.

    Frames ``self.start_frame`` .. ``self.start_frame + self.num_frames - 1``
    are read one at a time from ``self.trajectory`` and passed to
    ``self._process_frame``. Afterwards the accumulated per-voxel columns of
    ``self.voxeldata`` are normalized in place and, if requested,
    ``self.calculate_entropy`` is called.

    Parameters
    ----------
    energy : bool, optional
        Forwarded to ``self._process_frame`` for every frame.
    entropy : bool, optional
        Forwarded to ``self._process_frame``; when true,
        ``self.calculate_entropy`` is also run after normalization.
    hbonds : bool, optional
        Forwarded to ``self._process_frame`` for every frame.

    Returns
    -------
    None
        Results are stored in ``self.voxeldata``. ``self.num_frames`` is
        lowered when fewer frames than requested could be read.
    """
    print_progress_bar(0, self.num_frames)
    # A separate topology is only parsed when the topology file is not HDF5.
    # NOTE(review): this guard tests ``self.topology_file`` but the read below
    # tests ``self.trajectory``; if only the topology file ends with ".h5",
    # ``topology`` is unbound at the read -- confirm the two always agree.
    if not self.topology_file.endswith(".h5"):
        topology = md.load_topology(self.topology_file)
    read_num_frames = 0
    with md.open(self.trajectory) as f:
        for frame_i in range(self.start_frame, self.start_frame + self.num_frames):
            print_progress_bar(frame_i - self.start_frame, self.num_frames)
            # Position the reader explicitly so exactly one frame is read per pass.
            f.seek(frame_i)
            if not self.trajectory.endswith(".h5"):
                trj = f.read_as_traj(topology, n_frames=1, stride=1)
            else:
                trj = f.read_as_traj(n_frames=1, stride=1)
            # An empty read means the trajectory ended before num_frames frames.
            if trj.n_frames == 0:
                print("No more frames to read.")
                break
            else:
                self._process_frame(trj, energy, hbonds, entropy)
                read_num_frames += 1
        # Normalization below must use the number of frames actually processed.
        if read_num_frames < self.num_frames:
            print(("{0:d} frames found in the trajectory, resetting self.num_frames.".format(read_num_frames)))
            self.num_frames = read_num_frames

    # Normalize voxel quantities
    # Column 4 is the divisor for the per-voxel averages (presumably the water
    # count of the voxel -- TODO confirm against the voxeldata layout).
    for voxel in range(self.voxeldata.shape[0]):
        if self.voxeldata[voxel, 4] > 1.0:
            # Columns 13 and 15: store the column-4-normalized value in the next
            # column (14, 16), then rescale the accumulated value by
            # frames * voxel volume. Both carry an extra factor of 2.
            self.voxeldata[voxel, 14] = self.voxeldata[voxel, 13] / (self.voxeldata[voxel, 4] * 2.0)
            self.voxeldata[voxel, 13] /= (self.num_frames * self.voxel_vol * 2.0)
            self.voxeldata[voxel, 16] = self.voxeldata[voxel, 15] / (self.voxeldata[voxel, 4] * 2.0)
            self.voxeldata[voxel, 15] /= (self.num_frames * self.voxel_vol * 2.0)
            # Column 17 is normalized by column 19 rather than column 4. This must
            # happen before the loop below, which rescales column 19 itself.
            if self.voxeldata[voxel, 19] > 0.0:
                self.voxeldata[voxel, 18] = self.voxeldata[voxel, 17] / (self.voxeldata[voxel, 19] * 2.0)
                self.voxeldata[voxel, 17] /= (self.num_frames * self.voxel_vol * self.voxeldata[voxel, 19] * 2.0)
            # Odd columns 19..33 hold accumulated values; the following even
            # column receives the column-4-normalized value.
            for i in range(19, 35, 2):
                self.voxeldata[voxel, i + 1] = self.voxeldata[voxel, i] / self.voxeldata[voxel, 4]
                self.voxeldata[voxel, i] /= (self.num_frames * self.voxel_vol)
        else:
            # Voxels with column 4 <= 1.0 have their accumulated columns zeroed.
            self.voxeldata[voxel, 13] *= 0.0
            self.voxeldata[voxel, 15] *= 0.0
            if self.voxeldata[voxel, 19] > 0.0:
                self.voxeldata[voxel, 17] *= 0.0
            for i in range(19, 35, 2):
                self.voxeldata[voxel, i] *= 0.0

    # Calculate entropies
    if entropy:
        self.calculate_entropy(num_frames=self.num_frames)
def _expand_ambiguous_atoms(df_noes, top, suffix):
    """Expand ambiguous M*/Q* names in column ``Atom<suffix>`` into explicit protons.

    Each row whose atom name starts with "M" (3 protons expected) or "Q"
    (2 protons expected) is replaced by one row per matching hydrogen found in
    ``top``. ``suffix`` is "1" or "2" and selects the Atom/AtomID/ResID columns.
    Returns a new dataframe with a fresh 0..n-1 index.
    """
    # Local import keeps this block self-contained regardless of file-level imports.
    import pandas as pd

    atom_col = "Atom" + suffix
    atomid_col = "AtomID" + suffix
    resid_col = "ResID" + suffix

    todel = []
    toadd = []
    for i, row in df_noes.iterrows():
        name = row[atom_col]
        if name.startswith("M"):
            n_expected = 3
        elif name.startswith("Q"):
            n_expected = 2
        else:
            continue

        # ResID is 1-based in the table; the MDTraj ``resid`` keyword is 0-based.
        selection = "resid " + str(int(row[resid_col]) - 1) + \
            " and (name =~ 'H" + name[1:] + ".*')"
        sel = top.select(selection)
        # Explicit raise (rather than ``assert``) so the check survives ``python -O``
        # while keeping the exception type callers may already rely on.
        if len(sel) != n_expected:
            raise AssertionError(str(row) + str(sel))

        todel.append(i)
        for atom_index in sel:
            atom = top.atom(atom_index)
            newrow = row.copy()
            newrow[atom_col] = atom.name
            newrow[atomid_col] = atom.index
            # Origin is bumped by 10 for every expansion applied to the row.
            newrow["Origin"] += 10
            toadd.append(newrow)

    df_noes = df_noes.drop(todel)
    if not toadd:
        # Mirrors the index reset that ``append([], ignore_index=True)`` performed.
        return df_noes.reset_index(drop=True)
    # ``DataFrame.append`` was removed in pandas 2.0; ``concat`` is the replacement.
    return pd.concat([df_noes, pd.DataFrame(toadd)], ignore_index=True)


def Extend_noes(df_noes, gro_file):
    """
    Ambiguous NOEs (M* and Q*) indicate multiple protons and are made explicit.

    The first atom of every restraint is expanded first, then the second atom
    is expanded on the resulting table, so a restraint that is ambiguous on
    both sides ends up as the full set of proton pairs.

    Input:
    - df_noes (pandas dataframe): NOE distances dataframe with the columns
      Atom1, AtomID1, ResID1, Atom2, AtomID2, ResID2 and Origin
    - gro_file (str): topology file readable by ``md.load_topology``

    Output:
    - df_noes (pandas dataframe) : NOE distances modified dataframe, sorted by
      ResID1 and ResID2

    Raises:
    - AssertionError: if an M* name does not match exactly 3 hydrogens, or a
      Q* name does not match exactly 2 hydrogens, in the topology
    """
    top = md.load_topology(gro_file)

    df_noes = _expand_ambiguous_atoms(df_noes, top, "1")
    df_noes = _expand_ambiguous_atoms(df_noes, top, "2")

    df_noes = df_noes.sort_values(['ResID1', 'ResID2'])
    return df_noes
## Calculate the correlation functions and the standard deviation in the correlation function. ## Save the correlation functions in a dataframe and then to a csv file for later use. Ct, dCt = calc_Ct(vecs_LS) CtDF = pd.DataFrame(Ct, index = np.arange(1,Ct.shape[0]+1)*20/1000, columns=NH_Res) dCtDF = pd.DataFrame(dCt, index = np.arange(1,dCt.shape[0]+1)*20/1000, columns=NH_Res) CtDF.to_csv('/scratch/users/ah14k/ChiZ/Analysis/AMBER03WS/Ct_{}_comb_36us.csv'.format(tauL)) dCtDF.to_csv('/scratch/users/ah14k/ChiZ/Analysis/AMBER03WS/dCt_{}_comb_36us.csv'.format(tauL)) # Begin Curve Fitting; If you don't need to calculate the vectors then skip to here. # In[20]: ## Load Experimental NOE data: This will depend on the shape of your NMR data top14 = "{}AMBER14SB/Tip4pD/PROD/Analysis/07_Prod.noH20.ChiZN_0.025M-NaCl_capped.prmtop".format(ChiZLoc) parm714 = md.load_topology(top14) CAsel = parm714.select('name N and not resname PRO') RESCaINFO = np.array(["{}".format(parm714.atom(x)) for x in CAsel]) RESINFO = np.array([x.replace('-N',"") for x in RESCaINFO]) EXPNOEF = "/scratch/users/ah14k/ChiZ/ChiZN164NOEpH7_New.csv" EXPNOEdf = pd.read_table(EXPNOEF,delimiter=',',skiprows=1,header=None, names=['Residue Number','T1','T1_Err','T2','T2_Err','NOE','NOE_Err']) EXPNOEdf = EXPNOEdf.drop(0) EXPNOEdf = EXPNOEdf.replace('-',np.nan) EXPNOEdf['RES'] = RESINFO EXPNOEdf.iloc[:,1:5] = EXPNOEdf.iloc[:,1:5].astype('float')/1000 EXPNOEdf.iloc[:,5:7] = EXPNOEdf.iloc[:,5:7].astype('float') # In[21]: ## Calculate mean array for the experimental data T1MEANArrExpNT = np.array([0.5732]*EXPNOEdf.loc[3:24].shape[0])
def _truncate_trajectory_file(self, number_of_frames):
    """Truncates the trajectory file to the specified number of frames.

    The trajectory at ``self._local_trajectory_path`` is streamed frame by
    frame into a temporary DCD file, which then atomically replaces the
    original. Nothing is rewritten when the trajectory already has exactly
    ``number_of_frames`` frames.

    Parameters
    ----------
    number_of_frames: int
        The number of frames to truncate to.

    Raises
    ------
    ValueError
        If ``number_of_frames`` is negative, if the trajectory holds fewer
        frames than requested, or if the truncated file does not contain
        exactly ``number_of_frames`` frames.
    """
    import mdtraj
    from mdtraj.formats.dcd import DCDTrajectoryFile
    from mdtraj.utils import in_units_of

    # Reject nonsensical requests before touching the file: a negative count
    # would otherwise overwrite the trajectory with an empty one.
    if number_of_frames < 0:
        raise ValueError(
            f'The number of frames to truncate to ({number_of_frames}) '
            f'must not be negative.')

    # Load in the required topology object.
    topology = mdtraj.load_topology(self.input_coordinate_file)

    # Parse the internal mdtraj distance unit. While private access is
    # undesirable, this is never publicly defined and I believe this
    # route to be preferable over hard coding this unit here.
    base_distance_unit = mdtraj.Trajectory._distance_unit

    # Get an accurate measurement of the length of the trajectory
    # without reading it into memory.
    trajectory_length = sum(
        len(chunk)
        for chunk in mdtraj.iterload(self._local_trajectory_path, top=topology))

    # Make sure there is at least the expected number of frames.
    if trajectory_length < number_of_frames:
        raise ValueError(
            f'The saved number of trajectory frames ({trajectory_length}) '
            f'is less than expected ({number_of_frames}).')

    if trajectory_length == number_of_frames:
        return

    # Truncate the trajectory by streaming one frame of the trajectory at
    # a time.
    temporary_trajectory_path = f'{self._local_trajectory_path}.tmp'

    try:
        with DCDTrajectoryFile(self._local_trajectory_path, 'r') as input_file, \
                DCDTrajectoryFile(temporary_trajectory_path, 'w') as output_file:

            for _ in range(number_of_frames):

                frame = input_file.read_as_traj(topology, n_frames=1, stride=1)

                # Frames without a periodic box carry no unit cell; pass None
                # through instead of indexing into it.
                cell_angles = (None if frame.unitcell_angles is None
                               else frame.unitcell_angles[0])

                output_file.write(
                    xyz=in_units_of(frame.xyz, base_distance_unit,
                                    output_file.distance_unit),
                    cell_lengths=in_units_of(frame.unitcell_lengths,
                                             base_distance_unit,
                                             output_file.distance_unit),
                    cell_angles=cell_angles)

        os.replace(temporary_trajectory_path, self._local_trajectory_path)
    finally:
        # After a successful replace the temporary path no longer exists; this
        # only fires when streaming failed and left a partial file behind.
        if os.path.exists(temporary_trajectory_path):
            os.remove(temporary_trajectory_path)

    # Do a sanity check to make sure the trajectory was successfully truncated.
    # The already-parsed topology is reused rather than re-reading the file.
    new_trajectory_length = sum(
        len(chunk)
        for chunk in mdtraj.iterload(self._local_trajectory_path, top=topology))

    if new_trajectory_length != number_of_frames:
        raise ValueError('The trajectory was incorrectly truncated.')
# Analysis settings.
start_time = 0.0
step = 200.0  # ps per md frame
max_conf = 1000  # max number of conformations that can be handled to efficiently calculate RMSDs
n_clusters = 6
n_ref = 4  # number of reference models to use for comparison

# Load trajectories to analyse
# Only the replicas listed explicitly are loaded (3 and 16 are not in the list);
# every trajectory shares the same PDB topology.
trajs = []
for i in [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20]:
    print(i)
    # Alternative kept from the original script, looping over all replicas:
    #for i in range(1,no_replicas+1):
    trajs.append(
        mdtraj.load('../' + str(i) + '/peptide_conf/all_novsite_fit.xtc',
                    top='../frame0_chainA_novsite.pdb'))
topology = mdtraj.load_topology('../frame0_chainA_novsite.pdb')

# Load reference structures
# Each reference PDB is parsed twice: once as a topology to build the atom
# selection, then as coordinates restricted to that selection.
sys1 = mdtraj.load_topology('pept_exp_models/semiclosed23_new1.pdb')
sys2 = mdtraj.load_topology('pept_exp_models/Tamm_1.pdb')
sys3 = mdtraj.load_topology('pept_exp_models/Bax_cut_20aa_1.pdb')
sys4 = mdtraj.load_topology('pept_exp_models/1xop_straight_helix_model1.pdb')
# The first reference is additionally limited to residues 0-19; the others keep
# their whole backbone.
ref1 = mdtraj.load('pept_exp_models/semiclosed23_new1.pdb',
                   atom_indices=sys1.select('resid 0 to 19 and backbone'))
ref2 = mdtraj.load('pept_exp_models/Tamm_1.pdb',
                   atom_indices=sys2.select('backbone'))
ref3 = mdtraj.load('pept_exp_models/Bax_cut_20aa_1.pdb',
                   atom_indices=sys3.select('backbone'))
# NOTE(review): the call below is left open on this line -- confirm that its
# remaining arguments follow.
ref4 = mdtraj.load('pept_exp_models/1xop_straight_helix_model1.pdb',
def generate_clusters(self, density_factor, ligand_file, clustercenter_file):
    """Generate hydration sites from water molecules found in the binding site
    during the simulation.

    Clustering is done in two steps: (i) an initial clustering over every 10th
    frame, and (ii) a refinement step where all frames are used. When
    ``clustercenter_file`` is given, both steps are skipped and the waters
    within 1 A of each supplied center are collected instead.

    Parameters
    ----------
    density_factor : float
        Multiplier in the neighbor-count cutoff
        ``n_frames * density_factor * 0.1401``; a candidate site must collect
        more waters than this cutoff to be kept.
    ligand_file : string
        Name of the PDB file containing atomic coordinates of the ligand,
        assumed to be co-crystallized with the protein.
    clustercenter_file : string or None
        PDB file with user-defined cluster centers. ``None`` triggers the
        two-step clustering described above.

    Returns
    -------
    final_cluster_coords : numpy.ndarray
        Coordinates of hydration sites in angstroms, represented by a 2-D
        array with shape N x 3, where N is the number of hydration sites.
    site_waters : list
        List of N sub-lists, one per hydration site. Each sub-list holds a
        2-element tuple for every water assigned to that site: the frame
        number (offset by ``self.start_frame``) and the index of the oxygen
        atom in the original topology.

    Raises
    ------
    ValueError
        If the system has no solute atoms.
    SystemExit
        If fewer than 10 strided frames are available for the initial
        clustering.

    Notes
    -----
    The following attributes of the object are updated:

    self.hsa_region_O_ids:
        Per-frame lists of the original-topology indices of water oxygen
        atoms found in the HSA region.
    self.hsa_region_flat_ids:
        Same as above, but holding running offsets (0, 3, 6, ...) into
        ``self.hsa_region_water_coords``.
    self.hsa_region_water_coords:
        Zero-initialized array with three rows per HSA-region water; it is
        populated later during individual frame processing.
    self.num_frames:
        Set, or lowered, to the number of frames actually read.
    self.clustercenter_file:
        Set to the name of the cluster-center PDB file written here.
    """
    sphere_radius = md.utils.in_units_of(1.0, "angstroms", "nanometers")
    topology = md.load_topology(self.topology_file)
    if self.non_water_atom_ids.shape[0] == 0:
        # Previously ``raise Exception(ValueError, msg)``, which raised a plain
        # Exception carrying the class as an argument. ValueError is still
        # caught by ``except Exception``.
        raise ValueError(
            "Clustering is supported only for solute-solvent systems, no solute atoms found."
        )

    ligand = md.load_pdb(ligand_file, no_boxchk=True)
    ligand_coords = ligand.xyz[0, :, :]
    n_ligand_atoms = ligand_coords.shape[0]
    binding_site_atom_indices = np.arange(n_ligand_atoms)
    # Both reads load the first n_ligand_atoms atoms plus all water oxygens.
    # The leading atoms only serve as slots for the ligand coordinates.
    atom_subset = np.concatenate(
        (binding_site_atom_indices, self.wat_oxygen_atom_ids))
    init_cluster_coords = None

    # Step 1: Initial Clustering if user didn't provide cluster centers
    if clustercenter_file is None:
        clustering_stride = 10
        print("Reading trajectory for clustering.")
        with md.open(self.trajectory) as f:
            f.seek(self.start_frame)
            # A stop of None (no frame count given by the user) reads to the end,
            # so one slice covers both cases.
            # NOTE(review): the reader is already positioned at start_frame, and
            # the slice then skips start_frame frames again -- this looks like a
            # double offset whenever start_frame > 0; confirm intent.
            trj_short = f.read_as_traj(
                topology, atom_indices=atom_subset
            )[self.start_frame:self.num_frames:clustering_stride]
        print(trj_short.n_frames)
        if trj_short.n_frames < 10:
            # NOTE(review): exits the interpreter from library code, and the
            # count reported is the strided one; kept for compatibility.
            sys.exit(
                "Clustering requires at least 100 frames, current trajectory contains {0:d} frames."
                .format(trj_short.n_frames))
        print("Performing an initial clustering over {0:d} frames.".
              format(trj_short.n_frames))

        # Obtain water molecules solvating the binding site
        # FIXME: This is a workaround to use MDTraj compute_neighbor function xyz coordinates of the trajectory are
        # modified such that first n atoms coordinates are switched to n atoms of ligand coordinates.
        # Unexpected things will happen if the number of solute atoms less than the number of ligand atoms, which is
        # highly unlikely.
        coords = trj_short.xyz
        coords[:, :n_ligand_atoms, :] = ligand_coords
        haystack = np.setdiff1d(trj_short.topology.select("all"),
                                binding_site_atom_indices)
        binding_site_waters = md.compute_neighbors(
            trj_short,
            self.hsa_region_radius,
            binding_site_atom_indices,
            haystack_indices=haystack)
        # generate a list of tuples, each tuple is a water and corresponding frame number in trj_short
        water_id_frame_list = [(i_frame, nbr)
                               for i_frame, nbrs in enumerate(binding_site_waters)
                               for nbr in nbrs]

        # Start initial clustering by building a KDTree and get initial neighbor count for all waters
        water_coordinates = np.ma.array(
            [coords[frame, atom, :] for frame, atom in water_id_frame_list],
            mask=False)
        tree = spatial.cKDTree(water_coordinates)
        nbr_list = tree.query_ball_point(water_coordinates, sphere_radius)
        nbr_count_list = np.ma.array([len(nbrs) for nbrs in nbr_list],
                                     mask=False)
        # 0.1401 is presumably the bulk-water count expected inside the 1 A
        # sphere -- TODO confirm. The cutoff is rounded to the nearest integer.
        cutoff = trj_short.n_frames * density_factor * 0.1401
        if np.ceil(cutoff) - cutoff <= 0.5:
            cutoff = np.ceil(cutoff)
        else:
            cutoff = np.floor(cutoff)
        n_wat = 3 * cutoff

        # Set up clustering loop
        cluster_list = []
        while n_wat > cutoff:
            # Get water with max nbrs and retrieve its neighbors and marked for exclusion in next iteration
            max_index = np.argmax(nbr_count_list)
            to_exclude = np.array(nbr_list[max_index])
            # Set current water count to current neighbors plus one for the water itself
            n_wat = len(to_exclude) + 1

            # Mask current water, its neighbors so that they are not considered in the next iteration
            nbr_count_list.mask[to_exclude] = True
            nbr_count_list.mask[max_index] = True
            # Mask current waters' and its neighbors' coords so that they are not considered in the next iteration
            water_coordinates.mask[to_exclude] = True
            water_coordinates.mask[max_index] = True

            # Accumulate neighbors for each water in current cluster, removing common neighbors
            nbrs_of_to_exclude = np.unique(
                np.array([
                    n_excluded for excluded_nbrs in nbr_list[to_exclude]
                    for n_excluded in excluded_nbrs
                ]))

            # Obtain the list of waters whose neighbors need to be updated due to exclusion of the waters above
            to_update = np.setxor1d(to_exclude, nbrs_of_to_exclude)
            to_update = np.setdiff1d(to_update, np.asarray(max_index))

            # Update the neighbor count for each water from the list generated above
            if to_update.shape[0] != 0:
                tree = spatial.cKDTree(water_coordinates)
                updated_nbr_list = tree.query_ball_point(
                    water_coordinates[to_update], sphere_radius)
                # for each updated member, get its original index and update the original neighbor search list
                for index, nbrs in enumerate(updated_nbr_list):
                    if not nbr_count_list.mask[to_update[index]]:
                        nbr_count_list[to_update[index]] = len(nbrs)

            # Check distances with previously identified clusters and do not consider if within 1.2 A
            # of an existing cluster
            current_wat = water_id_frame_list[max_index]
            current_wat_coords = md.utils.in_units_of(
                coords[current_wat[0], current_wat[1], :], "nanometers",
                "angstroms")
            # Both points must be in angstroms for the 1.2 A threshold to mean
            # anything. The earlier centers used to be left in nanometers, which
            # made this check ineffective.
            too_close = any(
                np.linalg.norm(current_wat_coords - md.utils.in_units_of(
                    coords[clust[0], clust[1], :], "nanometers",
                    "angstroms")) < 1.20 for clust in cluster_list)
            if not too_close:
                cluster_list.append(current_wat)

        init_cluster_coords = [
            coords[cluster[0], cluster[1], :] for cluster in cluster_list
        ]
    else:
        clusters_pdb_file = md.load_pdb(clustercenter_file, no_boxchk=True)
        init_cluster_coords = clusters_pdb_file.xyz[0, :, :]

    # Read full trajectory
    print("Reading trajectory to obtain water molecules for each cluster.")
    with md.open(self.trajectory) as f:
        f.seek(self.start_frame)
        if self.num_frames is None:
            trj = f.read_as_traj(topology, stride=1, atom_indices=atom_subset)
            self.num_frames = trj.n_frames
        else:
            trj = f.read_as_traj(topology,
                                 n_frames=self.num_frames,
                                 stride=1,
                                 atom_indices=atom_subset)
            if trj.n_frames < self.num_frames:
                print((
                    "Warning: {0:d} frames found in the trajectory, resetting self.num_frames."
                    .format(trj.n_frames)))
                self.num_frames = trj.n_frames

    # Same ligand-coordinate workaround as in step 1, now on the full trajectory.
    trj.xyz[:, :n_ligand_atoms, :] = ligand_coords
    haystack = np.setdiff1d(trj.topology.select("all"),
                            binding_site_atom_indices)
    start_point = haystack[0]
    binding_site_waters = md.compute_neighbors(trj,
                                               self.hsa_region_radius,
                                               binding_site_atom_indices,
                                               haystack_indices=haystack)

    # From the full frame-wise set of waters in the binding site, build two more frame-wise lists
    # one where each frame has a correct index of waters and another with a new index which ranges from
    # 0 to M, where M is the total number of hsa region waters - 1
    # Each frame gets its own fresh lists, so a repeated call does not write
    # into the entries of an earlier call.
    flat_id = 0
    for frame_waters in binding_site_waters:
        frame_o_ids = []
        frame_flat_ids = []
        for wat in frame_waters:
            # Map the index in the trimmed trajectory back to the oxygen index
            # in the original topology.
            wat_offset = ((wat - start_point) *
                          self.water_sites) + self.wat_oxygen_atom_ids[0]
            frame_o_ids.append(wat_offset)
            frame_flat_ids.append(flat_id)
            flat_id += 3
        self.hsa_region_O_ids.append(frame_o_ids)
        self.hsa_region_flat_ids.append(frame_flat_ids)

    water_id_frame_list = [(i_frame, nbr)
                           for i_frame, nbrs in enumerate(binding_site_waters)
                           for nbr in nbrs]
    water_coordinates = np.array(
        [trj.xyz[frame, atom, :] for frame, atom in water_id_frame_list])

    # Initialize array that stores coordinates all water molecules in HSA region, used for entropy calcs
    self.hsa_region_water_coords = np.zeros(
        (len(water_id_frame_list) * 3, 3), dtype=float)
    tree = spatial.cKDTree(water_coordinates)
    nbr_list = tree.query_ball_point(init_cluster_coords, sphere_radius)
    final_cluster_coords = []
    # NOTE(review): int() truncates before the rounding below, so unlike step 1
    # the ceil/floor branch has no effect here; kept as-is to preserve results.
    cutoff = int(self.num_frames * density_factor * 0.1401)
    if np.ceil(cutoff) - cutoff <= 0.5:
        cutoff = np.ceil(cutoff)
    else:
        cutoff = np.floor(cutoff)

    def _original_ids(cluster):
        """Return (frame number, original oxygen index) for each water of a cluster."""
        return [(water_id_frame_list[wat][0] + self.start_frame,
                 ((water_id_frame_list[wat][1] - start_point) *
                  self.water_sites) + self.wat_oxygen_atom_ids[0])
                for wat in cluster]

    # apply refinement if user defined clusters not provided
    if clustercenter_file is None:
        # Step 2: Refinement
        # The trajectory holds only the first N solute atoms (N = number of ligand atoms) plus all
        # water oxygen atoms.
        # WARNING: This shifts indices of waters and once they are assigned to clusters, the indices need to
        # be corrected.
        print((
            "Refining initial cluster positions by considering {0:d} frames."
            .format(self.num_frames)))
        # For each cluster, set cluster center equal to geometric center of all waters in the cluster
        site_waters = []
        for cluster in nbr_list:
            cluster_water_coords = water_coordinates[cluster]
            if len(cluster) > cutoff:
                waters_offset = _original_ids(cluster)
                # Equal weights: the center is the plain mean of the member waters.
                masses = np.ones(cluster_water_coords.shape[0])
                masses /= masses.sum()
                cluster_center = md.utils.in_units_of(
                    cluster_water_coords.T.dot(masses), "nanometers",
                    "angstroms")
                # Raise flag if the current cluster center is within 1.2 A of existing cluster center
                # NOTE(review): the [:-1] slice leaves the most recently
                # accepted center out of the comparison; confirm intent.
                too_close = any(
                    np.linalg.norm(cluster_center - coord) < 1.20
                    for coord in final_cluster_coords[:-1])
                # Only add cluster center if it is at a safe distance from others
                if not too_close:
                    final_cluster_coords.append(cluster_center)
                    site_waters.append(waters_offset)
    # otherwise store data for each user defined cluster
    else:
        # User-defined centers are kept as given; only the waters around each are collected
        final_cluster_coords = md.utils.in_units_of(init_cluster_coords,
                                                    "nanometers", "angstroms")
        site_waters = [_original_ids(cluster) for cluster in nbr_list]

    # Write clustercenter file
    write_watpdb_from_coords("clustercenterfile", final_cluster_coords)
    self.clustercenter_file = "clustercenterfile.pdb"
    print(("Final number of clusters: {0:d}".format(
        len(final_cluster_coords))))
    return np.asarray(final_cluster_coords), site_waters
# Command-line interface: dash output file, trajectory file and an optional
# topology/structure file.
parser = argparse.ArgumentParser(
    description='This script demonstrates integration of pymdash with mdtraj'
    ' to extract the representative frame for each dash state from a raw '
    'trajectory file.')
parser.add_argument('dash_out_file', help='Path to the output file from dash')
parser.add_argument('trajectory_file',
                    help='Path to the trajectory data file used to provide'
                    ' input for the dash run.')
# The help text previously pointed at the non-existent "md_traj.load_toplogy".
parser.add_argument(
    '-t',
    '--topology',
    help='Path to a suitable structure file that can be '
    'used to derive a topology by mdtraj. See mdtraj.load_topology for details'
)
args = parser.parse_args()

# Without --topology, mdtraj is given top=None when loading frames.
topology = md.load_topology(args.topology) if args.topology is not None else None

with open(args.dash_out_file) as f:
    dash = mdash.DashOutput(f)
    print('Writing pdb files for {} dash states'.format(dash.n_states))
    for state in dash.states:
        # rep_frame is presumably 1-based (TODO confirm against pymdash);
        # md.load_frame takes a 0-based frame index.
        traj = md.load_frame(args.trajectory_file,
                             state.rep_frame - 1,
                             top=topology)
        traj.save_pdb('state_{}.pdb'.format(state.index))