def load_restrt(filename, top=None, atom_indices=None):
    """Read an AMBER ASCII restart/inpcrd file into a trajectory.

    The restart file holds no topology information, so one has to be
    supplied through ``top``.

    Parameters
    ----------
    filename : str
        Name of the AMBER restart file.
    top : {str, Trajectory, Topology}
        Source of the topology; handed to ``_parse_topology`` unchanged
        (e.g. the path of a PDB or AMBER prmtop file, or an in-memory
        object carrying a topology).
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read.

    Returns
    -------
    trajectory : md.Trajectory
        Whatever ``AmberRestartFile.read_as_traj`` produces.

    See Also
    --------
    mdtraj.AmberRestartFile : Low level interface to AMBER restart files
    """
    from mdtraj.core.trajectory import _parse_topology

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)

    with AmberRestartFile(filename) as restart_file:
        traj = restart_file.read_as_traj(parsed_top, atom_indices=selection)
    return traj
def __init__(self, path, mode='r', topology=None, stride=1,
             atom_indices=None, verbose=False):
    """Set up a read-only collection of trajectory files.

    Parameters
    ----------
    path : str or list of str
        Either a glob pattern (expanded, then naturally sorted with
        ``_keynat``) or an explicit list of file names. A leading ``~``
        is expanded in both cases.
    mode : str
        Must be ``'r'``.
    topology : optional
        Topology source handed to ``_parse_topology``. If it is a path,
        a leading ``~`` is expanded first; any other object is passed
        through untouched. ``None`` leaves ``self._topology`` as None.
    stride : int
        Stored on the instance as ``self.stride``.
    atom_indices : optional
        Stored on the instance as ``self.atom_indices``.
    verbose : bool
        Stored on the instance as ``self.verbose``.

    Raises
    ------
    ValueError
        If ``mode`` is anything other than ``'r'``.
    """
    if mode != 'r':
        raise ValueError('mode must be "r"')

    self.path = path
    self.topology = topology
    self.stride = stride
    self.atom_indices = atom_indices
    self.verbose = verbose

    if isinstance(path, list):
        self.glob_matches = [expanduser(fn) for fn in path]
    else:
        self.glob_matches = sorted(glob.glob(expanduser(path)), key=_keynat)

    if topology is None:
        self._topology = None
    else:
        # expanduser() only understands path-like values; it raises for
        # anything else, so a non-path topology is forwarded as-is
        # instead of crashing here.
        try:
            topology_source = os.path.expanduser(topology)
        except (TypeError, AttributeError):
            topology_source = topology
        self._topology = _parse_topology(topology_source)
def load(filename, chunks=10, **kwargs):
    """Load a trajectory lazily, chunk by chunk.

    A loader that will mimic mdtraj.Trajectory.load, but construct a
    dasktraj.Trajectory with a dask.array as xyz.

    Parameters
    ----------
    filename : str
        File to load.
    chunks : int
        Number of frames per chunk.
    **kwargs
        ``top`` is consumed here; everything else is forwarded to
        ``load_chunks``.

    Returns
    -------
    traj : Trajectory
        Trajectory built from the delayed chunk data.
    """
    top = kwargs.pop('top', None)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        # Previously `topology` stayed unbound on this path and the
        # Trajectory construction below failed with a NameError.
        topology = top

    # Only the frame count is needed, so close the handle right away
    # instead of leaking it.
    with open(filename) as handle:
        length = len(handle)

    n_chunks, frames_left = divmod(length, chunks)
    if frames_left != 0:
        n_chunks += 1

    # TODO this needs to be closed at some point
    data = load_chunks(filename, extension, chunks, range(n_chunks), **kwargs)

    # TODO: use this to construct unitcells
    # Pop out irrelevant info
    uv = data.pop('unitcell_vectors')
    traj = Trajectory(topology=topology, delayed_objects=data, **data)
    if uv is not None:
        traj.unitcell_vectors = uv
    return traj
def load_mdcrd(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Read an AMBER mdcrd file into a trajectory.

    Parameters
    ----------
    filename : path-like
        Path of the AMBER mdcrd file.
    top : {str, Trajectory, Topology}
        Source of the topology, which the mdcrd file itself does not
        provide. Required, despite the ``None`` default.
    stride : int, default=None
        Only read every stride-th frame.
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read.
    frame : int, optional
        If given, the reader seeks to this frame and exactly one frame
        is requested. If None, the whole file is read.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None.
    TypeError
        If ``filename`` is neither a string nor an ``os.PathLike``.

    See Also
    --------
    mdtraj.MDCRDTrajectoryFile : Low level interface to MDCRD files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    # `top` is optional in the signature but mandatory in practice: this
    # function is usually reached through load(), which forwards `top`
    # from **kwargs, and a missing value should produce a clear message.
    if top is None:
        raise ValueError('"top" argument is required for load_mdcrd')
    if not isinstance(filename, (string_types, os.PathLike)):
        raise TypeError('filename must be of type path-like for load_mdcrd. '
                        'you supplied %s' % type(filename))

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)
    single_frame = frame is not None

    with MDCRDTrajectoryFile(filename, parsed_top.n_atoms) as mdcrd:
        if single_frame:
            mdcrd.seek(frame)
        return mdcrd.read_as_traj(parsed_top,
                                  n_frames=1 if single_frame else None,
                                  stride=stride,
                                  atom_indices=selection)
def load_netcdf(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Read an AMBER NetCDF file into a trajectory.

    The NetCDF format holds no topology information, so one has to be
    supplied through ``top``.

    Parameters
    ----------
    filename : str
        Filename of the AMBER NetCDF file.
    top : {str, Trajectory, Topology}
        Source of the topology. Required, despite the ``None`` default.
    stride : int, default=None
        Only read every stride-th frame.
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read.
    frame : int, optional
        If given, the reader seeks to this frame and exactly one frame
        is requested. If None, the whole file is read.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None.

    See Also
    --------
    mdtraj.NetCDFTrajectoryFile : Low level interface to NetCDF files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    if top is None:
        raise ValueError('"top" argument is required for load_netcdf')

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)

    with NetCDFTrajectoryFile(filename) as netcdf:
        frames_to_read = None
        if frame is not None:
            netcdf.seek(frame)
            frames_to_read = 1
        return netcdf.read_as_traj(parsed_top, n_frames=frames_to_read,
                                   atom_indices=selection, stride=stride)
def load_mdcrd(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Read an AMBER mdcrd file into a trajectory.

    Parameters
    ----------
    filename : str
        String filename of the AMBER mdcrd file.
    top : {str, Trajectory, Topology}
        Source of the topology, which the mdcrd file itself does not
        provide. Required, despite the ``None`` default.
    stride : int, default=None
        Only read every stride-th frame.
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read.
    frame : int, optional
        If given, the reader seeks to this frame and exactly one frame
        is requested. If None, the whole file is read.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None.
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.MDCRDTrajectoryFile : Low level interface to MDCRD files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    # `top` is optional in the signature but mandatory in practice: this
    # function is usually reached through load(), which forwards `top`
    # from **kwargs, and a missing value should produce a clear message.
    if top is None:
        raise ValueError('"top" argument is required for load_mdcrd')
    if not isinstance(filename, string_types):
        raise TypeError('filename must be of type string for load_mdcrd. '
                        'you supplied %s' % type(filename))

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)
    single_frame = frame is not None

    with MDCRDTrajectoryFile(filename, parsed_top.n_atoms) as mdcrd:
        if single_frame:
            mdcrd.seek(frame)
        return mdcrd.read_as_traj(parsed_top,
                                  n_frames=1 if single_frame else None,
                                  stride=stride,
                                  atom_indices=selection)
def load_netcdf(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Read an AMBER NetCDF file into a trajectory.

    The NetCDF format holds no topology information, so one has to be
    supplied through ``top``.

    Parameters
    ----------
    filename : str
        Filename of the AMBER NetCDF file.
    top : {str, Trajectory, Topology}
        Source of the topology; handed to ``_parse_topology``.
    stride : int, default=None
        Only read every stride-th frame. Not used when ``frame`` is
        given.
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read and
        the topology is reduced to the same subset.
    frame : int, optional
        If given, seek to this frame and read exactly one frame. If
        None, the whole file is read.

    Returns
    -------
    trajectory : md.Trajectory
        Trajectory with coordinates and cell lengths converted from the
        file's distance unit to ``Trajectory._distance_unit``.

    See Also
    --------
    mdtraj.NetCDFTrajectoryFile : Low level interface to NetCDF files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)
    if selection is not None:
        parsed_top = parsed_top.subset(selection)

    with NetCDFTrajectoryFile(filename) as netcdf:
        if frame is None:
            frames = netcdf.read(stride=stride, atom_indices=selection)
        else:
            netcdf.seek(frame)
            frames = netcdf.read(n_frames=1, atom_indices=selection)
        coords, times, box_lengths, box_angles = frames

        coords = in_units_of(coords, netcdf.distance_unit,
                             Trajectory._distance_unit, inplace=True)
        box_lengths = in_units_of(box_lengths, netcdf.distance_unit,
                                  Trajectory._distance_unit, inplace=True)

    return Trajectory(xyz=coords, topology=parsed_top, time=times,
                      unitcell_lengths=box_lengths,
                      unitcell_angles=box_angles)
def load_restrt(filename, top=None, atom_indices=None):
    """Read an AMBER ASCII restart/inpcrd file into a trajectory.

    The restart file holds no topology information, so one has to be
    supplied through ``top``.

    Parameters
    ----------
    filename : str
        Name of the AMBER restart file.
    top : {str, Trajectory, Topology}
        Source of the topology; handed to ``_parse_topology`` (e.g. the
        path of a PDB or AMBER prmtop file, or an in-memory object
        carrying a topology).
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read and
        the topology is reduced to the same subset.

    Returns
    -------
    trajectory : md.Trajectory
        Trajectory with coordinates and cell lengths converted from the
        file's distance unit to ``Trajectory._distance_unit``.

    See Also
    --------
    mdtraj.AmberRestartFile : Low level interface to AMBER restart files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)
    if selection is not None:
        parsed_top = parsed_top.subset(selection)

    with AmberRestartFile(filename) as restart_file:
        coords, times, box_lengths, box_angles = restart_file.read(
            atom_indices=selection)
        coords = in_units_of(coords, restart_file.distance_unit,
                             Trajectory._distance_unit, inplace=True)
        box_lengths = in_units_of(box_lengths, restart_file.distance_unit,
                                  Trajectory._distance_unit, inplace=True)

    return Trajectory(xyz=coords, topology=parsed_top, time=times,
                      unitcell_lengths=box_lengths,
                      unitcell_angles=box_angles)
def load_xml(filename, top=None):
    """Load a single conformation from an OpenMM XML file.

    The OpenMM serialized state XML format contains additional
    information that is not read by this method, including forces,
    energies, and velocities. Here, we just read the positions and the
    box vectors.

    Parameters
    ----------
    filename : string
        The path on disk to the XML file
    top : {str, Trajectory, Topology}
        The XML format does not contain topology information. Pass in
        either the path to a pdb file, a trajectory, or a topology to
        supply this information.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.
    """
    # xml.etree.cElementTree was removed in Python 3.9; ElementTree
    # offers the same API.
    import xml.etree.ElementTree as etree
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    def _xyz(element):
        # Read the x/y/z attributes of one element as a float triple.
        return (float(element.attrib['x']),
                float(element.attrib['y']),
                float(element.attrib['z']))

    topology = _parse_topology(top)
    root = etree.parse(filename).getroot()

    # One (x, y, z) tuple per child of <Positions>.
    positions = [_xyz(position) for position in root.find('Positions')]

    # The three box vectors are the A, B and C children of
    # <PeriodicBoxVectors>.
    vectors = root.find('PeriodicBoxVectors')
    box = [_xyz(vectors.find(name)) for name in ('A', 'B', 'C')]

    traj = Trajectory(xyz=np.array(positions), topology=topology)
    traj.unitcell_vectors = np.array(box).reshape(1, 3, 3)
    return traj
def load_xml(filename, top=None):
    """Load a single conformation from an OpenMM XML file.

    The OpenMM serialized state XML format contains additional
    information that is not read by this method, including forces,
    energies, and velocities. Here, we just read the positions and the
    box vectors.

    Parameters
    ----------
    filename : path-like
        The path on disk to the XML file
    top : {str, Trajectory, Topology}
        The XML format does not contain topology information. Pass in
        either the path to a pdb file, a trajectory, or a topology to
        supply this information.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.
    """
    # xml.etree.cElementTree was removed in Python 3.9; ElementTree
    # offers the same API.
    import xml.etree.ElementTree as etree
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    def _xyz(element):
        # Read the x/y/z attributes of one element as a float triple.
        return (float(element.attrib['x']),
                float(element.attrib['y']),
                float(element.attrib['z']))

    topology = _parse_topology(top)
    root = etree.parse(filename).getroot()

    # One (x, y, z) tuple per child of <Positions>.
    positions = [_xyz(position) for position in root.find('Positions')]

    # The three box vectors are the A, B and C children of
    # <PeriodicBoxVectors>.
    vectors = root.find('PeriodicBoxVectors')
    box = [_xyz(vectors.find(name)) for name in ('A', 'B', 'C')]

    traj = Trajectory(xyz=np.array(positions), topology=topology)
    traj.unitcell_vectors = np.array(box).reshape(1, 3, 3)
    return traj
def load_restrt(filename, top=None, atom_indices=None):
    """Read an AMBER ASCII restart/inpcrd file into a trajectory.

    The restart file holds no topology information, so one has to be
    supplied through ``top``.

    Parameters
    ----------
    filename : str
        Name of the AMBER restart file.
    top : {str, Trajectory, Topology}
        Source of the topology; handed to ``_parse_topology`` (e.g. the
        path of a PDB or AMBER prmtop file, or an in-memory object
        carrying a topology).
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read and
        the topology is reduced to the same subset.

    Returns
    -------
    trajectory : md.Trajectory
        Trajectory with coordinates and cell lengths converted from the
        file's distance unit to ``Trajectory._distance_unit``.

    See Also
    --------
    mdtraj.AmberRestartFile : Low level interface to AMBER restart files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)
    if selection is not None:
        parsed_top = parsed_top.subset(selection)

    with AmberRestartFile(filename) as restart_file:
        frame_data = restart_file.read(atom_indices=selection)
        coords, times, box_lengths, box_angles = frame_data
        coords = in_units_of(coords, restart_file.distance_unit,
                             Trajectory._distance_unit, inplace=True)
        box_lengths = in_units_of(box_lengths, restart_file.distance_unit,
                                  Trajectory._distance_unit, inplace=True)

    result = Trajectory(xyz=coords, topology=parsed_top, time=times,
                        unitcell_lengths=box_lengths,
                        unitcell_angles=box_angles)
    return result
def load(filename, chunks=10, **kwargs):
    """
    A loader that will mimic :py:func:`mdtraj.Trajectory.load()`, but
    construct a :py:class:`dasktraj.Trajectory` with a
    :py:class:`dask.array` as xyz, time, and unitcell properties.

    Parameters
    ----------
    filename : string
        Filename of the file to load.
    chunks : int
        Number of frames per chunk.
    **kwargs
        ``top`` is consumed here; everything else is forwarded to
        ``load_chunks``.

    Returns
    -------
    trajectory
        A :py:class:`dasktraj.Trajectory`
    """
    top = kwargs.pop("top", None)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        # Previously `topology` stayed unbound on this path and the
        # Trajectory construction below failed with a NameError.
        topology = top

    filename = os.path.abspath(filename)

    # Only the frame count is needed, so close the handle right away
    # instead of leaking it.
    with open(filename) as handle:
        length = len(handle)

    n_chunks, frames_left = divmod(length, chunks)
    if frames_left != 0:
        n_chunks += 1

    # TODO this needs to be closed at some point
    data = load_chunks(filename, extension, chunks, range(n_chunks), **kwargs)

    # TODO: use this to construct unitcells
    # Pop out irrelevant info
    uv = data.pop("unitcell_vectors")
    traj = Trajectory(topology=topology, delayed_objects=data, **data)
    if uv is not None:
        traj.unitcell_vectors = uv
    return traj
def load_mdcrd(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Read an AMBER mdcrd file into a trajectory.

    Parameters
    ----------
    filename : str
        String filename of the AMBER mdcrd file.
    top : {str, Trajectory, Topology}
        Source of the topology, which the mdcrd file itself does not
        provide. Required, despite the ``None`` default.
    stride : int, default=None
        Only read every stride-th frame. Not used when ``frame`` is
        given.
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read and
        the topology is reduced to the same subset.
    frame : int, optional
        If given, seek to this frame and read exactly one frame. If
        None, the whole file is read.

    Returns
    -------
    trajectory : md.Trajectory
        Trajectory whose ``time`` is the frame index (offset by
        ``frame`` or scaled by ``stride``). Unit cell information is set
        only when the file provided cell lengths; angles are then taken
        to be 90 degrees.

    Raises
    ------
    ValueError
        If ``top`` is None.
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.MDCRDTrajectoryFile : Low level interface to MDCRD files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    # `top` is optional in the signature but mandatory in practice: this
    # function is usually reached through load(), which forwards `top`
    # from **kwargs, and a missing value should produce a clear message.
    if top is None:
        raise ValueError('"top" argument is required for load_mdcrd')
    if not isinstance(filename, string_types):
        raise TypeError('filename must be of type string for load_mdcrd. '
                        'you supplied %s' % type(filename))

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)

    # The file is opened with the atom count of the full topology; the
    # subset is applied to the topology only after reading.
    with MDCRDTrajectoryFile(filename, n_atoms=parsed_top._numAtoms) as mdcrd:
        if frame is None:
            coords, box_lengths = mdcrd.read(stride=stride,
                                             atom_indices=selection)
        else:
            mdcrd.seek(frame)
            coords, box_lengths = mdcrd.read(n_frames=1,
                                             atom_indices=selection)

        in_units_of(coords, mdcrd.distance_unit, Trajectory._distance_unit,
                    inplace=True)
        has_box = box_lengths is not None
        if has_box:
            in_units_of(box_lengths, mdcrd.distance_unit,
                        Trajectory._distance_unit, inplace=True)
            # Assume that its a rectilinear box
            box_angles = 90.0 * np.ones_like(box_lengths)

    if selection is not None:
        parsed_top = parsed_top.subset(selection)

    frame_times = np.arange(len(coords))
    if frame is not None:
        frame_times += frame
    elif stride is not None:
        frame_times *= stride

    result = Trajectory(xyz=coords, topology=parsed_top, time=frame_times)
    if has_box:
        result.unitcell_lengths = box_lengths
        result.unitcell_angles = box_angles
    return result
def main(args, verbose=True):
    """Run the main script.

    Reads every input file chunk by chunk, optionally slices out the
    requested frames, converts units/fields for the output format and
    appends the result to the output file.

    Parameters
    ----------
    args : argparse.Namespace
        The collected command line arguments
    verbose : bool
        If True, print a running frame/atom count to stdout.

    Raises
    ------
    RuntimeError
        If ``args.index`` is set and a chunk contains a value that is
        neither a numpy array nor an ``md.Topology``.
    """
    if args.atom_indices is not None:
        atom_indices = np.loadtxt(args.atom_indices, int)
    else:
        atom_indices = None

    out_x = ext(args.output)
    out_units = units[out_x]
    out_fields = fields[out_x]
    OutFileFormat = formats[out_x]

    in_x = ext(args.input[0])
    InFileFormat = formats[in_x]

    if args.topology is not None:
        topology = _parse_topology(args.topology)
    else:
        topology = None

    if topology is not None and atom_indices is not None:
        topology = topology.subset(atom_indices)

    n_total = 0
    if args.index is not None:
        assert len(args.input) == 1
        # when chunk is None, we load up ALL of the frames. this isn't
        # strictly necessary, and it costs more memory, but it's ALOT
        # harder to get the code correct when we need to use data[start:end]
        # notation when all of the data isn't loaded up at once. it's easy
        # for hdf5 and netcdf, but for the others...
        assert args.chunk is None

    # this is the normal invocation pattern, but for PDBTrajectoryFile it's
    # different
    outfile_factory = functools.partial(OutFileFormat, args.output, 'w',
                                        force_overwrite=args.force)

    with outfile_factory() as outfile:
        for fn in args.input:
            assert in_x == ext(fn)
            with InFileFormat(fn, 'r') as infile:
                while True:
                    data, in_units, n_frames = read(infile, args.chunk,
                                                    stride=args.stride,
                                                    atom_indices=atom_indices)
                    if n_frames == 0:
                        break

                    if topology is not None:
                        # if the user supplied a topology, we should probably
                        # do some simple checks
                        n_atoms_read = data['xyz'].shape[1]
                        if n_atoms_read != topology._numAtoms:
                            warnings.warn(
                                'The number of atoms read from the trajectory '
                                '(%d) does not match the number of atoms in '
                                'the topology (%d)'
                                % (n_atoms_read, topology._numAtoms))
                        data['topology'] = topology

                    # if they want a specific set of frames, get those
                    # with slice notation
                    if args.index is not None:
                        _data = {}
                        for k, v in iteritems(data):
                            if isinstance(v, np.ndarray):
                                # we don't want the dimensionality to go deficient
                                if isinstance(args.index, int):
                                    _data[k] = v[np.newaxis, args.index]
                                else:
                                    _data[k] = v[args.index]
                            elif isinstance(v, md.Topology):
                                _data[k] = v
                            else:
                                # was `RuntineError`, which raised NameError
                                raise RuntimeError()
                        data = _data
                        # NOTE(review): looks like leftover debug output;
                        # kept so stdout is unchanged.
                        print(list(data.keys()))
                        n_frames = len(data['xyz'])

                    convert(data, in_units, out_units, out_fields)
                    write(outfile, data)
                    n_total += n_frames

                    if verbose:
                        sys.stdout.write('\rconverted %d frames, %d atoms'
                                         % (n_total, data['xyz'].shape[1]))
                        sys.stdout.flush()

    if verbose:
        print(' ')
def load_lammpstrj(filename, top=None, stride=None, atom_indices=None,
                   frame=None, unit_set='real'):
    """Load a LAMMPS trajectory file.

    Parameters
    ----------
    filename : str
        String filename of LAMMPS trajectory file.
    top : {str, Trajectory, Topology}
        The lammpstrj format does not contain topology information. Pass
        in either the path to a pdb file, a trajectory, or a topology to
        supply this information. Required, despite the ``None`` default.
    stride : int, default=None
        Only read every stride-th frame. Not used when ``frame`` is
        given.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates
        from the file; the topology is reduced to the same subset.
    frame : int, optional
        If given, seek to this frame and read exactly one frame. If
        None, the whole file is read.
    unit_set : str, optional
        The LAMMPS unit set that the simulation was performed in. See
        http://lammps.sandia.gov/doc/units.html for options. Currently
        supported unit sets: 'real'.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None or ``unit_set`` is not supported.
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.LAMMPSTrajectoryFile : Low level interface to lammpstrj files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    # We make `top` required. Although this is a little weird, its good
    # because this function is usually called by a dispatch from load(),
    # where top comes from **kwargs. So if its not supplied, we want to
    # give the user an informative error message.
    if top is None:
        raise ValueError('"top" argument is required for load_lammpstrj')
    if not isinstance(filename, string_types):
        # The original mixed a %s placeholder with str.format(), so the
        # offending type never made it into the message.
        raise TypeError('filename must be of type string for load_lammpstrj. '
                        'you supplied %s' % type(filename))

    topology = _parse_topology(top)
    atom_indices = cast_indices(atom_indices)
    if atom_indices is not None:
        topology = topology.subset(atom_indices)

    with LAMMPSTrajectoryFile(filename) as f:
        # TODO: Support other unit sets.
        if unit_set == 'real':
            # Assignment, not comparison: `==` here was a no-op.
            f.distance_unit = 'angstroms'
        else:
            raise ValueError(
                'Unsupported unit set specified: {0}.'.format(unit_set))

        if frame is not None:
            f.seek(frame)
            xyz, cell_lengths, cell_angles = f.read(
                n_frames=1, atom_indices=atom_indices)
        else:
            xyz, cell_lengths, cell_angles = f.read(
                stride=stride, atom_indices=atom_indices)

        in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                    inplace=True)
        # Cell lengths are in the same unit as the coordinates and must be
        # converted along with them.
        if cell_lengths is not None:
            in_units_of(cell_lengths, f.distance_unit,
                        Trajectory._distance_unit, inplace=True)

    time = np.arange(len(xyz))
    if frame is not None:
        time += frame
    elif stride is not None:
        time *= stride

    t = Trajectory(xyz=xyz, topology=topology, time=time)
    t.unitcell_lengths = cell_lengths
    t.unitcell_angles = cell_angles
    return t
def iterload(filename, chunk=100, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments

    This may be more memory efficient than loading an entire trajectory at once

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration.  If 0, load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5,
        .lh5, and .pdb formats, which already contain topology information.
    stride : int or np.ndarray, default=1
        Only read every stride-th frame. If an array is given it is
        treated as the explicit list of frame indices to read.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from the
        file. This may be slightly slower than the standard read because it
        requires an extra copy, but will save memory.
    skip : int, default=0
        Number of leading frames to skip.

    See Also
    --------
    load, load_frame

    Examples
    --------

    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb')  # doctest: +SKIP
    ...     print(chunk)                                            # doctest: +SKIP

    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>

    """
    stride = kwargs.pop('stride', 1)
    atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    top = kwargs.pop('top', None)
    skip = kwargs.pop('skip', 0)

    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        topology = top

    def _open_trajectory_file():
        # .crd/.mdcrd files are opened with an explicit atom count.
        if extension in ('.crd', '.mdcrd'):
            return open(filename, n_atoms=topology.n_atoms)
        return open(filename)

    if chunk == 0:
        # If chunk was 0 then we want to avoid filetype-specific code
        # in case of undefined behavior in various file parsers.
        # TODO: this will first apply stride, then skip!
        # NOTE(review): stride and atom_indices were popped from kwargs
        # above and are not forwarded to load() here - confirm intent.
        if extension not in _TOPOLOGY_EXTS:
            kwargs['top'] = top
        yield load(filename, **kwargs)[skip:]
    elif extension in ('.pdb', '.pdb.gz'):
        # the PDBTrajectortFile class doesn't follow the standard API. Fixing it
        # to support iterload could be worthwhile, but requires a deep refactor.
        t = load(filename, stride=stride, atom_indices=atom_indices)
        for i in range(0, len(t), chunk):
            yield t[i:i + chunk]
    elif isinstance(stride, np.ndarray):
        with _open_trajectory_file() as f:
            x_prev = 0
            curr_size = 0
            traj = []
            leftovers = []
            # Group the requested indices into runs of consecutive frames
            # (index minus position is constant within a run).
            for _, g in groupby(enumerate(stride), lambda a: a[0] - a[1]):
                grouped_stride = list(map(itemgetter(1), g))
                seek_offset = (1 if x_prev != 0 else 0)
                seek_to = grouped_stride[0] - x_prev - seek_offset
                f.seek(seek_to, whence=1)
                x_prev = grouped_stride[-1]
                group_size = len(grouped_stride)
                if curr_size + group_size > chunk:
                    leftovers = grouped_stride
                else:
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, group_size, topology,
                        **kwargs)
                    traj.append(local_traj)
                    curr_size += len(grouped_stride)
                if curr_size == chunk:
                    yield _efficient_traj_join(traj)
                    curr_size = 0
                    traj = []
                # A run that did not fit is consumed in chunk-sized pieces.
                while leftovers:
                    local_chunk = leftovers[:min(chunk, len(leftovers))]
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, len(local_chunk),
                        topology, **kwargs)
                    traj.append(local_traj)
                    leftovers = leftovers[min(chunk, len(leftovers)):]
                    curr_size += len(local_chunk)
                    if curr_size == chunk:
                        yield _efficient_traj_join(traj)
                        curr_size = 0
                        traj = []
            if traj:
                yield _efficient_traj_join(traj)
            # Plain return: raising StopIteration inside a generator turns
            # into RuntimeError under PEP 479.
            return
    else:
        with _open_trajectory_file() as f:
            if skip > 0:
                f.seek(skip)
            while True:
                if extension not in _TOPOLOGY_EXTS:
                    traj = f.read_as_traj(topology, n_frames=chunk * stride,
                                          stride=stride,
                                          atom_indices=atom_indices, **kwargs)
                else:
                    traj = f.read_as_traj(n_frames=chunk * stride,
                                          stride=stride,
                                          atom_indices=atom_indices, **kwargs)

                if len(traj) == 0:
                    # End of file (see PEP 479 note above).
                    return

                yield traj
def load_netcdf(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Read an AMBER NetCDF file into a trajectory.

    The NetCDF format holds no topology information, so one has to be
    supplied through ``top``.

    Parameters
    ----------
    filename : str
        Filename of the AMBER NetCDF file.
    top : {str, Trajectory, Topology}
        Source of the topology; handed to ``_parse_topology``.
    stride : int, default=None
        Only read every stride-th frame. Not used when ``frame`` is
        given.
    atom_indices : array_like, optional
        If not None, only the coordinates of these atoms are read and
        the topology is reduced to the same subset.
    frame : int, optional
        If given, seek to this frame and read exactly one frame. If
        None, the whole file is read.

    Returns
    -------
    trajectory : md.Trajectory
        Trajectory with coordinates and cell lengths converted from the
        file's distance unit to ``Trajectory._distance_unit``.

    See Also
    --------
    mdtraj.NetCDFTrajectoryFile : Low level interface to NetCDF files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    parsed_top = _parse_topology(top)
    selection = cast_indices(atom_indices)
    if selection is not None:
        parsed_top = parsed_top.subset(selection)

    with NetCDFTrajectoryFile(filename) as netcdf:
        if frame is None:
            frames = netcdf.read(stride=stride, atom_indices=selection)
        else:
            netcdf.seek(frame)
            frames = netcdf.read(n_frames=1, atom_indices=selection)
        coords, times, box_lengths, box_angles = frames

        coords = in_units_of(coords, netcdf.distance_unit,
                             Trajectory._distance_unit, inplace=True)
        box_lengths = in_units_of(box_lengths, netcdf.distance_unit,
                                  Trajectory._distance_unit, inplace=True)

    result = Trajectory(xyz=coords, topology=parsed_top, time=times,
                        unitcell_lengths=box_lengths,
                        unitcell_angles=box_angles)
    return result
def iterload(filename, chunk=100, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments

    This may be more memory efficient than loading an entire trajectory at once

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration.  If 0, load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5,
        .lh5, and .pdb formats, which already contain topology information.
    stride : int, default=1
        Only read every stride-th frame.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from the
        file. This may be slightly slower than the standard read because it
        requires an extra copy, but will save memory.
    skip : int, default=0
        Number of leading frames to skip.

    Raises
    ------
    ValueError
        For .dcd files, if ``stride`` does not divide ``chunk`` evenly.

    See Also
    --------
    load, load_frame

    Examples
    --------

    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb')  # doctest: +SKIP
    ...     print(chunk)                                            # doctest: +SKIP

    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>

    """
    stride = kwargs.get('stride', 1)
    atom_indices = cast_indices(kwargs.get('atom_indices', None))
    if chunk % stride != 0 and filename.endswith('.dcd'):
        raise ValueError('Stride must be a divisor of chunk. stride=%d does not go '
                         'evenly into chunk=%d' % (stride, chunk))

    # Every "end of data" exit below is a plain `return`: raising
    # StopIteration inside a generator becomes RuntimeError under PEP 479.
    if chunk == 0:
        # If chunk was 0 then we want to avoid filetype-specific code in
        # case of undefined behavior in various file parsers.
        yield load(filename, **kwargs)
        return

    skip = kwargs.get('skip', 0)
    if filename.endswith('.h5'):
        if 'top' in kwargs:
            warnings.warn('top= kwarg ignored since file contains topology information')
        with HDF5TrajectoryFile(filename) as f:
            if skip > 0:
                xyz, _, _, _ = f.read(skip, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
            if atom_indices is None:
                topology = f.topology
            else:
                topology = f.topology.subset(atom_indices)

            while True:
                data = f.read(chunk*stride, stride=stride,
                              atom_indices=atom_indices)
                if data == []:
                    return
                in_units_of(data.coordinates, f.distance_unit,
                            Trajectory._distance_unit, inplace=True)
                in_units_of(data.cell_lengths, f.distance_unit,
                            Trajectory._distance_unit, inplace=True)
                yield Trajectory(xyz=data.coordinates, topology=topology,
                                 time=data.time,
                                 unitcell_lengths=data.cell_lengths,
                                 unitcell_angles=data.cell_angles)

    elif filename.endswith('.lh5'):
        if 'top' in kwargs:
            warnings.warn('top= kwarg ignored since file contains topology information')
        with LH5TrajectoryFile(filename) as f:
            if atom_indices is None:
                topology = f.topology
            else:
                topology = f.topology.subset(atom_indices)

            ptr = 0
            if skip > 0:
                xyz, _, _, _ = f.read(skip, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
            while True:
                xyz = f.read(chunk*stride, stride=stride,
                             atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
                in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                # Time is the running frame index, since none is read here.
                time = np.arange(ptr, ptr+len(xyz)*stride, stride)
                ptr += len(xyz)*stride
                yield Trajectory(xyz=xyz, topology=topology, time=time)

    elif filename.endswith('.xtc'):
        topology = _parse_topology(kwargs.get('top', None))
        with XTCTrajectoryFile(filename) as f:
            if skip > 0:
                xyz, _, _, _ = f.read(skip)
                if len(xyz) == 0:
                    return
            while True:
                xyz, time, step, box = f.read(chunk*stride, stride=stride,
                                              atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
                in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                in_units_of(box, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                trajectory = Trajectory(xyz=xyz, topology=topology, time=time)
                trajectory.unitcell_vectors = box
                yield trajectory

    elif filename.endswith('.dcd'):
        topology = _parse_topology(kwargs.get('top', None))
        with DCDTrajectoryFile(filename) as f:
            ptr = 0
            if skip > 0:
                xyz, _, _ = f.read(skip, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
            while True:
                # for reasons that I have not investigated, dcdtrajectory file chunk and stride
                # together work like this method, but HDF5/XTC do not.
                xyz, box_length, box_angle = f.read(chunk, stride=stride,
                                                    atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
                in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                in_units_of(box_length, f.distance_unit,
                            Trajectory._distance_unit, inplace=True)
                time = np.arange(ptr, ptr+len(xyz)*stride, stride)
                ptr += len(xyz)*stride
                yield Trajectory(xyz=xyz, topology=topology, time=time,
                                 unitcell_lengths=box_length,
                                 unitcell_angles=box_angle)

    else:
        log.critical("loading complete traj into mem! This might no be desired.")
        t = load(filename, **kwargs)
        for i in range(skip, len(t), chunk):
            yield t[i:i+chunk]
def __init__(self, filename, chunk=100, **kwargs):
    """Prepare chunked iteration over a trajectory file on disk.

    Reading a trajectory in fragments may be more memory efficient than
    loading the entire trajectory at once.

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration.
        If 0, load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. For extensions listed in ``_TOPOLOGY_EXTS``
        the value is stored as given instead of being parsed.
    stride : int or np.ndarray, default=1
        Only read every stride-th frame. Passing an ``np.ndarray`` switches
        the iterator into its 'random_access' mode.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file. The value is normalised with ``cast_indices``.
    skip : int, default=0
        In the plain 'traj' mode, the opened file is positioned with
        ``seek(skip)`` before iteration starts (only when ``skip > 0``).
    offsets : optional
        In the plain 'traj' mode, assigned to the opened file's ``offsets``
        attribute when the file object has one.

    See Also
    --------
    load, load_frame

    Examples
    --------
    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb'):  # doctest: +SKIP
    ...     print(chunk)                                             # doctest: +SKIP
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    """
    self._filename = filename
    self._stride = kwargs.pop('stride', 1)
    self._atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    self._top = kwargs.pop('top', None)
    self._skip = kwargs.pop('skip', 0)
    # Whatever is left over is kept; note this is the same dict object that
    # 'offsets' is popped from further down.
    self._kwargs = kwargs
    self._chunksize = chunk
    self._extension = _get_extension(self._filename)
    self._closed = False

    # Formats that carry their own topology keep `top` untouched.
    if self._extension in _TOPOLOGY_EXTS:
        self._topology = self._top
    else:
        self._topology = _parse_topology(self._top)

    def _open_trajectory():
        # .crd/.mdcrd readers must be told the atom count up front.
        if self._extension in ('.crd', '.mdcrd'):
            return open(self._filename, n_atoms=self._topology.n_atoms)
        return open(self._filename)

    self._mode = None
    is_pdb = self._extension in ('.pdb', '.pdb.gz')
    if self._chunksize > 0 and is_pdb:
        # PDB files are loaded completely and then handed out in slices.
        self._mode = 'pdb'
        self._t = load(self._filename, stride=self._stride,
                       atom_indices=self._atom_indices)
        self._i = 0
    elif isinstance(self._stride, np.ndarray):
        self._mode = 'random_access'
        self._f = _open_trajectory()
        self._ra_it = self._random_access_generator(self._f)
    else:
        self._mode = 'traj'
        self._f = _open_trajectory()

        # offset array handling
        offsets = kwargs.pop('offsets', None)
        if hasattr(self._f, 'offsets') and offsets is not None:
            self._f.offsets = offsets

        if self._skip > 0:
            self._f.seek(self._skip)
def load_topology(self, fn):
    """Remember the topology source, parse it, and validate the result.

    Parameters
    ----------
    fn : object
        Topology source handed unchanged to ``_parse_topology``; it is also
        stored on the instance as ``self.fn``.
    """
    # Record the source first so it is available even if parsing fails.
    self.fn = fn
    parsed = _parse_topology(fn)
    self.topology = parsed
    self.check_topology()
def load_xyz(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Load a xyz trajectory file.

    While there is no universal standard for this format, this plugin adheres
    to the same format as the VMD plugin:

    http://www.ks.uiuc.edu/Research/vmd/plugins/molfile/xyzplugin.html

    Most notably, units are in angstroms and anything past the 'z' field is
    ignored.

    Parameters
    ----------
    filename : str
        String filename of xyz trajectory file.
    top : {str, Trajectory, Topology}
        The xyz format does not contain topology information. Pass in either
        the path to a pdb file, a trajectory, or a topology to supply this
        information.
    stride : int, default=None
        Only read every stride-th frame
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file.
    frame : int, optional
        Use this option to load only a single frame from a trajectory on
        disk. If frame is None, the default, the entire trajectory will be
        loaded. If supplied, ``stride`` will be ignored.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None.
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.XYZTrajectoryFile : Low level interface to xyz files
    """
    # `top` is optional in the signature but required here: this function is
    # usually reached through load()'s dispatch, where `top` arrives via
    # **kwargs, so a missing value should produce an informative error.
    if top is None:
        raise ValueError('"top" argument is required for load_xyz')

    if not isinstance(filename, string_types):
        # %-interpolation is used on purpose: the message contains a printf
        # style placeholder, which str.format() would leave as a literal "%s".
        raise TypeError('filename must be of type string for load_xyz. '
                        'you supplied %s' % (type(filename),))

    # Function-scope import, deferred until the arguments have been validated.
    from mdtraj.core.trajectory import _parse_topology

    topology = _parse_topology(top)
    atom_indices = cast_indices(atom_indices)

    with XYZTrajectoryFile(filename) as f:
        if frame is not None:
            # Single-frame request: jump to it and read exactly one frame.
            f.seek(frame)
            n_frames = 1
        else:
            n_frames = None

        return f.read_as_traj(topology, n_frames=n_frames, stride=stride,
                              atom_indices=atom_indices)
def load_arc(filename, top=None, stride=None, atom_indices=None):
    """Load a TINKER .arc file from disk.

    Parameters
    ----------
    filename : str
        String filename of TINKER .arc file.
    top : {str, Trajectory, Topology}, optional
        Source of topology information (the path to a pdb file, a trajectory,
        or a topology). When None, the topology exposed by the opened
        ``ArcTrajectoryFile`` is used instead.
    stride : int, default=None
        Only read every stride-th frame
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file. The topology is reduced to the same subset.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.ArcTrajectoryFile : Low level interface to TINKER .arc files
    """
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    if not isinstance(filename, string_types):
        raise TypeError('filename must be of type string for load_arc. '
                        'you supplied %s' % type(filename))

    atom_indices = cast_indices(atom_indices)

    with ArcTrajectoryFile(filename) as arc_file:
        xyz, abc, ang = arc_file.read(stride=stride, atom_indices=atom_indices)

        # Convert coordinates and box lengths in place to Trajectory's unit.
        source_unit = arc_file.distance_unit
        in_units_of(xyz, source_unit, Trajectory._distance_unit, inplace=True)
        in_units_of(abc, source_unit, Trajectory._distance_unit, inplace=True)

        # An explicit `top` wins; otherwise use the file's own topology.
        topology = arc_file.topology if top is None else _parse_topology(top)
        if atom_indices is not None:
            topology = topology.subset(atom_indices)

    time = np.arange(len(xyz))
    if stride is not None:
        # Loaded with a stride: the time field should reflect the original
        # frame spacing.
        time *= stride

    return Trajectory(xyz=xyz, topology=topology, time=time,
                      unitcell_lengths=abc, unitcell_angles=ang)
def load_xyz(filename, top=None, stride=None, atom_indices=None, frame=None):
    """Load a xyz trajectory file.

    While there is no universal standard for this format, this plugin adheres
    to the same format as the VMD plugin:

    http://www.ks.uiuc.edu/Research/vmd/plugins/molfile/xyzplugin.html

    Most notably, units are in angstroms and anything past the 'z' field is
    ignored.

    Parameters
    ----------
    filename : str
        String filename of xyz trajectory file.
    top : {str, Trajectory, Topology}
        The xyz format does not contain topology information. Pass in either
        the path to a pdb file, a trajectory, or a topology to supply this
        information.
    stride : int, default=None
        Only read every stride-th frame
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file. The topology is reduced to the same subset.
    frame : int, optional
        Use this option to load only a single frame from a trajectory on
        disk. If frame is None, the default, the entire trajectory will be
        loaded. If supplied, ``stride`` will be ignored.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None.
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.XYZTrajectoryFile : Low level interface to xyz files
    """
    # `top` is optional in the signature but required here: this function is
    # usually reached through load()'s dispatch, where `top` arrives via
    # **kwargs, so a missing value should produce an informative error.
    if top is None:
        raise ValueError('"top" argument is required for load_xyz')

    if not isinstance(filename, string_types):
        # %-interpolation is used on purpose: the message contains a printf
        # style placeholder, which str.format() would leave as a literal "%s".
        raise TypeError('filename must be of type string for load_xyz. '
                        'you supplied %s' % (type(filename),))

    # Function-scope import, deferred until the arguments have been validated.
    from mdtraj.core.trajectory import _parse_topology, Trajectory

    topology = _parse_topology(top)
    atom_indices = cast_indices(atom_indices)
    if atom_indices is not None:
        topology = topology.subset(atom_indices)

    with XYZTrajectoryFile(filename) as f:
        if frame is not None:
            # Single-frame request: jump to it and read exactly one frame.
            f.seek(frame)
            xyz = f.read(n_frames=1, atom_indices=atom_indices)
        else:
            xyz = f.read(stride=stride, atom_indices=atom_indices)

        # Convert in place from the file's unit to Trajectory's unit.
        in_units_of(xyz, f.distance_unit, Trajectory._distance_unit, inplace=True)

    # Frame indices double as time stamps; shift or scale them so they refer
    # to positions in the file rather than in the loaded array.
    time = np.arange(len(xyz))
    if frame is not None:
        time += frame
    elif stride is not None:
        time *= stride

    return Trajectory(xyz=xyz, topology=topology, time=time)
def load_gsd(filename, top=None, start=None, n_frames=None, stride=None,
             atom_indices=None, frame=None):
    """Load a GSD trajectory file.

    Parameters
    ----------
    filename : path-like
        Path of GSD trajectory file.
    top : {path-like, Trajectory, Topology}, None
        A pdb file, a trajectory, or a topology to supply topology information
        If None, topology information will be parsed from the GSD file
        (via ``load_gsd_topology``).
    start : int, None
        First frame to convert
    n_frames : int, None
        Number of frames after `start` to convert
    stride : int
        Read only every stride-th frame.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file.
    frame : int, optional
        Use this option to load only a single frame from a trajectory on
        disk. If frame is None, the default, the entire trajectory will be
        loaded. If supplied, ``start``, ``n_frames`` and ``stride`` are not
        used.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    TypeError
        If ``filename`` is neither a string nor an ``os.PathLike``.
    """
    if not isinstance(filename, (string_types, os.PathLike)):
        # %-interpolation is used on purpose: the message contains a printf
        # style placeholder, which str.format() would leave as a literal "%s".
        raise TypeError('filename must be of type path-like for load_gsd. '
                        'you supplied %s' % (type(filename),))

    # Function-scope imports, deferred until the arguments have been validated.
    from mdtraj.core.trajectory import Trajectory, _parse_topology
    import gsd.hoomd

    if top is not None:
        topology = _parse_topology(top)
    else:
        topology = load_gsd_topology(filename)

    atom_indices = cast_indices(atom_indices)

    with gsd.hoomd.open(filename, 'rb') as f:
        if frame is not None:
            # Single-frame request: read just that snapshot and wrap it.
            xyz, vectors, time = read_snapshot(frame, f[frame], topology,
                                               atom_indices=atom_indices)
            t = Trajectory(xyz=np.array(xyz), topology=topology,
                           time=np.array([time]))
            t.unitcell_vectors = np.reshape(vectors, (-1, 3, 3))
            return t

        return hoomdtraj_to_traj(f, topology, start=start, n_frames=n_frames,
                                 stride=stride, atom_indices=atom_indices)
def load_lammpstrj(filename, top=None, stride=None, atom_indices=None,
                   frame=None, unit_set='real'):
    """Load a LAMMPS trajectory file.

    Parameters
    ----------
    filename : str
        String filename of LAMMPS trajectory file.
    top : {str, Trajectory, Topology}
        The lammpstrj format does not contain topology information. Pass in
        either the path to a pdb file, a trajectory, or a topology to supply
        this information.
    stride : int, default=None
        Only read every stride-th frame
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file.
    frame : int, optional
        Use this option to load only a single frame from a trajectory on
        disk. If frame is None, the default, the entire trajectory will be
        loaded. If supplied, ``stride`` will be ignored.
    unit_set : str, optional
        The LAMMPS unit set that the simulation was performed in. See
        http://lammps.sandia.gov/doc/units.html for options. Currently
        supported unit sets: 'real'.

    Returns
    -------
    trajectory : md.Trajectory
        The resulting trajectory, as an md.Trajectory object.

    Raises
    ------
    ValueError
        If ``top`` is None, or if ``unit_set`` is not a supported unit set.
    TypeError
        If ``filename`` is not a string.

    See Also
    --------
    mdtraj.LAMMPSTrajectoryFile : Low level interface to lammpstrj files
    """
    # `top` is optional in the signature but required here: this function is
    # usually reached through load()'s dispatch, where `top` arrives via
    # **kwargs, so a missing value should produce an informative error.
    if top is None:
        raise ValueError('"top" argument is required for load_lammpstrj')

    if not isinstance(filename, string_types):
        # %-interpolation is used on purpose: the message contains a printf
        # style placeholder, which str.format() would leave as a literal "%s".
        raise TypeError('filename must be of type string for load_lammpstrj. '
                        'you supplied %s' % (type(filename),))

    # Function-scope import, deferred until the arguments have been validated.
    from mdtraj.core.trajectory import _parse_topology

    topology = _parse_topology(top)
    atom_indices = cast_indices(atom_indices)

    with LAMMPSTrajectoryFile(filename) as f:
        # TODO: Support other unit sets.
        if unit_set == 'real':
            # Assignment, not comparison: the original `==` was a statement
            # with no effect, so the unit was never actually set here.
            f.distance_unit = 'angstroms'
        else:
            raise ValueError('Unsupported unit set specified: {0}.'.format(unit_set))

        if frame is not None:
            # Single-frame request: jump to it and read exactly one frame.
            f.seek(frame)
            n_frames = 1
        else:
            n_frames = None

        return f.read_as_traj(topology, n_frames=n_frames, stride=stride,
                              atom_indices=atom_indices)