def iterload(filename, chunk=100, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments

    This may be more memory efficient than loading an entire trajectory at once.
    The file format is chosen from the suffix of ``filename``: ``.h5``,
    ``.lh5``, ``.xtc`` and ``.dcd`` are read chunk by chunk; any other suffix
    falls back to loading the whole trajectory and slicing it.

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration. If 0, load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5,
        .lh5, and .pdb formats, which already contain topology information.
        For .h5 and .lh5 files a warning is emitted and the value is ignored.
    stride : int, default=1
        Only read every stride-th frame.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from the
        file. This may be slightly slower than the standard read because it
        requires an extra copy, but will save memory.
    skip : int, default=0
        For the chunked formats, this many frames are read and discarded
        before the first chunk is yielded. In the fallback path it is the
        start index of the first slice taken from the fully loaded trajectory.

    Yields
    ------
    Trajectory
        Successive fragments of the trajectory.

    Raises
    ------
    ValueError
        For ``.dcd`` files when ``chunk`` is not a multiple of ``stride``.
        Because this is a generator, the error surfaces on first iteration.

    See Also
    --------
    load, load_frame

    Examples
    --------
    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb'):  # doctest: +SKIP
    ...     print(chunk)  # doctest: +SKIP
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    """
    stride = kwargs.get('stride', 1)
    atom_indices = cast_indices(kwargs.get('atom_indices', None))
    if chunk % stride != 0 and filename.endswith('.dcd'):
        raise ValueError('Stride must be a divisor of chunk. stride=%d does not go '
                         'evenly into chunk=%d' % (stride, chunk))

    if chunk == 0:
        # If chunk was 0 then we want to avoid filetype-specific code in case
        # of undefined behavior in various file parsers.
        yield load(filename, **kwargs)
        return

    # NOTE: the generator is always finished with a plain ``return``. Raising
    # StopIteration inside a generator body is converted into RuntimeError
    # by PEP 479 (default behavior since Python 3.7).
    skip = kwargs.get('skip', 0)

    if filename.endswith('.h5'):
        if 'top' in kwargs:
            warnings.warn('top= kwarg ignored since file contains topology information')
        with HDF5TrajectoryFile(filename) as f:
            if skip > 0:
                # NOTE(review): the main loop below treats the return value of
                # f.read() as an object with named fields, while this line
                # unpacks exactly four values -- confirm against the reader.
                xyz, _, _, _ = f.read(skip, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
            if atom_indices is None:
                topology = f.topology
            else:
                topology = f.topology.subset(atom_indices)

            while True:
                data = f.read(chunk*stride, stride=stride, atom_indices=atom_indices)
                if data == []:
                    return
                in_units_of(data.coordinates, f.distance_unit,
                            Trajectory._distance_unit, inplace=True)
                in_units_of(data.cell_lengths, f.distance_unit,
                            Trajectory._distance_unit, inplace=True)
                yield Trajectory(xyz=data.coordinates, topology=topology,
                                 time=data.time,
                                 unitcell_lengths=data.cell_lengths,
                                 unitcell_angles=data.cell_angles)

    elif filename.endswith('.lh5'):
        if 'top' in kwargs:
            warnings.warn('top= kwarg ignored since file contains topology information')
        with LH5TrajectoryFile(filename) as f:
            if atom_indices is None:
                topology = f.topology
            else:
                topology = f.topology.subset(atom_indices)

            # Frame counter used to synthesize the time axis of each chunk.
            ptr = 0
            if skip > 0:
                # NOTE(review): the main loop below uses the return value of
                # f.read() directly as the coordinate array, while this line
                # unpacks four values -- confirm against the reader.
                xyz, _, _, _ = f.read(skip, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
            while True:
                xyz = f.read(chunk*stride, stride=stride, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
                in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                time = np.arange(ptr, ptr+len(xyz)*stride, stride)
                ptr += len(xyz)*stride
                yield Trajectory(xyz=xyz, topology=topology, time=time)

    elif filename.endswith('.xtc'):
        topology = _parse_topology(kwargs.get('top', None))
        with XTCTrajectoryFile(filename) as f:
            if skip > 0:
                xyz, _, _, _ = f.read(skip)
                if len(xyz) == 0:
                    return
            while True:
                xyz, time, step, box = f.read(chunk*stride, stride=stride,
                                              atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
                in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                in_units_of(box, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                trajectory = Trajectory(xyz=xyz, topology=topology, time=time)
                trajectory.unitcell_vectors = box
                yield trajectory

    elif filename.endswith('.dcd'):
        topology = _parse_topology(kwargs.get('top', None))
        with DCDTrajectoryFile(filename) as f:
            # Frame counter used to synthesize the time axis of each chunk.
            ptr = 0
            if skip > 0:
                xyz, _, _ = f.read(skip, atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
            while True:
                # for reasons that I have not investigated, dcdtrajectory file chunk and stride
                # together work like this method, but HDF5/XTC do not.
                xyz, box_length, box_angle = f.read(chunk, stride=stride,
                                                    atom_indices=atom_indices)
                if len(xyz) == 0:
                    return
                in_units_of(xyz, f.distance_unit, Trajectory._distance_unit,
                            inplace=True)
                in_units_of(box_length, f.distance_unit,
                            Trajectory._distance_unit, inplace=True)
                time = np.arange(ptr, ptr+len(xyz)*stride, stride)
                ptr += len(xyz)*stride
                yield Trajectory(xyz=xyz, topology=topology, time=time,
                                 unitcell_lengths=box_length,
                                 unitcell_angles=box_angle)

    else:
        # Unknown suffix: no chunked reader available, so load everything and
        # hand out slices of the in-memory trajectory.
        log.critical("loading complete traj into mem! This might no be desired.")
        t = load(filename, **kwargs)
        for i in range(skip, len(t), chunk):
            yield t[i:i+chunk]
def __init__(self, filename, chunk=1000, **kwargs):
    """Set up an iterator over a trajectory file on disk, in fragments

    This may be more memory efficient than loading an entire trajectory at
    once. The constructor resolves the topology, opens the trajectory file and
    selects the iteration mode from the type of ``stride``.

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration
        (stored as the chunk size of this iterator).

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. It is resolved through
        ``load_topology_cached`` unless the file extension is listed in
        ``_TOPOLOGY_EXTS``, in which case the value is stored unchanged.
    stride : int or np.ndarray, default=1
        An ``np.ndarray`` selects the 'random_access' mode; any other value
        selects the sequential 'traj' mode.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from the
        file. The value is normalized with ``cast_indices``.
    skip : int, default=0
        Stored for use by the iteration code.
    offsets : optional
        If given and the opened file object exposes an ``offsets`` attribute,
        the value is assigned to it.

    Any remaining keyword arguments are kept in ``self._kwargs``.

    Raises
    ------
    Exception
        If the file has a PDB extension, which is not supported as a
        trajectory format here.

    See Also
    --------
    load, load_frame

    Examples
    --------
    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb'):  # doctest: +SKIP
    ...     print(chunk)  # doctest: +SKIP
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    """
    self._filename = filename
    # Options handled by this class are popped so that only pass-through
    # options remain in ``kwargs``.
    self._stride = kwargs.pop('stride', 1)
    self._atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    self._top = kwargs.pop('top', None)
    self._skip = kwargs.pop('skip', 0)
    self._kwargs = kwargs
    self._chunksize = chunk
    self._extension = _get_extension(self._filename)
    self._closed = False
    self._seeked = False

    if self._extension not in _TOPOLOGY_EXTS:
        self._topology = load_topology_cached(self._top)
    else:
        self._topology = self._top

    # Every other extension check in this module uses a leading dot
    # ('.crd', '.mdcrd'); the dot-less spellings alone would never match such
    # values, so both spellings are rejected here.
    if self._extension in ('.pdb', '.pdb.gz', 'pdb', 'pdb.gz'):
        raise Exception("Not supported as trajectory format {ext}".format(
            ext=self._extension))

    # An index array as stride means "read exactly these frames".
    random_access = isinstance(self._stride, np.ndarray)
    self._mode = 'random_access' if random_access else 'traj'

    # These coordinate formats need the atom count passed to the opener.
    if self._extension in ('.crd', '.mdcrd'):
        self._f = md_open(self._filename, n_atoms=self._topology.n_atoms)
    else:
        self._f = md_open(self._filename)

    if random_access:
        self._ra_it = self._random_access_generator(self._f)

    # offset array handling
    # ``kwargs`` is the same dict object as ``self._kwargs``, so this pop also
    # removes 'offsets' from the stored pass-through options.
    offsets = kwargs.pop('offsets', None)
    if hasattr(self._f, 'offsets') and offsets is not None:
        self._f.offsets = offsets
def iterload(filename, chunk=100, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments

    This may be more memory efficient than loading an entire trajectory at once

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration. If 0, load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5,
        .lh5, and .pdb formats, which already contain topology information.
    stride : int or np.ndarray, default=1
        Only read every stride-th frame. If an ``np.ndarray`` is given, it is
        treated as the sequence of frame indices to read (random access).
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from the
        file. This may be slightly slower than the standard read because it
        requires an extra copy, but will save memory.
    skip : int, default=0
        Number of leading frames to skip. With ``chunk == 0`` the skip is
        applied to the already loaded (and strided) trajectory.

    Any remaining keyword arguments are forwarded to the underlying reader.

    Yields
    ------
    Trajectory
        Successive fragments of the trajectory.

    See Also
    --------
    load, load_frame

    Examples
    --------
    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb'):  # doctest: +SKIP
    ...     print(chunk)  # doctest: +SKIP
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    """
    stride = kwargs.pop('stride', 1)
    atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    top = kwargs.pop('top', None)
    skip = kwargs.pop('skip', 0)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        topology = top

    def _open_trajectory():
        # These coordinate formats need the atom count passed to the opener.
        # NOTE(review): ``open`` is called with ``n_atoms=``, so it is
        # presumably a module-level trajectory opener shadowing the builtin.
        if extension in ('.crd', '.mdcrd'):
            return open(filename, n_atoms=topology.n_atoms)
        return open(filename)

    # NOTE: the generator is always finished with a plain ``return``. Raising
    # StopIteration inside a generator body is converted into RuntimeError
    # by PEP 479 (default behavior since Python 3.7).
    if chunk == 0:
        # If chunk was 0 then we want to avoid filetype-specific code
        # in case of undefined behavior in various file parsers.
        # TODO: this will first apply stride, then skip!
        if extension not in _TOPOLOGY_EXTS:
            kwargs['top'] = top
        # stride and atom_indices were popped from kwargs above, so they have
        # to be forwarded explicitly or they would be silently ignored.
        yield load(filename, stride=stride, atom_indices=atom_indices,
                   **kwargs)[skip:]

    elif extension in ('.pdb', '.pdb.gz'):
        # the PDBTrajectortFile class doesn't follow the standard API. Fixing it
        # to support iterload could be worthwhile, but requires a deep refactor.
        t = load(filename, stride=stride, atom_indices=atom_indices)
        for i in range(0, len(t), chunk):
            yield t[i:i + chunk]

    elif isinstance(stride, np.ndarray):
        with _open_trajectory() as f:
            x_prev = 0       # last frame index of the previous group
            curr_size = 0    # frames collected for the chunk being assembled
            traj = []        # pieces of the chunk being assembled
            leftovers = []
            # enumerate index minus value is constant within a run of
            # consecutive frame indices, so groupby yields one group per run.
            for _, g in groupby(enumerate(stride), lambda a: a[0] - a[1]):
                grouped_stride = list(map(itemgetter(1), g))
                seek_offset = (1 if x_prev != 0 else 0)
                seek_to = grouped_stride[0] - x_prev - seek_offset
                # Relative seek from the current file position.
                f.seek(seek_to, whence=1)
                x_prev = grouped_stride[-1]
                group_size = len(grouped_stride)
                if curr_size + group_size > chunk:
                    # The run does not fit into the current chunk; it is read
                    # piecewise in the loop below.
                    leftovers = grouped_stride
                else:
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, group_size, topology,
                        **kwargs)
                    traj.append(local_traj)
                    curr_size += len(grouped_stride)
                if curr_size == chunk:
                    yield _efficient_traj_join(traj)
                    curr_size = 0
                    traj = []
                while leftovers:
                    local_chunk = leftovers[:min(chunk, len(leftovers))]
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, len(local_chunk),
                        topology, **kwargs)
                    traj.append(local_traj)
                    leftovers = leftovers[min(chunk, len(leftovers)):]
                    curr_size += len(local_chunk)
                    if curr_size == chunk:
                        yield _efficient_traj_join(traj)
                        curr_size = 0
                        traj = []
            # Flush a final, partially filled chunk.
            if traj:
                yield _efficient_traj_join(traj)
            return

    else:
        with _open_trajectory() as f:
            if skip > 0:
                f.seek(skip)
            while True:
                if extension not in _TOPOLOGY_EXTS:
                    traj = f.read_as_traj(topology, n_frames=chunk * stride,
                                          stride=stride,
                                          atom_indices=atom_indices, **kwargs)
                else:
                    traj = f.read_as_traj(n_frames=chunk * stride,
                                          stride=stride,
                                          atom_indices=atom_indices, **kwargs)
                if len(traj) == 0:
                    return
                yield traj