def load(filename, chunks=10, **kwargs):
    """A loader that will mimic ``mdtraj.Trajectory.load``, but construct a
    ``dasktraj.Trajectory`` with a ``dask.array`` as xyz.

    Parameters
    ----------
    filename : str
        Filename of the trajectory file to load.
    chunks : int
        Number of frames per chunk.

    Returns
    -------
    Trajectory
        A ``dasktraj.Trajectory`` backed by delayed per-chunk reads.
    """
    from contextlib import closing

    top = kwargs.pop('top', None)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        # BUGFIX: `topology` was previously left unbound for
        # topology-containing formats, raising UnboundLocalError at the
        # Trajectory(...) call below.  Mirrors the handling in iterload.
        topology = top

    # BUGFIX: the handle returned by `open` was never closed (the old TODO
    # admitted as much).  `open` here resolves to the trajectory-format
    # opener (its result supports len()) -- NOTE(review): confirm against
    # this module's imports.
    with closing(open(filename)) as f:
        length = len(f)

    # Ceil division: any leftover frames get one extra chunk.
    n_chunks, frames_left = divmod(length, chunks)
    if frames_left != 0:
        n_chunks += 1

    data = load_chunks(filename, extension, chunks, range(n_chunks), **kwargs)

    # TODO: use this to construct unitcells
    # Pop out info the Trajectory constructor does not take as a keyword.
    uv = data.pop('unitcell_vectors')
    traj = Trajectory(topology=topology, delayed_objects=data, **data)
    if uv is not None:
        traj.unitcell_vectors = uv
    return traj
def load(filename, chunks=10, **kwargs):
    """A loader that will mimic :py:func:`mdtraj.Trajectory.load()`, but
    construct a :py:class:`dasktraj.Trajectory` with a :py:class:`dask.array`
    as xyz, time, and unitcell properties.

    Parameters
    ----------
    filename : string
        Filename of the file to load.
    chunks : int
        Number of frames per chunk.

    Returns
    -------
    trajectory
        A :py:class:`dasktraj.Trajectory`
    """
    from contextlib import closing

    top = kwargs.pop("top", None)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        # BUGFIX: `topology` was previously left unbound for
        # topology-containing formats, raising UnboundLocalError at the
        # Trajectory(...) call below.  Mirrors the handling in iterload.
        topology = top

    # Absolute path so the delayed chunk readers are robust to later chdir.
    filename = os.path.abspath(filename)

    # BUGFIX: the handle returned by `open` was never closed (the old TODO
    # admitted as much).  `open` here resolves to the trajectory-format
    # opener (its result supports len()) -- NOTE(review): confirm against
    # this module's imports.
    with closing(open(filename)) as f:
        length = len(f)

    # Ceil division: any leftover frames get one extra chunk.
    n_chunks, frames_left = divmod(length, chunks)
    if frames_left != 0:
        n_chunks += 1

    data = load_chunks(filename, extension, chunks, range(n_chunks), **kwargs)

    # TODO: use this to construct unitcells
    # Pop out info the Trajectory constructor does not take as a keyword.
    uv = data.pop("unitcell_vectors")
    traj = Trajectory(topology=topology, delayed_objects=data, **data)
    if uv is not None:
        traj.unitcell_vectors = uv
    return traj
def __init__(self, filename, chunk=1000, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments.

    This may be more memory efficient than loading an entire trajectory at
    once.

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk.
    chunk : int
        Number of frames to load at once from disk per iteration.  If 0,
        load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5,
        .lh5, and .pdb formats, which already contain topology information.
    stride : int, default=None
        Only read every stride-th frame.  May also be an ndarray of explicit
        frame indices, which selects random-access mode.
    atom_indices : array_like, optional
        If not None, then read only a subset of the atoms coordinates from
        the file. This may be slightly slower than the standard read because
        it requires an extra copy, but will save memory.

    See Also
    --------
    load, load_frame
    """
    self._filename = filename
    self._stride = kwargs.pop('stride', 1)
    self._atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    self._top = kwargs.pop('top', None)
    self._skip = kwargs.pop('skip', 0)
    # NOTE: self._kwargs aliases `kwargs`; the 'offsets' pop below
    # intentionally removes that key from it as well.
    self._kwargs = kwargs
    self._chunksize = chunk
    self._extension = _get_extension(self._filename)
    self._closed = False
    self._seeked = False

    if self._extension not in _TOPOLOGY_EXTS:
        self._topology = load_topology_cached(self._top)
    else:
        self._topology = self._top

    # BUGFIX: extensions elsewhere in this module carry a leading dot
    # ('.crd', '.mdcrd', '.pdb' in iterload), so the old ('pdb', 'pdb.gz')
    # comparison could never match and PDB files slipped through.
    # ValueError is a subclass of the old bare Exception, so existing
    # handlers still catch it.
    if self._extension in ('.pdb', '.pdb.gz'):
        raise ValueError("Not supported as trajectory format {ext}".format(
            ext=self._extension))

    # The Amber text formats need the atom count to be parsed; every other
    # format opens from the filename alone.  (This replaces two identical
    # copies of an immediately-invoked lambda with one plain conditional.)
    if self._extension in ('.crd', '.mdcrd'):
        self._f = md_open(self._filename, n_atoms=self._topology.n_atoms)
    else:
        self._f = md_open(self._filename)

    if isinstance(self._stride, np.ndarray):
        # An explicit array of frame indices selects random-access mode.
        self._mode = 'random_access'
        self._ra_it = self._random_access_generator(self._f)
    else:
        self._mode = 'traj'

    # Optional pre-computed frame offsets can be injected to skip the
    # initial index scan, for file classes that expose an `offsets` attr.
    offsets = kwargs.pop('offsets', None)
    if hasattr(self._f, 'offsets') and offsets is not None:
        self._f.offsets = offsets
def iterload(filename, chunk=100, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments.

    This may be more memory efficient than loading an entire trajectory at
    once.

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration.  If 0,
        load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5,
        .lh5, and .pdb formats, which already contain topology information.
    stride : int, default=None
        Only read every stride-th frame.  May also be an ndarray of explicit
        frame indices, which selects random-access mode.
    atom_indices : array_like, optional
        If not none, then read only a subset of the atoms coordinates from
        the file. This may be slightly slower than the standard read because
        it requires an extra copy, but will save memory.

    See Also
    --------
    load, load_frame

    Examples
    --------
    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb') # doctest: +SKIP
    ...     print chunk # doctest: +SKIP
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    """
    stride = kwargs.pop('stride', 1)
    atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    top = kwargs.pop('top', None)
    skip = kwargs.pop('skip', 0)

    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        topology = top

    def _open_traj_file():
        # The Amber text formats need the atom count to be parsed; every
        # other format opens from the filename alone.  (Replaces two
        # identical immediately-invoked lambdas.)
        if extension in ('.crd', '.mdcrd'):
            return open(filename, n_atoms=topology.n_atoms)
        return open(filename)

    if chunk == 0:
        # If chunk was 0 then we want to avoid filetype-specific code
        # in case of undefined behavior in various file parsers.
        # TODO: this will first apply stride, then skip!
        if extension not in _TOPOLOGY_EXTS:
            kwargs['top'] = top
        yield load(filename, **kwargs)[skip:]
    elif extension in ('.pdb', '.pdb.gz'):
        # the PDBTrajectortFile class doesn't follow the standard API. Fixing it
        # to support iterload could be worthwhile, but requires a deep refactor.
        t = load(filename, stride=stride, atom_indices=atom_indices)
        for i in range(0, len(t), chunk):
            yield t[i:i + chunk]
    elif isinstance(stride, np.ndarray):
        # Random access: `stride` is an explicit array of frame indices.
        with _open_traj_file() as f:
            x_prev = 0
            curr_size = 0
            traj = []
            leftovers = []
            # Group consecutive frame indices (constant index-position
            # difference) so each run is one seek plus a contiguous read.
            for k, g in groupby(enumerate(stride), lambda a: a[0] - a[1]):
                grouped_stride = list(map(itemgetter(1), g))
                seek_offset = (1 if x_prev != 0 else 0)
                seek_to = grouped_stride[0] - x_prev - seek_offset
                f.seek(seek_to, whence=1)
                x_prev = grouped_stride[-1]
                group_size = len(grouped_stride)
                if curr_size + group_size > chunk:
                    # This run would overflow the chunk; drain it below.
                    leftovers = grouped_stride
                else:
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, group_size, topology,
                        **kwargs)
                    traj.append(local_traj)
                    curr_size += len(grouped_stride)
                if curr_size == chunk:
                    yield _efficient_traj_join(traj)
                    curr_size = 0
                    traj = []
            while leftovers:
                local_chunk = leftovers[:min(chunk, len(leftovers))]
                local_traj = _get_local_traj_object(
                    atom_indices, extension, f, len(local_chunk), topology,
                    **kwargs)
                traj.append(local_traj)
                leftovers = leftovers[min(chunk, len(leftovers)):]
                curr_size += len(local_chunk)
                if curr_size == chunk:
                    yield _efficient_traj_join(traj)
                    curr_size = 0
                    traj = []
            if traj:
                yield _efficient_traj_join(traj)
            # BUGFIX (PEP 479): raising StopIteration inside a generator is a
            # RuntimeError on Python 3.7+; a plain return ends the generator.
            return
    else:
        with _open_traj_file() as f:
            if skip > 0:
                f.seek(skip)
            while True:
                if extension not in _TOPOLOGY_EXTS:
                    traj = f.read_as_traj(
                        topology, n_frames=chunk * stride, stride=stride,
                        atom_indices=atom_indices, **kwargs)
                else:
                    traj = f.read_as_traj(
                        n_frames=chunk * stride, stride=stride,
                        atom_indices=atom_indices, **kwargs)
                if len(traj) == 0:
                    # BUGFIX (PEP 479): `return`, not `raise StopIteration()`.
                    return
                yield traj