Exemple #1
0
def frames_from_files(files,
                      top,
                      frames,
                      chunksize=1000,
                      stride=1,
                      verbose=False,
                      copy_not_join=None):
    """
    Collect the requested frames from trajectory files into one Trajectory.

    :param files: trajectory file(s), handed to pyemma's ``source``
    :param top: topology; converted with ``_enforce_top``
    :param frames: array-like with two columns, one row per requested frame:
        column 0 is the file index, column 1 the frame index in that file.
    :param chunksize: chunk size passed to the reader's iterator
    :param stride: frame indices (column 1) are multiplied by this value
        before reading
    :param verbose: log a note when a stride != 1 is applied
    :param copy_not_join: not used
    :return: Trajectory holding the frames in the order given by ``frames``
    :raises ValueError: if a frame index lies beyond the end of its trajectory
    """
    from pyemma.coordinates import source
    # Enforce topology to be a md.Topology object
    top = _enforce_top(top)
    reader = source(files, top=top)
    stride = int(stride)

    # Work on a private copy: the stride rescaling below must not modify the
    # array owned by the caller.
    frames = np.array(frames)

    if stride != 1:
        frames[:, 1] *= stride
        if verbose:
            log.info('A stride value of = %u was parsed, '
                     'interpreting "indexes" accordingly.' % stride)

    # sort by file and frame index (file index is the primary key)
    sort_inds = np.lexsort((frames[:, 1], frames[:, 0]))
    sorted_inds = frames[sort_inds]
    assert len(sorted_inds) == len(frames)

    # sanity check: every requested frame index has to exist in its file
    for u in np.unique(sorted_inds[:, 0]):
        # only the frame column of the rows that belong to file ``u``
        largest_ind_in_traj = np.max(sorted_inds[sorted_inds[:, 0] == u, 1])
        length = reader.trajectory_length(u)
        # indices are zero-based, so ``length`` itself is already out of range
        if largest_ind_in_traj >= length:
            raise ValueError(
                "largest specified index (%i * stride=%i * %i=%i) "
                "is larger than trajectory length '%s' = %i" % (
                    largest_ind_in_traj // stride,
                    largest_ind_in_traj // stride,
                    stride, largest_ind_in_traj, reader.filenames[u],
                    length))

    with reader.iterator(chunk=chunksize,
                         stride=sorted_inds,
                         return_trajindex=False) as it:
        collected_frames = list(it)

    collected_frames = np.vstack(collected_frames)
    # frames were read in sorted order; undo the sort to restore the
    # order requested by the caller
    collected_frames = collected_frames[sort_inds.argsort()]
    collected_frames = collected_frames.reshape(-1, top.n_atoms, 3)

    return Trajectory(collected_frames, top)
def frames_from_files(files,
                      top,
                      frames,
                      chunksize=1000,
                      stride=1,
                      verbose=False,
                      copy_not_join=None,
                      reader=None):
    """
    Constructs a Trajectory object out of given frames collected from files (or given reader).

    :param files: source files (a single file name or a sequence of them)
    :param top: topology file
    :param frames: indices; rows of (file index, frame index). A one
        dimensional array of frame indices is accepted when ``files`` is a
        single file name.
    :param chunksize: chunk size passed to the reader's iterator
    :param stride: frame indices are multiplied by this value before reading
    :param verbose: log a note when a stride != 1 is applied
    :param copy_not_join: not used
    :param reader: if a reader is given, ignore files and top param!
    :return: mdtraj.Trajectory consisting out of frames indices.
    :raises ValueError: if the reader is empty or of an unsupported type, or
        if a frame index lies beyond the end of its trajectory.
    """
    # Enforce topology to be a md.Topology object
    if reader is None:
        top = _enforce_top(top)
        reader_given = False
    else:
        if not reader.number_of_trajectories():
            raise ValueError("need at least one trajectory file in reader.")
        if isinstance(reader, FragmentedTrajectoryReader):
            top = reader._readers[0][0].featurizer.topology
        elif isinstance(reader, FeatureReader):
            top = reader.featurizer.topology
        else:
            raise ValueError("unsupported reader (only md readers).")
        reader_given = True

    stride = int(stride)
    # np.array copies, so the stride rescaling below never touches the
    # caller's data
    frames = np.array(frames)

    # only one file, so we expect frames to be a one dimensional array
    if isinstance(files, str):
        files = [files]
        if frames.ndim == 1:
            # insert a constant column for file index
            frames = np.insert(np.atleast_2d(frames),
                               0,
                               np.zeros_like(frames),
                               axis=0).T

    if stride != 1:
        frames[:, 1] *= stride
        if verbose:
            log.info('A stride value of = %u was parsed, '
                     'interpreting "indexes" accordingly.' % stride)

    # sort by file and frame index
    sort_inds = np.lexsort((frames[:, 1], frames[:, 0]))
    sorted_inds = frames[sort_inds]
    assert len(sorted_inds) == len(frames)

    file_inds_unique = np.unique(sorted_inds[:, 0])
    # construct reader
    if reader is None:
        # filter out files, we would never read, because no indices are pointing to them
        reader = source(np.array(files)[file_inds_unique].tolist(), top=top)
        # re-map indices to reflect filtered files; file_inds_unique is
        # sorted, so the new (smaller or equal) index can never collide with
        # an old index that still has to be processed
        for new_index, itraj in enumerate(file_inds_unique):
            sorted_inds[sorted_inds[:, 0] == itraj, 0] = new_index

        inds_to_check = np.arange(len(file_inds_unique))
    else:
        inds_to_check = file_inds_unique

    # sanity check of indices
    for itraj in inds_to_check:
        inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj][:, 1]
        assert inds_by_traj.ndim == 1
        largest_ind_in_traj = np.max(inds_by_traj)
        length = reader.trajectory_length(itraj)
        if largest_ind_in_traj >= length:
            # floor division keeps the un-strided index an integer in the
            # message (true division would print e.g. "5.0")
            raise ValueError(
                "largest specified index ({largest_without_stride} * stride="
                "{largest_without_stride} * {stride}={largest}) "
                "is larger than trajectory length '{filename}' = {length}".
                format(largest_without_stride=largest_ind_in_traj // stride,
                       stride=stride,
                       largest=largest_ind_in_traj,
                       filename=reader.filenames[itraj],
                       length=length))

    def set_reader_return_traj_objects(reader, flag):
        # switch every underlying FeatureReader between returning features
        # and returning trajectory objects
        if isinstance(reader, FeatureReader):
            reader._return_traj_obj = flag
        elif isinstance(reader, FragmentedTrajectoryReader):
            for file in reader.filenames_flat:
                r = reader.reader_by_filename(file)
                if isinstance(r, FeatureReader):
                    r = [r]
                for _r in r:
                    _r._return_traj_obj = flag

    try:
        # If the reader got passed in, it could have the data already mapped to memory.
        # In this case, we cannot force it to return trajectory objects, so we have to re-create it.
        if reader.in_memory:
            reader = source(reader.filenames, top=top, chunksize=chunksize)
        # we want the FeatureReader to return mdtraj.Trajectory objects
        set_reader_return_traj_objects(reader, True)

        it = reader.iterator(chunk=chunksize,
                             stride=sorted_inds,
                             return_trajindex=False)

        with it:
            collected_frames = list(it)
        dest = _preallocate_empty_trajectory(top, len(frames))
        t = 0
        for chunk in collected_frames:
            _copy_traj_attributes(dest, chunk, t)
            t += len(chunk)
        # reshuffle the indices of the final trajectory object to obtain the desired order
        dest = dest.slice(sort_inds.argsort(), copy=False)
    finally:
        # in any case we want to reset the reader to its previous state (return features, instead of md.Trajectory)
        if reader_given:
            set_reader_return_traj_objects(reader, False)
    return dest
Exemple #3
0
def frames_from_files(files,
                      top,
                      frames,
                      chunksize=1000,
                      stride=1,
                      verbose=False,
                      copy_not_join=None,
                      reader=None):
    """
    Constructs a Trajectory object out of given frames collected from files (or given reader).

    :param files: source files (a single file name or a sequence of them)
    :param top: topology file
    :param frames: indices; rows of (file index, frame index). A one
        dimensional array of frame indices is accepted when ``files`` is a
        single file name.
    :param chunksize: chunk size passed to the reader's iterator
    :param stride: frame indices are multiplied by this value before reading
    :param verbose: log a note when a stride != 1 is applied
    :param copy_not_join: not used
    :param reader: if a reader is given, ignore files and top param!
    :return: mdtraj.Trajectory consisting out of frames indices.
    :raises ValueError: if the reader is empty or of an unsupported type, or
        if a frame index lies beyond the end of its trajectory.
    """
    reader_given = reader is not None

    # Enforce topology to be a md.Topology object
    if not reader_given:
        top = _enforce_top(top)
    else:
        if not reader.number_of_trajectories():
            raise ValueError("need at least one trajectory file in reader.")
        if isinstance(reader, FragmentedTrajectoryReader):
            top = reader._readers[0][0].featurizer.topology
        elif isinstance(reader, FeatureReader):
            top = reader.featurizer.topology
        else:
            raise ValueError("unsupported reader (only md readers).")

    stride = int(stride)
    # np.array copies, so the stride rescaling below never touches the
    # caller's data
    frames = np.array(frames)

    # only one file, so we expect frames to be a one dimensional array
    if isinstance(files, str):
        files = [files]
        if frames.ndim == 1:
            # insert a constant column for file index
            frames = np.insert(np.atleast_2d(frames),
                               0,
                               np.zeros_like(frames),
                               axis=0).T

    if stride != 1:
        frames[:, 1] *= stride
        if verbose:
            log.info('A stride value of = %u was parsed, '
                     'interpreting "indexes" accordingly.' % stride)

    # sort by file and frame index
    sort_inds = np.lexsort((frames[:, 1], frames[:, 0]))
    sorted_inds = frames[sort_inds]
    assert len(sorted_inds) == len(frames)

    file_inds_unique = np.unique(sorted_inds[:, 0])
    # construct reader
    if reader is None:
        # filter out files, we would never read, because no indices are pointing to them
        reader = source(np.array(files)[file_inds_unique].tolist(), top=top)
        # re-map indices to reflect filtered files; file_inds_unique is
        # sorted, so the new (smaller or equal) index can never collide with
        # an old index that still has to be processed
        for new_index, itraj in enumerate(file_inds_unique):
            sorted_inds[sorted_inds[:, 0] == itraj, 0] = new_index

        inds_to_check = np.arange(len(file_inds_unique))
    else:
        inds_to_check = file_inds_unique

    # sanity check of indices
    for itraj in inds_to_check:
        # frame column only: the file-index column must not take part in
        # the maximum
        inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj, 1]
        largest_ind_in_traj = np.max(inds_by_traj)
        length = reader.trajectory_length(itraj)
        # indices are zero-based, so ``length`` itself is already out of range
        if largest_ind_in_traj >= length:
            raise ValueError(
                "largest specified index (%i * stride=%i * %i=%i) "
                "is larger than trajectory length '%s' = %i" %
                (largest_ind_in_traj // stride, largest_ind_in_traj // stride,
                 stride, largest_ind_in_traj, reader.filenames[itraj], length))

    def set_reader_return_traj_objects(rdr, flag):
        # switch every underlying FeatureReader between returning features
        # and returning trajectory objects
        if isinstance(rdr, FeatureReader):
            rdr._return_traj_obj = flag
        elif isinstance(rdr, FragmentedTrajectoryReader):
            for file in rdr.filenames_flat:
                r = rdr.reader_by_filename(file)
                if isinstance(r, FeatureReader):
                    r = [r]
                for _r in r:
                    _r._return_traj_obj = flag

    # we want the FeatureReader to return mdtraj.Trajectory objects
    set_reader_return_traj_objects(reader, True)
    try:
        it = reader.iterator(chunk=chunksize,
                             stride=sorted_inds,
                             return_trajindex=False)
        reporter = ProgressReporter()
        reporter._progress_register(it._n_chunks,
                                    description="collecting frames")
        collected_frames = []
        with it:
            for x in it:
                collected_frames.append(x)
                reporter._progress_update(1)
        reporter._progress_force_finish()

        dest = _preallocate_empty_trajectory(top, len(frames))
        i = 0
        for chunk in collected_frames:
            _copy_traj_attributes(dest, chunk, i)
            i += len(chunk)
        # frames were read in sorted order; undo the sort to restore the
        # order requested by the caller
        dest = dest.slice(sort_inds.argsort(), copy=False)
    finally:
        # A reader owned by the caller must not stay switched to
        # "return trajectory objects" after this call.
        # NOTE(review): assumes the flag was False before the call - confirm.
        if reader_given:
            set_reader_return_traj_objects(reader, False)
    return dest
Exemple #4
0
def frames_from_file(file_name, top, frames, chunksize=100,
                     stride=1, verbose=False, copy_not_join=False):
    r"""Reads one "file_name" molecular trajectory and returns an mdtraj trajectory object
        containing only the specified "frames" in the specified order.

    The file is streamed chunk by chunk with :py:func:`mdtraj.iterload`; streaming stops as
    soon as the largest requested frame has been read.

    Parameters
    ----------
    file_name: str.
        Path to the molecular trajectory file, ex. trajout.xtc

    top : str, mdtraj.Trajectory, or mdtraj.Topology
        Topology information to load the molecular trajectory file in :py:obj:`file_name`

    frames : ndarray of shape (n_frames, ) and integer type
        Contains the frame indices to be retrieved from "file_name". There is no restriction as to what
        this array has to look like other than:
             - non-negative integers
             - smaller than the number of frames streamed from "file_name".
        "frames" need not be monotonous or unique, i.e, arrays like
        [3, 1, 4, 1, 5, 9, 9, 9, 9, 3000, 0, 0, 1] are welcome

    chunksize : integer, default is 100
        Chunk size handed to :py:func:`mdtraj.iterload`.

    stride  : integer, default is 1
        Stride handed to :py:func:`mdtraj.iterload`. The values in :py:obj:`frames` are matched against the
        running count of the frames delivered by the iterator, i.e. they index the strided trajectory.

    verbose: boolean.
        Level of verbosity while looking for "frames". Useful when using "chunksize" with large trajectories.
        It provides the no. of frames accumulated for every chunk.

    copy_not_join : boolean, default is False
        This parameter decides how geometry objects are appended onto one another. If left to False, mdtraj's own
        :py:obj:`join` method will be used. If set to True, the returned :py:obj:`traj` is preallocated with
        :py:obj:`_preallocate_empty_trajectory` and filled chunk-wise with :py:obj:`_copy_traj_attributes`.

    Returns
    -------
    traj : an md trajectory object containing the frames specified in "frames",
           in the order specified in "frames".

    Raises
    ------
    Exception
        If "frames" contains indices beyond the last frame that could be read.
    """

    assert isinstance(frames, np.ndarray), "input frames frames must be a numpy ndarray, got %s instead "%type(frames)
    assert np.ndim(frames) == 1, "input frames frames must have ndim = 1, got np.ndim = %u instead "%np.ndim(frames)
    assert isinstance(file_name, str), "input file_name must be a string, got %s instead"%type(file_name)
    assert isinstance(top, (str, md.Trajectory, md.Topology)), "input topology must of one of type: " \
                                                                    "str, mdtraj.Trajectory, or mdtraj.Topology. " \
                                                                    "Got %s instead" % type(top)
    # Enforce topology to be a md.Topology object
    top = _enforce_top(top)

    # Prepare the trajectory object
    if copy_not_join:
        traj = _preallocate_empty_trajectory(top, frames.shape[0])
    else:
        traj = None

    # Prepare the running number of accumulated frames
    cum_frames = 0

    # Because the trajectory is streamed "chronologically", but "frames" can have any arbitrary order
    # we store that order in "orig_order" to reshuffle the traj at the end
    orig_order = frames.argsort().argsort()
    sorted_frames = np.sort(frames)

    # Loop invariant: the largest requested frame tells us when to stop streaming
    largest_frame = frames.max()

    # Index of the last frame streamed so far; stays at -1 if the file yields no chunk at all
    last_read = -1

    for jj, traj_chunk in enumerate(md.iterload(file_name, top=top,
                                                chunk=chunksize, stride=stride)):

        # Global index range [i_idx, f_idx) covered by this chunk
        i_idx = jj*chunksize
        f_idx = i_idx + traj_chunk.n_frames
        last_read = f_idx - 1

        # "sorted_frames" is sorted, so the requested frames that fall into this chunk form one
        # contiguous slice. Frames that appear more than one time will be kept.
        lo = np.searchsorted(sorted_frames, i_idx, side='left')
        hi = np.searchsorted(sorted_frames, f_idx, side='left')
        # Positions relative to the start of the chunk
        good_frames = (sorted_frames[lo:hi] - i_idx).astype(np.intp)

        # Keep the good frames of this chunk
        if good_frames.size > 0:

            if copy_not_join:   # => traj has been already preallocated, see above
                traj = _copy_traj_attributes(traj, traj_chunk[good_frames], cum_frames)
            elif traj is None: # => copy_not_join is False AND 1st run
                traj = traj_chunk[good_frames]
            else: # => copy_not_join is False AND we're not on the 1st run
                traj = traj.join(traj_chunk[good_frames])

            cum_frames += good_frames.size

        if verbose:
            log.info('chunk %u of traj has size %u, indices %6u...%6u. Accumulated frames %u'
                 % (jj, traj_chunk.n_frames, i_idx, last_read, cum_frames))

        # Check if we can already stop iterating
        if last_read >= largest_frame:
            break

    # Make sure that "frames" did not contain impossible frames
    if (frames > last_read).any():
        # last_read is the last valid index, hence the number of frames is last_read + 1
        raise Exception('Cannot provide frames %s for trajectory %s with n_frames = %u'
                        % (frames[frames > last_read], file_name, last_read + 1))

    if stride != 1 and verbose:
        log.info('A stride value of = %u was parsed, interpreting "indexes" accordingly.'%stride)

    # Trajectory coordinates are returned "reshuffled" into the requested order
    return traj[orig_order]