Ejemplo n.º 1
0
def get_count_matrix_from_assignments(assignments, n_states=None, lag_time=1, sliding_window=True, use_mask=False):
    """
    Calculate counts matrix from `assignments`.

    Parameters
    ----------
    assignments : ndarray
        A 2d ndarray containing the state assignments.
    n_states : int, optional
        Can be automatically determined, unless you want a model with more
        states than are observed.
    lag_time : int, optional
        The lag time (in frames) with which to estimate the count matrix.
        Default: 1
    sliding_window : bool, optional
        Use a sliding window.  Default: True
    use_mask : bool, optional
        Use a mask object in computing the count matrix. Default: False

    Returns
    -------
    counts : sparse matrix
        `counts[i, j]` stores the number of times in the assignments that a
        trajectory went from state i to state j in `lag_time` frames.

    Raises
    ------
    ValueError
        If `n_states` has to be inferred and no non-negative state index is
        present in `assignments`.

    Notes
    -----
    assignments are input as iterables over numpy 1-d arrays of integers.
    For example a 2-d array where assignments[i,j] gives the ith trajectory,
    jth frame.
    Leading -1 padding of each trajectory is stripped here; the rest of the
    trajectory is handed to `get_counts_from_traj` as is.
    If the number of states is not given explicitly, it will be determined as
    one plus the largest state index of the assignments.
    Sliding window yields non-independent samples, but wastes less data.
    """

    check_assignment_array_input(assignments)

    if not n_states:
        # A single np.max is not enough, because it cannot handle a list of
        # 1-d arrays of different lengths.
        n_states = 1 + int(np.max([np.max(a) for a in assignments]))
        if n_states < 1:
            raise ValueError(
                "cannot infer n_states: assignments contain no non-negative state index")

    # NOTE: float32 is kept from the original implementation even though the
    # entries are counts.
    C = scipy.sparse.lil_matrix((int(n_states), int(n_states)), dtype='float32')

    # enumerate() provides the trajectory index directly; searching for the
    # (already trimmed) trajectory inside `assignments` is unreliable.
    for traj_num, A in enumerate(assignments):
        first_entry = np.where(A != -1)[0]
        # Skip trajectories that consist of padding only.
        if len(first_entry) == 0:
            continue

        # Skip pre-padded negative ones (this addresses issues with Tarjan
        # trimming results).
        A = A[first_entry[0]:]

        traj_mask = None
        if use_mask:
            # NOTE(review): assumes a module-level `mask` object exposing
            # get_dihedral_mask(traj_num) -- confirm.  The previous code bound
            # a local named `mask` to None and then called a method on it,
            # which always raised AttributeError.
            traj_mask = mask.get_dihedral_mask(traj_num)

        # NOTE(review): assumes the get_counts_from_traj in scope accepts a
        # `mask` keyword -- confirm.
        C = C + get_counts_from_traj(A, n_states, lag_time=lag_time,
                                     sliding_window=sliding_window, mask=traj_mask)

    return C
Ejemplo n.º 2
0
def get_count_matrix_from_assignments(assignments, n_states=None, lag_time=1, sliding_window=True):
    """
    Accumulate the transition count matrix over all trajectories in
    `assignments`.

    Parameters
    ----------
    assignments : ndarray
        A 2d ndarray of state assignments, where assignments[i, j] is the
        state of trajectory i at frame j.  More generally, an iterable over
        1-d integer arrays.
    n_states : int, optional
        Number of states.  When omitted (or falsy) it is set to one plus the
        largest state index found in `assignments`; pass it explicitly to get
        a model with more states than are observed.
    lag_time : int, optional
        Lag, in frames, forwarded to `get_counts_from_traj`.  Default: 1
    sliding_window : bool, optional
        Forwarded to `get_counts_from_traj`.  Default: True

    Returns
    -------
    counts : sparse matrix
        Sum of the per-trajectory count matrices: `counts[i, j]` is the
        number of times a trajectory went from state i to state j in
        `lag_time` frames.

    Raises
    ------
    ValueError
        If `n_states` has to be inferred and the largest state index found
        is negative.

    Notes
    -----
    Leading -1 padding is stripped from each trajectory before counting; the
    remainder (including any trailing -1s) is passed to
    `get_counts_from_traj`.  Trajectories made up entirely of -1 are skipped.
    Sliding window yields non-independent samples, but wastes less data.
    """

    check_assignment_array_input(assignments)

    if not n_states:
        # Take the maximum trajectory by trajectory: one np.max over the whole
        # input cannot cope with a list of 1-d arrays of different lengths.
        largest_per_traj = [np.max(traj) for traj in assignments]
        n_states = int(np.max(largest_per_traj)) + 1
        if n_states < 1:
            raise ValueError()

    # float32 storage is inherited from the original code (open question
    # there: why floats for a count matrix?).
    size = int(n_states)
    count_matrix = scipy.sparse.lil_matrix((size, size), dtype='float32')

    for traj in assignments:
        populated = np.where(traj != -1)[0]
        if len(populated) < 1:
            # nothing but padding in this trajectory
            continue

        # Drop the pre-padded -1 entries (this addressed issues with Tarjan
        # trimming results).
        trimmed = traj[populated[0]:]
        count_matrix = count_matrix + get_counts_from_traj(
            trimmed, n_states, lag_time=lag_time, sliding_window=sliding_window)

    return count_matrix
Ejemplo n.º 3
0
def get_counts_from_traj(states, n_states=None, lag_time=1, sliding_window=True):
    """Computes the transition count matrix for a sequence of states (single trajectory).

    Parameters
    ----------
    states : array
        A one-dimensional array of integers representing the sequence of
        states.  Valid state indices lie in the range [0, n_states).
        Negative entries (e.g. -1 padding) are tolerated: transitions into or
        out of them are not counted.
    n_states : int
        The total number of states. If not specified, the largest integer in
        the states array plus one will be used.
    lag_time : int, optional
        The time delay over which transitions are counted
    sliding_window : bool, optional
        If True, every frame starts a transition; otherwise only every
        `lag_time`-th frame does.

    Returns
    -------
    C : sparse matrix of integers
        The computed transition count matrix (COO format, shape
        (n_states, n_states)).

    Raises
    ------
    ValueError
        Propagated from scipy when a state index does not fit into an
        (n_states, n_states) matrix.
    """

    check_assignment_array_input(states, ndim=1)

    if not n_states:
        n_states = np.max(states) + 1

    # The two branches of the original differed only in the slice step.
    step = 1 if sliding_window else lag_time
    from_states = states[:-lag_time:step]
    to_states = states[lag_time::step]
    assert from_states.shape == to_states.shape

    # np.vstack is the supported spelling of the deprecated np.row_stack alias.
    transitions = np.vstack((from_states, to_states))

    # Ignore transitions in and out of negative states (typically -1
    # padding).  Filtering up front replaces the former "build, catch
    # ValueError, patch and rebuild" approach, which also left explicit zero
    # entries at (0, 0) in the result.
    valid = np.all(transitions >= 0, axis=0)
    transitions = transitions[:, valid]
    counts = np.ones(transitions.shape[1], dtype=int)

    return scipy.sparse.coo_matrix((counts, transitions),
                                   shape=(n_states, n_states))
Ejemplo n.º 4
0
def get_counts_from_traj(states, n_states=None, lag_time=1, sliding_window=True):
    """Computes the transition count matrix for a sequence of states (single trajectory).

    Parameters
    ----------
    states : array
        A one-dimensional array of integers representing the sequence of
        states.  Valid state indices lie in the range [0, n_states).
        Negative entries (e.g. -1 padding) are tolerated: transitions into or
        out of them are not counted.
    n_states : int
        The total number of states. If not specified, the largest integer in
        the states array plus one will be used.
    lag_time : int, optional
        The time delay over which transitions are counted
    sliding_window : bool, optional
        If True, every frame starts a transition; otherwise only every
        `lag_time`-th frame does.

    Returns
    -------
    C : sparse matrix of integers
        The computed transition count matrix (COO format, shape
        (n_states, n_states)).

    Raises
    ------
    ValueError
        Propagated from scipy when a state index does not fit into an
        (n_states, n_states) matrix.
    """

    check_assignment_array_input(states, ndim=1)

    if not n_states:
        n_states = np.max(states) + 1

    # The two branches of the original differed only in the slice step.
    step = 1 if sliding_window else lag_time
    from_states = states[:-lag_time:step]
    to_states = states[lag_time::step]
    assert from_states.shape == to_states.shape

    # np.vstack is the supported spelling of the deprecated np.row_stack alias.
    transitions = np.vstack((from_states, to_states))

    # Ignore transitions in and out of negative states (typically -1
    # padding).  Filtering up front replaces the former "build, catch
    # ValueError, patch and rebuild" approach, which also left explicit zero
    # entries at (0, 0) in the result.
    valid = np.all(transitions >= 0, axis=0)
    transitions = transitions[:, valid]
    counts = np.ones(transitions.shape[1], dtype=int)

    return scipy.sparse.coo_matrix((counts, transitions),
                                   shape=(n_states, n_states))
Ejemplo n.º 5
0
def invert_assignments(assignments):
    """Invert an assignments array -- that is, produce a mapping
    from state -> traj/frame

    Parameters
    ----------
    assignments : np.ndarray
        2D array of MSMBuilder assignments

    Returns
    -------
    inverse_mapping : collections.defaultdict
        Mapping from state -> (trajs, frames), a pair of equally long numpy
        arrays such that assignments[trajs[k], frames[k]] == state for
        every k.  The pair can be used directly as an index into
        `assignments`.

    Notes
    -----
    The assignments array may have -1's, which are simply placeholders;
    they are not added to the inverted assignments.

    Because the result is a defaultdict, looking up a state that was never
    observed (including -1) does not raise KeyError: it inserts and returns
    a pair of empty lists.  Use ``state in inverse_mapping`` to test for
    presence without modifying the mapping.
    """

    check_assignment_array_input(assignments)

    inverse_mapping = defaultdict(lambda: ([], []))
    # we do not care about -1's
    non_neg_inds = np.array(np.where(assignments != -1)).T

    for (i, j) in non_neg_inds:
        trajs, frames = inverse_mapping[assignments[i, j]]
        trajs.append(i)
        frames.append(j)

    # Convert from lists to numpy arrays.  dict.iteritems() exists only on
    # Python 2; items() works on both major versions, and the list() snapshot
    # keeps the loop independent of the dict being updated.
    for key, (trajs, frames) in list(inverse_mapping.items()):
        inverse_mapping[key] = (np.array(trajs), np.array(frames))

    return inverse_mapping
Ejemplo n.º 6
0
def invert_assignments(assignments):
    """Build the inverse of an assignments array: for every state, the
    trajectory/frame positions assigned to it.

    Parameters
    ----------
    assignments : np.ndarray
        2D array of MSMBuilder assignments

    Returns
    -------
    inverse_mapping : collections.defaultdict
        Mapping from state -> (trajs, frames), a pair of equally long numpy
        arrays such that assignments[trajs[k], frames[k]] == state for
        every k.

    Notes
    -----
    Entries equal to -1 are placeholders and are left out of the mapping.
    The result is a defaultdict, so looking up a state that is not present
    (including -1) inserts and returns a pair of empty lists.
    """

    check_assignment_array_input(assignments)

    inverse_mapping = defaultdict(lambda: ([], []))

    # Walk over every position that holds a real state (anything but -1).
    for traj_idx, frame_idx in zip(*np.where(assignments != -1)):
        state_trajs, state_frames = inverse_mapping[assignments[traj_idx, frame_idx]]
        state_trajs.append(traj_idx)
        state_frames.append(frame_idx)

    # Swap the accumulated Python lists for numpy arrays.
    for state, (traj_list, frame_list) in iteritems(inverse_mapping):
        inverse_mapping[state] = (np.array(traj_list), np.array(frame_list))

    return inverse_mapping
Ejemplo n.º 7
0
def apply_mapping_to_assignments(assignments, mapping):
    """Relabel the states of an assignments array, in place, according to
    `mapping`.

    Parameters
    ----------
    assignments : ndarray
        Standard 2D assignments array.  It is overwritten with the remapped
        states.
    mapping : ndarray
        1D numpy array of length equal to the number of states in
        `assignments`.  mapping[a] = b means that the frames currently in
        state a will be mapped to state b.  A value of -1 marks a state to be
        eliminated; its frames end up as -1.

    Returns
    -------
    None
        The result is written into `assignments`; nothing is returned.

    Notes
    -----
    This function is useful after performing PCCA or Ergodic Trimming. Also,
    the state -1 is treated specially -- it always stays -1 and is not
    remapped.
    """

    check_assignment_array_input(assignments)

    # Temporary label, one past the largest target state, for states that
    # get deleted (e.g. by Ergodic Trimming).
    eliminated_label = mapping.max() + 1
    lookup = mapping.copy()
    lookup[mapping == -1] = eliminated_label

    # Remember the dangling 'tails' of trajectories (no actual data, denoted
    # by -1) before remapping: indexing the lookup table with -1 would pick
    # its last entry.
    was_padding = assignments == -1

    assignments[:] = lookup[assignments]

    # Frames of eliminated states; these can sit at the beginning or end of
    # a trajectory.
    was_eliminated = assignments == eliminated_label

    assignments[was_padding] = -1
    assignments[was_eliminated] = -1
Ejemplo n.º 8
0
def renumber_states(assignments):
    """Relabel the states in `assignments`, in place, so that they become
    the consecutive integers 0, 1, ..., n.

    Parameters
    ----------
    assignments : ndarray
        2D array of msmbuilder assignments; overwritten with the new labels.

    Returns
    -------
    mapping : ndarray, int
        A mapping from the new numbering scheme back to the old one, such
        that mapping[new] = old

    Notes
    -----
    Useful if some states have 0 counts.  Entries equal to -1 are not
    renumbered and remain -1.
    """

    check_assignment_array_input(assignments)

    # np.unique returns sorted values, so a -1 placeholder (if any) is first.
    old_labels = list(np.unique(assignments))
    has_padding = old_labels[0] == -1
    padding_positions = np.where(assignments == -1) if has_padding else []
    if has_padding:
        del old_labels[0]

    # Positions of every state, captured before any label is overwritten.
    positions_of = invert_assignments(assignments)

    for new_label, old_label in enumerate(old_labels):
        assignments[positions_of[old_label]] = new_label
    assignments[padding_positions] = -1

    return np.array(old_labels, dtype=int)
Ejemplo n.º 9
0
def renumber_states(assignments):
    """Relabel the states in `assignments`, in place, so that they become
    the consecutive integers 0, 1, ..., n.

    Parameters
    ----------
    assignments : ndarray
        2D array of msmbuilder assignments; overwritten with the new labels.

    Returns
    -------
    mapping : ndarray, int
        A mapping from the new numbering scheme back to the old one, such
        that mapping[new] = old

    Notes
    -----
    Useful if some states have 0 counts.  Entries equal to -1 are not
    renumbered and remain -1.
    """

    check_assignment_array_input(assignments)

    # np.unique returns sorted values, so a -1 placeholder (if any) is first.
    old_labels = list(np.unique(assignments))
    has_padding = old_labels[0] == -1
    padding_positions = np.where(assignments == -1) if has_padding else []
    if has_padding:
        del old_labels[0]

    # Positions of every state, captured before any label is overwritten.
    positions_of = invert_assignments(assignments)

    for new_label, old_label in enumerate(old_labels):
        assignments[positions_of[old_label]] = new_label
    assignments[padding_positions] = -1

    return np.array(old_labels, dtype=int)