def get_implied_timescales(assignments_fn,
                           lag_times,
                           n_implied_times=100,
                           sliding_window=True,
                           trimming=True,
                           symmetrize=None,
                           n_procs=1):
    """Calculate implied timescales in parallel using the multiprocessing
    library. Does not work in interactive mode.

    Parameters
    ----------
    assignments_fn : str
        Path to Assignments.h5 file on disk
    lag_times : list
        List of lag times to calculate the timescales at
    n_implied_times : int, optional
        Number of implied timescales to calculate at each lag time
    sliding_window : bool, optional
        Use sliding window
    trimming : bool, optional
        Use ergodic trimming
    symmetrize : {'MLE', 'Transpose', None}
        Symmetrization method
    n_procs : int
        Number of processes to use in parallel (multiprocessing)

    Returns
    -------
    formatted_lags : ndarray
        Two-column array where each row is ``[lag_time, implied_timescale]``,
        flattened across all lag times.

    See Also
    --------
    MSMLib.mle_reversible_count_matrix : (MLE symmetrization)
    MSMLib.build_msm
    get_eigenvectors

    """
    pool = multiprocessing.Pool(processes=n_procs)

    # Wrap the string and the scalar in single-element lists: uneven_zip
    # would otherwise iterate a bare string character by character.
    inputs = uneven_zip([assignments_fn], lag_times, n_implied_times,
                        sliding_window, trimming, [symmetrize])
    result = pool.map_async(get_implied_timescales_helper, inputs)
    # A large explicit timeout (instead of a bare .get()) keeps the main
    # process responsive to KeyboardInterrupt while it waits.
    lags = result.get(999999)

    # Flatten each worker's (lag_time_array, implied_timescale_array) pair
    # into [lag_time, implied_timescale] rows.
    formatted_lags = []
    for lag_time_array, implied_timescale_array in lags:
        for lag_time, implied_timescale in zip(lag_time_array,
                                               implied_timescale_array):
            formatted_lags.append([lag_time, implied_timescale])

    formatted_lags = np.array(formatted_lags)

    pool.close()
    pool.join()  # reap the worker processes before returning

    return formatted_lags
def get_implied_timescales(
    assignments_fn, lag_times, n_implied_times=100, sliding_window=True, trimming=True, symmetrize=None, n_procs=1
):
    """Calculate implied timescales in parallel using the multiprocessing
    library. Does not work in interactive mode.

    Parameters
    ----------
    assignments_fn : str
        Path to Assignments.h5 file on disk
    lag_times : list
        List of lag times to calculate the timescales at
    n_implied_times : int, optional
        Number of implied timescales to calculate at each lag time
    sliding_window : bool, optional
        Use sliding window
    trimming : bool, optional
        Use ergodic trimming
    symmetrize : {'MLE', 'Transpose', None}
        Symmetrization method
    n_procs : int
        Number of processes to use in parallel (multiprocessing)

    Returns
    -------
    formatted_lags : ndarray
        Two-column array where each row is ``[lag_time, implied_timescale]``,
        flattened across all lag times.

    See Also
    --------
    MSMLib.mle_reversible_count_matrix : (MLE symmetrization)
    MSMLib.build_msm
    get_eigenvectors

    """
    pool = multiprocessing.Pool(processes=n_procs)

    # Wrap the string and the scalar in single-element lists: uneven_zip
    # would otherwise iterate a bare string character by character.
    inputs = uneven_zip([assignments_fn], lag_times, n_implied_times, sliding_window, trimming, [symmetrize])
    result = pool.map_async(get_implied_timescales_helper, inputs)
    # A large explicit timeout (instead of a bare .get()) keeps the main
    # process responsive to KeyboardInterrupt while it waits.
    lags = result.get(999999)

    # Flatten each worker's (lag_time_array, implied_timescale_array) pair
    # into [lag_time, implied_timescale] rows.
    formatted_lags = []
    for lag_time_array, implied_timescale_array in lags:
        for lag_time, implied_timescale in zip(lag_time_array, implied_timescale_array):
            formatted_lags.append([lag_time, implied_timescale])

    formatted_lags = np.array(formatted_lags)

    pool.close()
    pool.join()  # reap the worker processes before returning

    return formatted_lags
Exemple #3
0
    def __init__(self,
                 metric,
                 trajectories,
                 k,
                 num_samples,
                 shrink_multiple,
                 num_local_minima=10,
                 max_neighbors=20,
                 local_swap=False,
                 parallel=None):
        """ Run the CLARANS algorithm (see the Clarans class for more description) on
        multiple subsamples of the data drawn randomly.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectory : Trajectory or list of msmbuilder.Trajectory
            data to cluster
        k : int
            number of desired clusters
        num_samples : int
            number of random subsamples to draw
        shrink_multiple : int
            Each of the subsamples drawn will be of size equal to the total
            number of frames divided by this number
        num_local_minima : int, optional
            number of local minima in the set of all possible clusterings
            to identify. Execution time will scale linearly with this
            parameter. The best of these local minima will be returned.
        max_neighbors : int, optional
            number of rejected swaps in a row necessary to declare a proposed
            clustering a local minima
        local_swap : bool, optional
            If true, proposed swaps will be between a medoid and a data point
            currently assigned to that medoid. If false, the data point for
            the proposed swap is selected randomly
        parallel : {None, 'multiprocessing', 'dtm'}
            Which parallelization library to use. Each of the random subsamples
            are run independently
        """

        super(SubsampledClarans, self).__init__(metric, trajectories)

        # Select the map implementation that will fan out the subsample runs.
        if parallel is None:
            mymap = map
        elif parallel == 'multiprocessing':
            mymap = Pool().map
        elif parallel == 'dtm':
            mymap = dtm.map
        else:
            raise ValueError('Unrecognized parallelization')

        # Floor division keeps the sample size an int under Python 3 as well
        # (true division would yield a float, which random.sample rejects).
        subsample_size = self.num_frames // shrink_multiple

        def gen_sub_indices():
            """Return a fresh random subsample of frame indices."""
            return np.array(random.sample(range(self.num_frames),
                                          subsample_size))

        sub_indices = [gen_sub_indices() for i in range(num_samples)]
        ptrajs = [self.ptraj[sub_indices[i]] for i in range(num_samples)]

        # uneven_zip pairs each prepared subsample with the same settings.
        clarans_args = uneven_zip(metric, ptrajs, k, num_local_minima,
                                  max_neighbors, local_swap, ['kcenters'],
                                  None, None, False)

        results = mymap(_clarans_helper, clarans_args)
        medoids_list, assignments_list, distances_list = zip(*results)
        # Keep the subsample run whose total distance to its medoids is
        # smallest.
        best_i = np.argmin([np.sum(d) for d in distances_list])

        # Translate the winning medoids (indices into the subsample) back
        # into indices into the full data set.
        self._generator_indices = sub_indices[best_i][medoids_list[best_i]]
Exemple #4
0
    def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
                 num_samples=None, shrink_multiple=None, num_local_minima=10,
                 max_neighbors=20, local_swap=False, parallel=None):
        """ Run the CLARANS algorithm (see the Clarans class for more description) on
        multiple subsamples of the data drawn randomly.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectories : Trajectory or list of msmbuilder.Trajectory
            data to cluster
        prep_trajectories : np.ndarray or None
            prepared trajectories instead of msmbuilder.Trajectory
        k : int
            number of desired clusters
        num_samples : int
            number of random subsamples to draw
        shrink_multiple : int
            Each of the subsamples drawn will be of size equal to the total
            number of frames divided by this number
        num_local_minima : int, optional
            number of local minima in the set of all possible clusterings
            to identify. Execution time will scale linearly with this
            parameter. The best of these local minima will be returned.
        max_neighbors : int, optional
            number of rejected swaps in a row necessary to declare a proposed
            clustering a local minima
        local_swap : bool, optional
            If true, proposed swaps will be between a medoid and a data point
            currently assigned to that medoid. If false, the data point for
            the proposed swap is selected randomly
        parallel : {None, 'multiprocessing', 'dtm'}
            Which parallelization library to use. Each of the random subsamples
            are run independently
        """

        super(SubsampledClarans, self).__init__(metric, trajectories, prep_trajectories)

        # Select the map implementation that will fan out the subsample runs.
        if parallel is None:
            mymap = map
        elif parallel == 'multiprocessing':
            mymap = Pool().map
        elif parallel == 'dtm':
            mymap = dtm.map
        else:
            raise ValueError('Unrecognized parallelization')

        # Floor division keeps the sample size an int under Python 3 as well
        # (true division would yield a float, which random.sample rejects).
        subsample_size = self.num_frames // shrink_multiple

        def gen_sub_indices():
            """Return a fresh random subsample of frame indices."""
            return np.array(random.sample(range(self.num_frames), subsample_size))

        sub_indices = [gen_sub_indices() for i in range(num_samples)]
        ptrajs = [self.ptraj[sub_indices[i]] for i in range(num_samples)]

        # uneven_zip pairs each prepared subsample with the same settings.
        clarans_args = uneven_zip(metric, ptrajs, k, num_local_minima, max_neighbors,
                                  local_swap, ['kcenters'], None, None, False)

        results = mymap(_clarans_helper, clarans_args)
        medoids_list, assignments_list, distances_list = zip(*results)
        # Keep the subsample run whose total distance to its medoids is
        # smallest.
        best_i = np.argmin([np.sum(d) for d in distances_list])

        # Translate the winning medoids (indices into the subsample) back
        # into indices into the full data set.
        self._generator_indices = sub_indices[best_i][medoids_list[best_i]]