def get_implied_timescales(assignments_fn, lag_times, n_implied_times=100,
                           sliding_window=True, trimming=True,
                           symmetrize=None, n_procs=1):
    """Calculate implied timescales in parallel using the multiprocessing
    library. Does not work in interactive mode.

    Parameters
    ----------
    assignments_fn : str
        Path to Assignments.h5 file on disk
    lag_times : list
        List of lag times to calculate the timescales at
    n_implied_times : int, optional
        Number of implied timescales to calculate at each lag time
    sliding_window : bool, optional
        Use sliding window
    trimming : bool, optional
        Use ergodic trimming
    symmetrize : {'MLE', 'Transpose', None}
        Symmetrization method
    n_procs : int
        Number of processes to use in parallel (multiprocessing)

    Returns
    -------
    formatted_lags : ndarray
        2D array in which each row is ``[lag_time, implied_timescale]``;
        there are n_implied_times rows for every entry in `lag_times`.

    See Also
    --------
    MSMLib.mle_reversible_count_matrix : (MLE symmetrization)
    MSMLib.build_msm
    get_eigenvectors
    """
    pool = multiprocessing.Pool(processes=n_procs)

    # assignments_fn and symmetrize are wrapped in lists because uneven_zip
    # treats strings as iterables and would otherwise split them into
    # individual characters, which we don't want.
    inputs = uneven_zip([assignments_fn], lag_times, n_implied_times,
                        sliding_window, trimming, [symmetrize])

    try:
        result = pool.map_async(get_implied_timescales_helper, inputs)
        # The large explicit timeout keeps the call interruptible by
        # KeyboardInterrupt (a bare .get() blocks uninterruptibly).
        lags = result.get(999999)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    # Flatten the per-lag-time results into a (lag, timescale) table.
    formatted_lags = []
    for lag_time_array, implied_timescale_array in lags:
        for lag_time, implied_timescale in zip(lag_time_array,
                                               implied_timescale_array):
            formatted_lags.append([lag_time, implied_timescale])

    return np.array(formatted_lags)
def get_implied_timescales(
    assignments_fn, lag_times, n_implied_times=100, sliding_window=True,
    trimming=True, symmetrize=None, n_procs=1
):
    """Calculate implied timescales in parallel using the multiprocessing
    library. Does not work in interactive mode.

    Parameters
    ----------
    assignments_fn : str
        Path to Assignments.h5 file on disk
    lag_times : list
        List of lag times to calculate the timescales at
    n_implied_times : int, optional
        Number of implied timescales to calculate at each lag time
    sliding_window : bool, optional
        Use sliding window
    trimming : bool, optional
        Use ergodic trimming
    symmetrize : {'MLE', 'Transpose', None}
        Symmetrization method
    n_procs : int
        Number of processes to use in parallel (multiprocessing)

    Returns
    -------
    formatted_lags : ndarray
        2D array in which each row is ``[lag_time, implied_timescale]``;
        there are n_implied_times rows for every entry in `lag_times`.

    See Also
    --------
    MSMLib.mle_reversible_count_matrix : (MLE symmetrization)
    MSMLib.build_msm
    get_eigenvectors
    """
    pool = multiprocessing.Pool(processes=n_procs)

    # assignments_fn and symmetrize are wrapped in single-element lists:
    # uneven_zip treats strings as iterables and would otherwise split
    # them into individual characters, which we don't want.
    inputs = uneven_zip([assignments_fn], lag_times, n_implied_times,
                        sliding_window, trimming, [symmetrize])

    try:
        result = pool.map_async(get_implied_timescales_helper, inputs)
        # The large explicit timeout keeps the call interruptible by
        # KeyboardInterrupt (a bare .get() blocks uninterruptibly).
        lags = result.get(999999)
    finally:
        # Release the workers even when a worker process raised.
        pool.close()
        pool.join()

    # Flatten the per-lag-time results into a (lag, timescale) table.
    formatted_lags = []
    for lag_time_array, implied_timescale_array in lags:
        for lag_time, implied_timescale in zip(lag_time_array,
                                               implied_timescale_array):
            formatted_lags.append([lag_time, implied_timescale])

    return np.array(formatted_lags)
def __init__(self, metric, trajectories, k, num_samples, shrink_multiple,
             num_local_minima=10, max_neighbors=20, local_swap=False,
             parallel=None):
    """Run the CLARANS algorithm (see the Clarans class for more
    description) on multiple subsamples of the data drawn randomly.

    Each of the random subsamples is clustered independently; the
    clustering with the lowest total distance is kept.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    trajectories : Trajectory or list of msmbuilder.Trajectory
        data to cluster
    k : int
        number of desired clusters
    num_samples : int
        number of random subsamples to draw
    shrink_multiple : int
        Each of the subsamples drawn will be of size equal to the total
        number of frames divided by this number
    num_local_minima : int, optional
        number of local minima in the set of all possible clusterings
        to identify. Execution time will scale linearly with this
        parameter. The best of these local minima will be returned.
    max_neighbors : int, optional
        number of rejected swaps in a row necessary to declare a
        proposed clustering a local minima
    local_swap : bool, optional
        If true, proposed swaps will be between a medoid and a data point
        currently assigned to that medoid. If false, the data point for
        the proposed swap is selected randomly.
    parallel : {None, 'multiprocessing', 'dtm'}
        Which parallelization library to use.
    """
    super(SubsampledClarans, self).__init__(metric, trajectories)

    if parallel is None:
        mymap = map
    elif parallel == 'multiprocessing':
        mymap = Pool().map
    elif parallel == 'dtm':
        mymap = dtm.map
    else:
        raise ValueError('Unrecognized parallelization')

    # Size of each random subsample. Floor division keeps this an int:
    # with true division (Python 3 `/`), random.sample would raise.
    sample_size = self.num_frames // shrink_multiple

    def gen_sub_indices():
        # Draw a random subset of frame indices, without replacement.
        return np.array(random.sample(range(self.num_frames), sample_size))

    sub_indices = [gen_sub_indices() for _ in range(num_samples)]
    ptrajs = [self.ptraj[indices] for indices in sub_indices]

    clarans_args = uneven_zip(metric, ptrajs, k, num_local_minima,
                              max_neighbors, local_swap, ['kcenters'],
                              None, None, False)

    results = mymap(_clarans_helper, clarans_args)
    medoids_list, assignments_list, distances_list = zip(*results)

    # Keep the subsample whose clustering achieved the lowest total
    # distance, then map its medoids (which are indices into the
    # subsample) back to indices into the full data set.
    best_i = np.argmin([np.sum(d) for d in distances_list])
    self._generator_indices = sub_indices[best_i][medoids_list[best_i]]
def __init__(self, metric, trajectories=None, prep_trajectories=None,
             k=None, num_samples=None, shrink_multiple=None,
             num_local_minima=10, max_neighbors=20, local_swap=False,
             parallel=None):
    """Run the CLARANS algorithm (see the Clarans class for more
    description) on multiple subsamples of the data drawn randomly.

    Each of the random subsamples is clustered independently; the
    clustering with the lowest total distance is kept.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    trajectories : Trajectory or list of msmbuilder.Trajectory
        data to cluster
    prep_trajectories : np.ndarray or None
        prepared trajectories instead of msmbuilder.Trajectory
    k : int
        number of desired clusters
    num_samples : int
        number of random subsamples to draw
    shrink_multiple : int
        Each of the subsamples drawn will be of size equal to the total
        number of frames divided by this number
    num_local_minima : int, optional
        number of local minima in the set of all possible clusterings
        to identify. Execution time will scale linearly with this
        parameter. The best of these local minima will be returned.
    max_neighbors : int, optional
        number of rejected swaps in a row necessary to declare a
        proposed clustering a local minima
    local_swap : bool, optional
        If true, proposed swaps will be between a medoid and a data point
        currently assigned to that medoid. If false, the data point for
        the proposed swap is selected randomly.
    parallel : {None, 'multiprocessing', 'dtm'}
        Which parallelization library to use.
    """
    super(SubsampledClarans, self).__init__(metric, trajectories,
                                            prep_trajectories)

    if parallel is None:
        mymap = map
    elif parallel == 'multiprocessing':
        mymap = Pool().map
    elif parallel == 'dtm':
        mymap = dtm.map
    else:
        raise ValueError('Unrecognized parallelization')

    # Size of each random subsample. Floor division keeps this an int:
    # with true division (Python 3 `/`), random.sample would raise.
    sample_size = self.num_frames // shrink_multiple

    def gen_sub_indices():
        # Draw a random subset of frame indices, without replacement.
        return np.array(random.sample(range(self.num_frames), sample_size))

    sub_indices = [gen_sub_indices() for _ in range(num_samples)]
    ptrajs = [self.ptraj[indices] for indices in sub_indices]

    clarans_args = uneven_zip(metric, ptrajs, k, num_local_minima,
                              max_neighbors, local_swap, ['kcenters'],
                              None, None, False)

    results = mymap(_clarans_helper, clarans_args)
    medoids_list, assignments_list, distances_list = zip(*results)

    # Keep the subsample whose clustering achieved the lowest total
    # distance, then map its medoids (which are indices into the
    # subsample) back to indices into the full data set.
    best_i = np.argmin([np.sum(d) for d in distances_list])
    self._generator_indices = sub_indices[best_i][medoids_list[best_i]]