def _estimate(self, dtrajs):
    ### PREPARE AND CHECK DATA
    # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # check trajectory lengths
    if self._estimated:
        # if dtrajs has changed, unset the _estimated flag to re-compute every derived quantity.
        assert hasattr(self, '_last_dtrajs_input_hash')
        current_hash = _hash_dtrajs(dtrajs)
        if self._last_dtrajs_input_hash != current_hash:
            self.logger.warning("estimating from new data, discarding all previously computed models.")
            self._estimated = False
            self._last_dtrajs_input_hash = current_hash
    else:
        self._last_dtrajs_input_hash = _hash_dtrajs(dtrajs)

    self._trajlengths = np.fromiter((len(traj) for traj in dtrajs), dtype=int, count=len(dtrajs))
    maxlength = np.max(self._trajlengths)

    # set lag times by data if not yet set
    if self._lags is None:
        maxlag = 0.5 * np.sum(self._trajlengths) / float(len(self._trajlengths))
        self._lags = _generate_lags(maxlag, 1.5)

    # check if some lag times are forbidden.
    if np.max(self._lags) >= maxlength:
        Ifit = np.where(self._lags < maxlength)[0]
        Inofit = np.where(self._lags >= maxlength)[0]
        self.logger.warning('Ignoring lag times that exceed the longest trajectory: %s', self._lags[Inofit])
        self._lags = self._lags[Ifit]

    ### RUN ESTIMATION
    if self._estimated:
        # we already ran an estimation, determine which lag times we still need to compute
        # TODO: this will re-evaluate problematic lag times, won't it?
        lags = sorted(list(set(self._lags).difference(self._last_lags)))
        if len(lags) == 0:
            self.logger.info("All lag times already estimated.")
            return self
        assert lags
        self.logger.info("Running estimation for not yet estimated lag times: %s", lags)
    else:
        lags = self._lags

    # construct all parameter sets for the estimator
    param_sets = tuple(param_grid({'lag': lags}))

    # run estimation on all lag times
    pg = ProgressReporter()
    with pg.context():
        models, estimators = estimate_param_scan(self.estimator, dtrajs, param_sets, failfast=False,
                                                 return_estimators=True, n_jobs=self.n_jobs,
                                                 progress_reporter=pg, return_exceptions=True)
    self._estimators = estimators

    self._postprocess_results(models)
    return self
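# The helper ``_generate_lags(maxlag, 1.5)`` used above is not shown in this section. A
# minimal sketch of a geometric lag generator with that interface (an assumption for
# illustration only; the real pyemma helper may cap or round the lags differently):

import numpy as np

def _generate_lags_sketch(maxlag, multiplier=1.5):
    """Return integer lag times starting at 1 and growing by ``multiplier`` up to ``maxlag``."""
    lags = [1]
    while lags[-1] * multiplier <= maxlag:
        # round up so consecutive lags are always distinct integers
        lags.append(int(np.ceil(lags[-1] * multiplier)))
    return np.array(lags, dtype=int)

# e.g. _generate_lags_sketch(50) -> array([ 1,  2,  3,  5,  8, 12, 18, 27, 41])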
class TestProgress(unittest.TestCase):

    def setUp(self):
        self.pg = ProgressReporter()
        self.pg._progress_register(100, "test")

    def test_config_override(self):
        self.pg.show_progress = True
        with settings(show_progress_bars=False):
            assert not self.pg.show_progress

    def test_config_2(self):
        self.pg.show_progress = False
        with settings(show_progress_bars=True):
            assert not self.pg.show_progress
def test_force_finish(self):
    import warnings
    worker = ProgressReporter()
    worker._progress_register(100)

    # intentionally overshoot the registered amount of work
    with warnings.catch_warnings(record=True) as cm:
        warnings.simplefilter("always")  # make sure the overshoot warning is not filtered out
        worker._progress_update(101)

    self.assertIn("more work than registered", str(cm[0].message))
    worker._progress_force_finish()
def test_ctx3(self):
    pg = ProgressReporter()
    assert pg.show_progress
    pg.register(100, stage='test')
    pg.register(40, stage='test2')
    pg.register(25, stage='test3')
    try:
        with pg.context(stage=('test', 'test3')):
            pg.update(50, stage='test')
            pg.update(2, stage='test3')
            raise Exception()
    except Exception:
        assert pg.num_registered == 1
        assert 'test2' in pg.registered_stages
def _estimate(self, iterable, **kw):
    partial_fit = 'partial' in kw
    it = iterable.iterator(return_trajindex=False, chunk=self.chunksize,
                           stride=self.stride, skip=self.skip)

    from pyemma._base.progress import ProgressReporter
    pg = ProgressReporter()
    pg.register(it.n_chunks, "calc mean+cov", 0)

    with it, pg.context():
        self._init_covar(partial_fit, it.n_chunks)
        for chunk in it:
            self._covar.add(chunk)
            pg.update(1)

    self.cov = self._covar.cov_XX(bessel=True)
    self.mu = self._covar.mean_X()

    self._model.update_model_params(mean=self._covar.mean_X())
    if not partial_fit:
        self._diagonalize()

    return self._model
def test_mute_progress(self):
    """ switching mute on shall turn off progress bars """
    from pyemma._base.progress import ProgressReporter
    import mock
    rp = ProgressReporter()
    self.config_inst.mute = True
    with mock.patch('pyemma.config', self.config_inst):
        assert not rp.show_progress
def test_ctx4(self):
    pg = ProgressReporter()
    pg.register(100, 'test')
    pg.register(40, 'test2')
    try:
        with pg.context():
            pg.update(50, stage='all')
            raise Exception()
    except Exception:
        assert pg.num_registered == 0
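# The context-manager tests above exercise the public ProgressReporter API. A minimal
# usage sketch of the register/update/context pattern, using only calls that appear in
# the tests and estimators of this section:

from pyemma._base.progress import ProgressReporter

def process_items_sketch(items):
    pg = ProgressReporter()
    pg.register(len(items), description='processing', stage='work')
    with pg.context(stage='work'):  # the stage is unregistered on exit, even on error
        for item in items:
            # ... do the actual work on ``item`` here ...
            pg.update(1, stage='work')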
def test_callback(self):
    self.has_been_called = 0

    def call_back(stage, progressbar, *args, **kw):
        self.has_been_called += 1
        assert isinstance(stage, int)
        assert isinstance(progressbar, ProgressBar)

    amount_of_work = 100
    worker = ProgressReporter()
    worker._progress_register(amount_of_work, description="hard working", stage=0)
    worker.register_progress_callback(call_back, stage=0)
    for _ in range(amount_of_work):
        worker._progress_update(1, stage=0)
    self.assertEqual(self.has_been_called, amount_of_work)
def filenames(self, filename_list):
    if isinstance(filename_list, str):
        filename_list = [filename_list]

    uniq = set(filename_list)
    if len(uniq) != len(filename_list):
        self.logger.warning("duplicate files/arrays detected")
        filename_list = list(uniq)

    from pyemma.coordinates.data.data_in_memory import DataInMemory

    if self._is_reader:
        if isinstance(self, DataInMemory):
            import warnings
            warnings.warn('filenames are not being used for DataInMemory')
            return

        self._ntraj = len(filename_list)
        if self._ntraj == 0:
            raise ValueError("empty file list")

        # validate files
        for f in filename_list:
            try:
                stat = os.stat(f)
            except EnvironmentError:
                self.logger.exception('Error during access of file "%s"' % f)
                raise ValueError('could not read file "%s"' % f)

            if not os.path.isfile(f):  # can be true for symlinks to directories
                raise ValueError('"%s" is not a valid file' % f)

            if stat.st_size == 0:
                raise ValueError('file "%s" is empty' % f)

        # number of trajectories/data sets
        self._filenames = filename_list
        # determine len and dim via cache lookup
        lengths = []
        offsets = []
        ndims = []
        # avoid cyclic imports
        from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        pg.register(len(filename_list), 'Obtaining file info')
        with pg.context():
            for filename in filename_list:
                if config.use_trajectory_lengths_cache:
                    info = TrajectoryInfoCache.instance()[filename, self]
                else:
                    info = self._get_traj_info(filename)
                # nested data set support.
                if hasattr(info, 'children'):
                    lengths.append(info.length)
                    offsets.append(info.offsets)
                    ndims.append(info.ndim)
                    for c in info.children:
                        lengths.append(c.length)
                        offsets.append(c.offsets)
                        ndims.append(c.ndim)
                else:
                    lengths.append(info.length)
                    offsets.append(info.offsets)
                    ndims.append(info.ndim)
                if len(filename_list) > 3:
                    pg.update(1)

        # ensure all trajs have the same dimension
        if not np.unique(ndims).size == 1:
            # group files by their dimensions to give the user an indicator
            ndims = np.array(ndims)
            filename_list = np.asarray(filename_list)
            sort_inds = np.argsort(ndims)
            import itertools, operator
            res = {}
            for dim, files in itertools.groupby(zip(ndims[sort_inds], filename_list[sort_inds]),
                                                operator.itemgetter(0)):
                res[dim] = list(f[1] for f in files)

            raise ValueError("Input data has different dimensions ({dims})!"
                             " Files grouped by dimensions: {groups}".format(dims=res.keys(), groups=res))

        self._ndim = ndims[0]
        self._lengths = lengths
        self._offsets = offsets

    else:
        # propagate this until we finally have a reader
        self.data_producer.filenames = filename_list
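# For reference, the dimension-grouping used in the error branch above can be
# reproduced standalone (hypothetical file names, for illustration only):

import itertools, operator
import numpy as np

ndims = np.array([3, 2, 3])
files = np.array(['a.xtc', 'b.xtc', 'c.xtc'])
order = np.argsort(ndims)
groups = {dim: [f for _, f in pairs]
          for dim, pairs in itertools.groupby(zip(ndims[order], files[order]),
                                              operator.itemgetter(0))}
# groups == {2: ['b.xtc'], 3: ['a.xtc', 'c.xtc']}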
def write_to_csv(self, filename=None, extension='.dat', overwrite=False,
                 stride=1, chunksize=None, **kw):
    """ write all data to csv with numpy.savetxt

    Parameters
    ----------
    filename : str, optional
        filename string, which may contain placeholders {itraj} and {stride}:

        * itraj will be replaced by the trajectory index
        * stride is the stride argument of this method

        If filename is not given, the filenames are obtained from the data
        source of this iterator.
    extension : str, optional, default='.dat'
        filename extension of created files
    overwrite : bool, optional, default=False
        shall existing files be overwritten? If a file exists, this method will raise.
    stride : int
        omit every n'th frame
    chunksize: int, default=None
        how many frames to process at once
    kw : dict, optional
        named arguments passed into numpy.savetxt (header, delimiter etc.)

    Example
    -------
    Assume you want to save features calculated by some FeatureReader to ASCII:

    >>> import numpy as np, pyemma
    >>> import os
    >>> from pyemma.util.files import TemporaryDirectory
    >>> from pyemma.util.contexts import settings
    >>> data = [np.random.random((10,3))] * 3
    >>> reader = pyemma.coordinates.source(data)
    >>> filename = "distances_{itraj}.dat"
    >>> with TemporaryDirectory() as td, settings(show_progress_bars=False):
    ...    out = os.path.join(td, filename)
    ...    reader.write_to_csv(out, header='', delimiter=';')
    ...    print(sorted(os.listdir(td)))
    ['distances_0.dat', 'distances_1.dat', 'distances_2.dat']
    """
    import os
    if not filename:
        assert hasattr(self, 'filenames')
        # raise RuntimeError("could not determine filenames")
        filenames = []
        for f in self.filenames:
            base, _ = os.path.splitext(f)
            filenames.append(base + extension)
    elif isinstance(filename, str):
        filename = filename.replace('{stride}', str(stride))
        filenames = [filename.replace('{itraj}', str(itraj)) for itraj
                     in range(self.number_of_trajectories())]
    else:
        raise TypeError("filename should be str or None")
    self.logger.debug("write_to_csv, filenames=%s" % filenames)

    # check files before starting to write
    import errno
    for f in filenames:
        try:
            os.stat(f)
            # the file exists; raise EEXIST so the handler below can decide
            raise OSError(errno.EEXIST, 'file already exists', f)
        except OSError as e:
            if e.errno == errno.EEXIST:
                if overwrite:
                    continue
            elif e.errno == errno.ENOENT:
                continue
            raise

    f = None
    from pyemma._base.progress import ProgressReporter
    pg = ProgressReporter()
    it = self.iterator(stride, chunk=chunksize, return_trajindex=False)
    pg.register(it.n_chunks, "saving to csv")
    with it, pg.context():
        oldtraj = -1
        for X in it:
            if oldtraj != it.current_trajindex:
                if f is not None:
                    f.close()
                fn = filenames[it.current_trajindex]
                self.logger.debug("opening file %s for writing csv." % fn)
                f = open(fn, 'wb')
                oldtraj = it.current_trajindex
            np.savetxt(f, X, **kw)
            f.flush()
            pg.update(1, 0)
    if f is not None:
        f.close()
def write_to_hdf5(self, filename, group='/', data_set_prefix='', overwrite=False,
                  stride=1, chunksize=None, h5_opt=None):
    """ writes all data of this Iterable to a given HDF5 file.
    This is equivalent to writing the result of
    :func:`pyemma.coordinates.data._base.DataSource.get_output` to a file.

    Parameters
    ----------
    filename: str
        file name of the output HDF5 file
    group: str, default='/'
        write all trajectories to this HDF5 group. The group name may not already exist in the file.
    data_set_prefix: str, default=None
        data set name prefix, will be postfixed with the index of the trajectory.
    overwrite: bool, default=False
        if group and data sets already exist, shall we overwrite data?
    stride: int, default=1
        stride argument to iterator
    chunksize: int, default=None
        how many frames to process at once
    h5_opt: dict
        optional parameters for h5py.create_dataset

    Notes
    -----
    You can pass the following via h5_opt to enable compression/filters/shuffling etc:

    chunks
        (Tuple) Chunk shape, or True to enable auto-chunking.
    maxshape
        (Tuple) Make the dataset resizable up to this shape. Use None for
        axes you want to be unlimited.
    compression
        (String or int) Compression strategy. Legal values are 'gzip', 'szip', 'lzf'.
        If an integer in range(10), this indicates gzip compression level.
        Otherwise, an integer indicates the number of a dynamically loaded compression filter.
    compression_opts
        Compression settings. This is an integer for gzip, 2-tuple for szip, etc. If
        specifying a dynamically loaded compression filter number, this must be a tuple of values.
    scaleoffset
        (Integer) Enable scale/offset filter for (usually) lossy compression of
        integer or floating-point data. For integer data, the value of scaleoffset
        is the number of bits to retain (pass 0 to let HDF5 determine the minimum
        number of bits necessary for lossless compression). For floating point data,
        scaleoffset is the number of digits after the decimal place to retain; stored
        values thus have absolute error less than 0.5 * 10**(-scaleoffset).
    shuffle
        (T/F) Enable shuffle filter. Only effective in combination with chunks.
    fletcher32
        (T/F) Enable fletcher32 error detection. Not permitted in conjunction
        with the scale/offset filter.
    fillvalue
        (Scalar) Use this value for uninitialized parts of the dataset.
    track_times
        (T/F) Enable dataset creation timestamps.
    """
    if h5_opt is None:
        h5_opt = {}
    import h5py
    from pyemma._base.progress import ProgressReporter
    pg = ProgressReporter()
    it = self.iterator(stride=stride, chunk=chunksize, return_trajindex=True)
    pg.register(it.n_chunks, 'writing output')
    with h5py.File(filename, mode='a') as f, it, pg.context():
        if group not in f:
            g = f.create_group(group)
        elif group == '/':  # root always exists.
            g = f[group]
        elif group in f and overwrite:
            self.logger.info('overwriting group "{}"'.format(group))
            del f[group]
            g = f.create_group(group)
        else:
            raise ValueError('Given group "{}" already exists. Choose another one.'.format(group))

        # check output data sets
        data_sets = {}
        for itraj in np.arange(self.ntraj):
            template = '{prefix}_{index}' if data_set_prefix else '{index}'
            ds_name = template.format(prefix=data_set_prefix, index='{:04d}'.format(itraj))
            # group can be re-used, e.g. it was empty before; now check if we will overwrite something
            if ds_name in g:
                if not overwrite:
                    raise ValueError('Refusing to overwrite data in group "{}".'.format(group))
            else:
                data_sets[itraj] = g.require_dataset(
                    ds_name,
                    shape=(self.trajectory_length(itraj=itraj, stride=stride), self.ndim),
                    dtype=self.output_type(), **h5_opt)

        for itraj, X in it:
            ds = data_sets[itraj]
            ds[it.pos:it.pos + len(X)] = X
            pg.update(1)
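# A hedged usage sketch of the HDF5 writer above, relying only on parameters listed in
# its docstring; the data, file name and compression settings are illustrative:

import numpy as np
import pyemma

data = [np.random.random((10, 3))] * 3
reader = pyemma.coordinates.source(data)

# write every trajectory as a gzip-compressed data set into one HDF5 group
reader.write_to_hdf5('output.h5', group='/features',
                     h5_opt=dict(compression='gzip', compression_opts=4, shuffle=True))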
def get_output(self, dimensions=slice(0, None), stride=1, skip=0, chunk=None):
    """Maps all input data of this transformer and returns it as an array or list of arrays

    Parameters
    ----------
    dimensions : list-like of indexes or slice, default=all
        indices of dimensions you like to keep.
    stride : int, default=1
        only take every n'th frame.
    skip : int, default=0
        initially skip n frames of each file.
    chunk: int, default=None
        How many frames to process at once. If not given, obtain the chunk size
        from the source.

    Returns
    -------
    output : list of ndarray(T_i, d)
        the mapped data, where T is the number of time steps of the input data, or
        if stride > 1, floor(T_in / stride). d is the output dimension of this transformer.
        If the input consists of a list of trajectories, Y will also be a
        corresponding list of trajectories

    """
    if isinstance(dimensions, int):
        ndim = 1
        dimensions = slice(dimensions, dimensions + 1)
    elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
        if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
            raise ValueError('dimension indices can\'t have more than one dimension')
        ndim = len(np.zeros(self.ndim)[dimensions])
    else:
        raise ValueError('unsupported type (%s) of "dimensions"' % type(dimensions))

    assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

    if chunk is None:
        chunk = self.chunksize

    # create iterator
    if self.in_memory and not self._mapping_to_mem_active:
        from pyemma.coordinates.data.data_in_memory import DataInMemory
        assert self._Y is not None
        it = DataInMemory(self._Y)._create_iterator(skip=skip, chunk=chunk,
                                                    stride=stride, return_trajindex=True)
    else:
        it = self._create_iterator(skip=skip, chunk=chunk, stride=stride, return_trajindex=True)

    with it:
        # allocate memory
        try:
            from pyemma import config
            if config.coordinates_check_output:
                trajs = [np.full((l, ndim), np.nan, dtype=self.output_type())
                         for l in it.trajectory_lengths()]
            else:
                # TODO: avoid having a copy here, if Y is already filled
                trajs = [np.empty((l, ndim), dtype=self.output_type())
                         for l in it.trajectory_lengths()]
        except MemoryError:
            self.logger.exception("Could not allocate enough memory to map all data."
                                  " Consider using a larger stride.")
            return

        if self._logger_is_active(self._loglevel_DEBUG):
            self.logger.debug("get_output(): dimensions=%s" % str(dimensions))
            self.logger.debug("get_output(): created output trajs with shapes: %s"
                              % [x.shape for x in trajs])
            self.logger.debug("nchunks: %s, chunksize=%s" % (it.n_chunks, it.chunksize))
        # fetch data
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        pg.register(it.n_chunks, description='getting output of %s' % self.__class__.__name__)
        with pg.context(), it:
            for itraj, chunk in it:
                i = slice(it.pos, it.pos + len(chunk))
                assert i.stop - i.start > 0
                trajs[itraj][i, :] = chunk[:, dimensions]
                pg.update(1)

    if config.coordinates_check_output:
        for i, t in enumerate(trajs):
            finite = self._chunk_finite(t)
            if not np.all(finite):
                # determine position
                frames = np.where(np.logical_not(finite))
                if not len(frames):
                    raise RuntimeError('nothing got assigned for traj {}'.format(i))
                raise RuntimeError('unassigned sections in traj {i} in range [{frames}]'.format(frames=frames, i=i))

    return trajs
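# A short usage sketch of ``get_output``, following the same in-memory source pattern
# as the csv example above (values are illustrative):

import numpy as np
import pyemma

data = [np.random.random((100, 3))] * 2
reader = pyemma.coordinates.source(data)

# keep only the first two dimensions and take every 5th frame
out = reader.get_output(dimensions=[0, 1], stride=5)
print(len(out), out[0].shape)  # 2 trajectories, each of shape (20, 2)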
def frames_from_files(files, top, frames, chunksize=1000, stride=1, verbose=False,
                      copy_not_join=None, reader=None):
    """
    Constructs a Trajectory object out of given frames collected from files (or a given reader).

    :param files: source files
    :param top: topology file
    :param frames: indices
    :param chunksize:
    :param stride:
    :param verbose:
    :param copy_not_join: not used
    :param reader: if a reader is given, ignore files and top param!
    :return: mdtraj.Trajectory consisting of the given frame indices.
    """
    # Enforce topology to be a md.Topology object
    if reader is None:
        top = _enforce_top(top)
    else:
        if not reader.number_of_trajectories():
            raise ValueError("need at least one trajectory file in reader.")
        if isinstance(reader, FragmentedTrajectoryReader):
            top = reader._readers[0][0].featurizer.topology
        elif isinstance(reader, FeatureReader):
            top = reader.featurizer.topology
        else:
            raise ValueError("unsupported reader (only md readers).")

    stride = int(stride)
    frames = np.array(frames)

    # only one file, so we expect frames to be a one dimensional array
    if isinstance(files, str):
        files = [files]
        if frames.ndim == 1:
            # insert a constant column for the file index
            frames = np.insert(np.atleast_2d(frames), 0, np.zeros_like(frames), axis=0).T

    if stride != 1:
        frames[:, 1] *= int(stride)
        if verbose:
            log.info('A stride value of %u was parsed, '
                     'interpreting "indexes" accordingly.' % stride)

    # sort by file and frame index
    sort_inds = np.lexsort((frames[:, 1], frames[:, 0]))
    sorted_inds = frames[sort_inds]
    assert len(sorted_inds) == len(frames)

    file_inds_unique = np.unique(sorted_inds[:, 0])
    # construct reader
    if reader is None:
        # filter out files we would never read, because no indices point to them
        reader = source(np.array(files)[file_inds_unique].tolist(), top=top)
        # re-map indices to reflect the filtered files:
        for itraj, c in zip(file_inds_unique, itertools.count(0)):
            mask = sorted_inds[:, 0] == itraj
            sorted_inds[mask, 0] = c

        inds_to_check = np.arange(len(file_inds_unique))
    else:
        inds_to_check = file_inds_unique

    # sanity check of indices
    for itraj in inds_to_check:
        inds_by_traj = sorted_inds[sorted_inds[:, 0] == itraj]
        largest_ind_in_traj = np.max(inds_by_traj)
        length = reader.trajectory_length(itraj)
        if length < largest_ind_in_traj:
            raise ValueError("largest specified index (%i * stride %i = %i) "
                             "is larger than trajectory length '%s' = %i" %
                             (largest_ind_in_traj / stride, stride, largest_ind_in_traj,
                              reader.filenames[itraj], length))

    # we want the FeatureReader to return mdtraj.Trajectory objects
    if isinstance(reader, FeatureReader):
        reader._return_traj_obj = True
    elif isinstance(reader, FragmentedTrajectoryReader):
        for file in reader.filenames_flat:
            r = reader.reader_by_filename(file)
            if isinstance(r, FeatureReader):
                r = [r]
            for _r in r:
                _r._return_traj_obj = True

    it = reader.iterator(chunk=chunksize, stride=sorted_inds, return_trajindex=False)
    reporter = ProgressReporter()
    reporter._progress_register(it._n_chunks, description="collecting frames")
    collected_frames = []
    with it:
        for x in it:
            collected_frames.append(x)
            reporter._progress_update(1)
        reporter._progress_force_finish()

    dest = _preallocate_empty_trajectory(top, len(frames))
    i = 0
    for chunk in collected_frames:
        _copy_traj_attributes(dest, chunk, i)
        i += len(chunk)
    # restore the user-requested frame order
    dest = dest.slice(sort_inds.argsort(), copy=False)
    return dest
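# A hedged example of how ``frames_from_files`` could be called; the trajectory and
# topology paths below are placeholders, not files shipped with pyemma:

import numpy as np

files = ['traj_a.xtc', 'traj_b.xtc']   # hypothetical trajectory files
top = 'structure.pdb'                  # hypothetical topology

# pairs of (file index, frame index): frame 10 of traj_a and frame 5 of traj_b
frames = np.array([[0, 10], [1, 5]])

traj = frames_from_files(files, top, frames, chunksize=1000)
# ``traj`` is an mdtraj.Trajectory holding the two selected frames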
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                 show_progress=True, n_jobs=None, name='', core_set=None,
                 milestoning_method='last_core'):
    r""" Counts transitions at given lag time

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts at time indexes

          .. math::

             (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts at time indexes

          .. math::

             (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    show_progress: bool, default=True
        show the progress for the expensive effective count mode computation.

    n_jobs: int or None

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if core_set is not None and count_mode in ('sliding', 'sample'):
        if milestoning_method == 'last_core':
            # assign -1 frames to the last visited core
            for d in self._dtrajs:
                assert d[0] != -1
                while -1 in d:
                    mask = (d == -1)
                    d[mask] = d[np.roll(mask, -1)]
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')
        else:
            raise NotImplementedError('Milestoning method {} not implemented.'.format(milestoning_method))
    elif count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        if core_set is not None:
            raise RuntimeError('Cannot estimate core set MSM with effective counting.')
        from pyemma.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        from pyemma.util.contexts import nullcontext
        ctx = nullcontext()
        if 'callback' in argspec.args:  # msmtools effective cmatrix ready for multiprocessing?
            from pyemma._base.progress import ProgressReporter
            from pyemma._base.parallel import get_n_jobs

            kw['n_jobs'] = get_n_jobs() if n_jobs is None else n_jobs

            if show_progress:
                pg = ProgressReporter()
                # this is a fast operation
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(C_temp.nnz, '{}: compute stat. inefficiencies'.format(name), stage=0)
                del C_temp
                kw['callback'] = pg.update
                ctx = pg.context(stage=0)

        with ctx:
            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = \
            self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)

    # set sizes and count matrices on reversibly connected sets
    self._connected_set_sizes = np.zeros((len(self._connected_sets)))
    self._C_sub = np.empty((len(self._connected_sets)), dtype=object)
    for i in range(len(self._connected_sets)):
        # set size
        self._connected_set_sizes[i] = len(self._connected_sets[i])
        # submatrix
        # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n', show_progress=True):
    r""" Counts transitions at given lag time

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts at time indexes

          .. math::

             (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts at time indexes

          .. math::

             (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    show_progress: bool, default=True
        show the progress for the expensive effective count mode computation.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        from pyemma.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        if show_progress and 'callback' in argspec.args:
            from pyemma._base.progress import ProgressReporter
            from pyemma._base.parallel import get_n_jobs
            pg = ProgressReporter()
            # this is a fast operation
            C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
            pg.register(C_temp.nnz, 'compute statistical inefficiencies')
            del C_temp
            callback = lambda: pg.update(1)
            kw['callback'] = callback
            kw['n_jobs'] = get_n_jobs()

        self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = \
            self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)

    # set sizes and count matrices on reversibly connected sets
    self._connected_set_sizes = np.zeros((len(self._connected_sets)))
    self._C_sub = np.empty((len(self._connected_sets)), dtype=object)
    for i in range(len(self._connected_sets)):
        # set size
        self._connected_set_sizes[i] = len(self._connected_sets[i])
        # submatrix
        # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def _estimate(self, iterable, partial_fit=False):
    indim = iterable.dimension()
    if not indim:
        raise ValueError("zero dimension from data source!")

    if not any(iterable.trajectory_lengths(stride=self.stride, skip=self.lag + self.skip) > 0):
        if partial_fit:
            self.logger.warning("Could not use data passed to partial_fit(), "
                                "because no single data set [longest=%i] is longer than lag+skip [%i]",
                                max(iterable.trajectory_lengths(self.stride, skip=self.skip)),
                                self.lag + self.skip)
            return self
        else:
            raise ValueError("No single dataset [longest=%i] is longer than"
                             " lag+skip [%i]." % (max(iterable.trajectory_lengths(self.stride, skip=self.skip)),
                                                  self.lag + self.skip))

    self.logger.debug("will use %s total frames for %s",
                      iterable.trajectory_lengths(self.stride, skip=self.skip), self.name)

    chunksize = 0 if partial_fit else iterable.chunksize
    it = iterable.iterator(lag=self.lag, return_trajindex=False, stride=self.stride,
                           skip=self.skip, chunk=chunksize)
    # iterator over input weights
    if hasattr(self.weights, 'iterator'):
        if hasattr(self.weights, '_transform_array'):
            self.weights.data_producer = iterable
        it_weights = self.weights.iterator(lag=0, return_trajindex=False,
                                           stride=self.stride, skip=self.skip, chunk=chunksize)
        if it_weights.number_of_trajectories() != iterable.number_of_trajectories():
            raise ValueError("number of weight arrays did not match number of input data sets. {} vs. {}"
                             .format(it_weights.number_of_trajectories(),
                                     iterable.number_of_trajectories()))
    else:
        # if we only have a scalar, repeat it.
        import itertools
        it_weights = itertools.repeat(self.weights)

    # TODO: we could possibly optimize the case lag > 0 and c0t=False using skip.
    # Assess how much iterator hassle this would be.
    # self.skipped = 0
    pg = ProgressReporter()
    pg.register(it.n_chunks, 'calculate covariances', stage=0)
    with it, pg.context(stage=0):
        self._init_covar(partial_fit, it.n_chunks)
        for data, weight in zip(it, it_weights):
            if self.lag != 0:
                X, Y = data
            else:
                X, Y = data, None

            if weight is not None:
                if isinstance(weight, np.ndarray):
                    weight = weight.squeeze()[:len(X)]
                    # TODO: if the weight is exactly zero it makes no sense to add the chunk
                    # to the running moments. However, doing so leads to wrong results...
                    # if np.all(np.abs(weight) < np.finfo(np.float).eps):
                    #     # print("skip")
                    #     self.skipped += len(X)
                    #     continue
            if self.remove_constant_mean is not None:
                X = X - self.remove_constant_mean[np.newaxis, :]
                if Y is not None:
                    Y = Y - self.remove_constant_mean[np.newaxis, :]

            try:
                self._rc.add(X, Y, weights=weight)
            except MemoryError:
                raise MemoryError('Covariance matrix does not fit into memory. '
                                  'Input is too high-dimensional ({} dimensions).'.format(X.shape[1]))
            pg.update(1, stage=0)

    if partial_fit:
        if '_rc' not in self.__serialize_fields:
            self.__serialize_fields.append('_rc')
    else:
        if '_rc' in self.__serialize_fields:
            self.__serialize_fields.remove('_rc')
    return self
def test_below_threshold(self):
    # should not raise
    pg = ProgressReporter()
    pg.register(2)
    pg.update(1)
    pg.set_description('dummy')