class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''

    def __init__(self, parent):
        super(WESTKineticsBase, self).__init__(parent)

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        # This is actually applicable to both.
        self.assignments_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True

    def add_args(self, parser):
        self.progress.add_args(parser)
        self.data_reader.add_args(parser)
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        iogroup = parser.add_argument_group('input/output options')
        iogroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
            (default: %(default)s).''',
        )
        iogroup.add_argument(
            '-o',
            '--output',
            dest='output',
            default=self.default_output_file,
            help='''Store results in OUTPUT (default: %(default)s).''',
        )

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args, default_iter_step=None)
        if self.iter_range.iter_step is None:
            # use about 10 blocks by default
            self.iter_range.iter_step = max(1, (self.iter_range.iter_stop - self.iter_range.iter_start) // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
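
# Illustrative only: a minimal sketch of how a w_direct/w_reweight-style subcommand might
# build on WESTKineticsBase. ``default_output_file`` is consumed by add_args() above; the
# ``subcommand`` and ``help_text`` attribute names and this class itself are assumptions,
# not part of this module.
class ExampleKineticsSubcommand(WESTKineticsBase):
    subcommand = 'example'
    help_text = 'example subcommand reusing the common assignments/output handling'
    default_output_file = 'example.h5'

    def add_args(self, parser):
        # Inherit -a/--assignments, -o/--output, and the iteration-range options,
        # then add a subcommand-specific MCBS parameter.
        super().add_args(parser)
        cgroup = parser.add_argument_group('confidence interval options')
        cgroup.add_argument(
            '--alpha',
            type=float,
            default=0.05,
            help='''Calculate a (1-ALPHA) confidence interval (default: %(default)s).''',
        )

    def process_args(self, args):
        super().process_args(args)
        self.mcbs_alpha = args.alpha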
class WFluxanlTool(WESTTool):
    prog = 'w_fluxanl'
    description = '''\
Extract fluxes into pre-defined target states from WEST data, average, and
construct confidence intervals. Monte Carlo bootstrapping is used to account
for the correlated and possibly non-Gaussian statistical error in flux
measurements.

All non-graphical output (including that to the terminal and HDF5) assumes
that the propagation/resampling period ``tau`` is equal to unity; to obtain
results in familiar units, divide all fluxes and multiply all correlation
lengths by the true value of ``tau``.
'''

    output_format_version = 2

    def __init__(self):
        super(WFluxanlTool, self).__init__()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.output_h5file = None
        self.output_group = None
        self.target_groups = {}

        self.fluxdata = {}

        self.alpha = None
        self.autocorrel_alpha = None
        self.n_sets = None
        self.do_evol = False
        self.evol_step = 1

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='fluxanl.h5',
            help='Store intermediate data and analysis results to OUTPUT (default: %(default)s).',
        )

        cgroup = parser.add_argument_group('calculation options')
        cgroup.add_argument(
            '--disable-bootstrap',
            '-db',
            dest='bootstrap',
            action='store_const',
            const=False,
            help='''Disable the use of Monte Carlo Block Bootstrapping.''',
        )
        cgroup.add_argument(
            '--disable-correl',
            '-dc',
            dest='correl',
            action='store_const',
            const=False,
            help='''Disable the correlation analysis.''',
        )
        cgroup.add_argument(
            '-a',
            '--alpha',
            type=float,
            default=0.05,
            help='''Calculate a (1-ALPHA) confidence interval on the average flux
            (default: %(default)s)''',
        )
        cgroup.add_argument(
            '--autocorrel-alpha',
            type=float,
            dest='acalpha',
            metavar='ACALPHA',
            help='''Evaluate autocorrelation of flux to (1-ACALPHA) significance.
            Note that too small an ACALPHA will result in failure to detect autocorrelation
            in a noisy flux signal. (Default: same as ALPHA.)''',
        )
        cgroup.add_argument(
            '-N',
            '--nsets',
            type=int,
            help='''Use NSETS samples for bootstrapping (default: chosen based on ALPHA)''',
        )
        cgroup.add_argument(
            '--evol',
            action='store_true',
            dest='do_evol',
            help='''Calculate time evolution of flux confidence intervals (expensive).''',
        )
        cgroup.add_argument(
            '--evol-step',
            type=int,
            default=1,
            metavar='ESTEP',
            help='''Calculate time evolution of flux confidence intervals every ESTEP
            iterations (default: %(default)s)''',
        )

    def process_args(self, args):
        self.data_reader.process_args(args)
        self.data_reader.open()
        self.iter_range.data_manager = self.data_reader
        self.iter_range.process_args(args)

        self.output_h5file = h5py.File(args.output, 'w')

        self.alpha = args.alpha
        # Disable the bootstrap or the correlation analysis.
        self.mcbs_enable = args.bootstrap if args.bootstrap is not None else True
        self.do_correl = args.correl if args.correl is not None else True
        self.autocorrel_alpha = args.acalpha or self.alpha
        self.n_sets = args.nsets or mclib.get_bssize(self.alpha)

        self.do_evol = args.do_evol
        self.evol_step = args.evol_step or 1

    def calc_store_flux_data(self):
        westpa.rc.pstatus(
            'Calculating mean flux and confidence intervals for iterations [{},{})'.format(
                self.iter_range.iter_start, self.iter_range.iter_stop
            )
        )

        fluxdata = extract_fluxes(self.iter_range.iter_start, self.iter_range.iter_stop, self.data_reader)

        # Create a group to store data in
        output_group = h5io.create_hdf5_group(self.output_h5file, 'target_flux', replace=False, creating_program=self.prog)
        self.output_group = output_group
        output_group.attrs['version_code'] = self.output_format_version
        self.iter_range.record_data_iter_range(output_group)

        n_targets = len(fluxdata)
        index = np.empty((len(fluxdata),), dtype=target_index_dtype)
        avg_fluxdata = np.empty((n_targets,), dtype=ci_dtype)

        for itarget, (target_label, target_fluxdata) in enumerate(fluxdata.items()):
            # Create group and index entry
            index[itarget]['target_label'] = str(target_label)
            target_group = output_group.create_group('target_{}'.format(itarget))

            self.target_groups[target_label] = target_group

            # Store per-iteration values
            target_group['n_iter'] = target_fluxdata['n_iter']
            target_group['count'] = target_fluxdata['count']
            target_group['flux'] = target_fluxdata['flux']
            h5io.label_axes(target_group['flux'], ['n_iter'], units=['tau^-1'])

            # Calculate flux autocorrelation
            fluxes = target_fluxdata['flux']
            mean_flux = fluxes.mean()
            fmm = fluxes - mean_flux
            acorr = fftconvolve(fmm, fmm[::-1])
            acorr = acorr[len(acorr) // 2 :]
            acorr /= acorr[0]
            acorr_ds = target_group.create_dataset('flux_autocorrel', data=acorr)
            h5io.label_axes(acorr_ds, ['lag'], ['tau'])

            # Calculate overall averages and CIs
            # avg, lb_ci, ub_ci, correl_len = mclib.mcbs_ci_correl(fluxes, np.mean, self.alpha, self.n_sets,
            #                                                      autocorrel_alpha=self.autocorrel_alpha, subsample=np.mean)
            avg, lb_ci, ub_ci, sterr, correl_len = mclib.mcbs_ci_correl(
                {'dataset': fluxes},
                estimator=(lambda stride, dataset: np.mean(dataset)),
                alpha=self.alpha,
                n_sets=self.n_sets,
                autocorrel_alpha=self.autocorrel_alpha,
                subsample=np.mean,
                do_correl=self.do_correl,
                mcbs_enable=self.mcbs_enable,
            )
            avg_fluxdata[itarget] = (
                self.iter_range.iter_start,
                self.iter_range.iter_stop,
                avg,
                lb_ci,
                ub_ci,
                sterr,
                correl_len,
            )
            westpa.rc.pstatus('target {!r}:'.format(target_label))
            westpa.rc.pstatus('  correlation length = {} tau'.format(correl_len))
            westpa.rc.pstatus('  mean flux and CI = {:e} ({:e},{:e}) tau^(-1)'.format(avg, lb_ci, ub_ci))
            index[itarget]['mean_flux'] = avg
            index[itarget]['mean_flux_ci_lb'] = lb_ci
            index[itarget]['mean_flux_ci_ub'] = ub_ci
            index[itarget]['mean_flux_correl_len'] = correl_len

        # Write index and summary
        index_ds = output_group.create_dataset('index', data=index)
        index_ds.attrs['mcbs_alpha'] = self.alpha
        index_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
        index_ds.attrs['mcbs_n_sets'] = self.n_sets

        self.fluxdata = fluxdata
        self.output_h5file['avg_flux'] = avg_fluxdata

    def calc_evol_flux(self):
        westpa.rc.pstatus(
            'Calculating cumulative evolution of flux confidence intervals every {} iteration(s)'.format(self.evol_step)
        )

        for itarget, (target_label, target_fluxdata) in enumerate(self.fluxdata.items()):
            fluxes = target_fluxdata['flux']
            target_group = self.target_groups[target_label]
            iter_start = target_group['n_iter'][0]
            iter_stop = target_group['n_iter'][-1]
            iter_count = iter_stop - iter_start
            n_blocks = iter_count // self.evol_step
            if iter_count % self.evol_step > 0:
                n_blocks += 1

            cis = np.empty((n_blocks,), dtype=ci_dtype)

            for iblock in range(n_blocks):
                block_iter_stop = min(iter_start + (iblock + 1) * self.evol_step, iter_stop)
                istop = min((iblock + 1) * self.evol_step, len(target_fluxdata['flux']))
                fluxes = target_fluxdata['flux'][:istop]

                # avg, ci_lb, ci_ub, correl_len = mclib.mcbs_ci_correl(fluxes, np.mean, self.alpha, self.n_sets,
                #                                                      autocorrel_alpha=self.autocorrel_alpha,
                #                                                      subsample=np.mean)
                avg, ci_lb, ci_ub, sterr, correl_len = mclib.mcbs_ci_correl(
                    {'dataset': fluxes},
                    estimator=(lambda stride, dataset: np.mean(dataset)),
                    alpha=self.alpha,
                    n_sets=self.n_sets,
                    autocorrel_alpha=self.autocorrel_alpha,
                    subsample=np.mean,
                    do_correl=self.do_correl,
                    mcbs_enable=self.mcbs_enable,
                )
                cis[iblock]['iter_start'] = iter_start
                cis[iblock]['iter_stop'] = block_iter_stop
                cis[iblock]['expected'], cis[iblock]['ci_lbound'], cis[iblock]['ci_ubound'] = avg, ci_lb, ci_ub
                cis[iblock]['corr_len'] = correl_len
                cis[iblock]['sterr'] = sterr

                del fluxes

            cis_ds = target_group.create_dataset('flux_evolution', data=cis)
            cis_ds.attrs['iter_step'] = self.evol_step
            cis_ds.attrs['mcbs_alpha'] = self.alpha
            cis_ds.attrs['mcbs_autocorrel_alpha'] = self.autocorrel_alpha
            cis_ds.attrs['mcbs_n_sets'] = self.n_sets

    def go(self):
        self.calc_store_flux_data()
        if self.do_evol:
            self.calc_evol_flux()
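
# A small post-processing sketch: rescale the averages stored by w_fluxanl into physical
# units, per the note in the description that all output assumes tau = 1. The
# /target_flux/index field names follow calc_store_flux_data() above; the tau value and
# file name are placeholders, and ``h5py`` comes from this module's imports.
def rescale_fluxanl_output(tau=10.0, filename='fluxanl.h5'):
    '''Print mean fluxes and confidence intervals rescaled by the true value of tau.'''
    with h5py.File(filename, 'r') as h5file:
        for row in h5file['target_flux/index'][...]:
            # divide fluxes by tau; multiply correlation lengths by tau
            flux, ci_lb, ci_ub = (row[name] / tau for name in ('mean_flux', 'mean_flux_ci_lb', 'mean_flux_ci_ub'))
            correl_len = row['mean_flux_correl_len'] * tau
            print(
                '{}: {:.3e} ({:.3e}, {:.3e}); correlation length {:.1f}'.format(
                    row['target_label'], flux, ci_lb, ci_ub, correl_len
                )
            )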
class WPDist(WESTParallelTool):
    prog = 'w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE
datasets.

-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ``pcoord`` is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.

-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.

  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.

  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first
    dimension, B21, B22, B23, ... for the second dimension, and so on. These
    bin boundaries need not be uniformly spaced.

These expressions will be evaluated with Python's ``eval`` construct, with
``np`` available for use [e.g. to specify bins using np.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.

  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.

  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.

  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).

-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.

-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super().__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.input_wdssynth = WESTWDSSynthesizer(default_dsname='seg_index')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False
        self.binspec = None
        self.output_filename = None
        self.output_file = None

        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # These are used during histogram generation only
        self.iter_start = None
        self.iter_stop = None
        self.ndim = None
        self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        parser.add_argument(
            '-b',
            '--bins',
            dest='bins',
            metavar='BINEXPR',
            default='100',
            help='''Use BINEXPR for bins. This may be an integer, which will be used for each
            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin
            boundaries for the second dimension, and so on.
            (Default: 100 bins in each dimension.)''',
        )

        parser.add_argument(
            '-o',
            '--output',
            dest='output',
            default='pdist.h5',
            help='''Store results in OUTPUT (default: %(default)s).''',
        )
        parser.add_argument(
            '-C',
            '--compress',
            action='store_true',
            help='''Compress histograms. May make storage of higher-dimensional histograms
            more tractable, at the (possibly extreme) expense of increased analysis time.
            (Default: no compression.)''',
        )
        parser.add_argument(
            '--loose',
            dest='ignore_out_of_range',
            action='store_true',
            help='''Ignore values that do not fall within bins. (Risky, as this can make buggy
            bin boundaries appear as reasonable data. Only use if you are sure of your bin
            boundary specification.)''',
        )

        igroup = parser.add_argument_group('input dataset options').add_mutually_exclusive_group(required=False)
        igroup.add_argument(
            '--construct-dataset',
            help='''Use the given function (as in module.function) to extract source data.
            This function will be called once per iteration as function(n_iter, iter_group)
            to construct data for one iteration. Data returned must be indexable as
            [seg_id][timepoint][dimension]''',
        )
        igroup.add_argument(
            '--dsspecs',
            nargs='+',
            metavar='DSSPEC',
            help='''Construct probability distribution from one or more DSSPECs.''',
        )

        wgroup = parser.add_argument_group('input weight dataset options').add_mutually_exclusive_group(required=False)
        wgroup.add_argument(
            '--construct-wdataset',
            help='''Use the given function (as in module.function) to extract weight data.
            This function will be called once per iteration as function(n_iter, iter_group)
            to construct data for one iteration. Data returned must be indexable as [seg_id]''',
        )

        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        self.input_dssynth.h5filename = self.data_reader.we_h5filename
        self.input_dssynth.process_args(args)
        self.dsspec = self.input_dssynth.dsspec

        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with self.data_reader:
            self.iter_range.process_args(args)

        # Reading potential custom weights
        self.input_wdssynth.h5filename = self.data_reader.we_h5filename
        self.input_wdssynth.process_args(args)
        self.wt_dsspec = self.input_wdssynth.dsspec

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False

    def go(self):
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            h5io.stamp_creator_data(self.output_file)

            self.iter_start = self.iter_range.iter_start
            self.iter_stop = self.iter_range.iter_stop

            # Construct bin boundaries
            self.construct_bins(self.parse_binspec(self.binspec))
            for idim, (binbounds, midpoints) in enumerate(zip(self.binbounds, self.midpoints)):
                self.output_file['binbounds_{}'.format(idim)] = binbounds
                self.output_file['midpoints_{}'.format(idim)] = midpoints

            # construct histogram
            self.construct_histogram()

            # Record iteration range
            iter_range = self.iter_range.iter_range()
            self.output_file['n_iter'] = iter_range
            self.iter_range.record_data_iter_range(self.output_file['histograms'])

            self.output_file.close()

    @staticmethod
    def parse_binspec(binspec):
        namespace = {'numpy': np, 'np': np, 'inf': float('inf')}

        try:
            binspec_compiled = eval(binspec, namespace)
        except Exception as e:
            raise ValueError('invalid bin specification: {!r}'.format(e))
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled

    def construct_bins(self, bins):
        '''
        Construct bins according to ``bins``, which may be:

          1) A scalar integer (for that number of bins in each dimension)
          2) A sequence of integers (specifying number of bins for each dimension)
          3) A sequence of sequences of bin boundaries (specifying boundaries for each dimension)

        Sets ``self.binbounds`` to a list of arrays of bin boundaries appropriate for passing to
        fasthist.histnd, along with ``self.midpoints`` to the midpoints of the bins.
        '''

        if not isiterable(bins):
            self._construct_bins_from_scalar(bins)
        elif not isiterable(bins[0]):
            self._construct_bins_from_int_seq(bins)
        else:
            self._construct_bins_from_bound_seqs(bins)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))

    def scan_data_shape(self):
        if self.ndim is None:
            dset = self.dsspec.get_iter_data(self.iter_start)
            self.ntimepoints = dset.shape[1]
            self.ndim = dset.shape[2]
            self.dset_dtype = dset.dtype

    def scan_data_range(self):
        '''Scan input data for range in each dimension. The number of dimensions is determined
        from the shape of the progress coordinate as of self.iter_start.'''

        self.progress.indicator.new_operation('Scanning for data range', self.iter_stop - self.iter_start)
        self.scan_data_shape()

        dset_dtype = self.dset_dtype
        ndim = self.ndim
        dsspec = self.dsspec

        try:
            minval = np.finfo(dset_dtype).min
            maxval = np.finfo(dset_dtype).max
        except ValueError:
            minval = np.iinfo(dset_dtype).min
            maxval = np.iinfo(dset_dtype).max

        data_range = self.data_range = [(maxval, minval) for _i in range(self.ndim)]

        # futures = []
        # for n_iter in xrange(self.iter_start, self.iter_stop):
        #     _remote_min_max(ndim, dset_dtype, n_iter, dsspec)
        #     futures.append(self.work_manager.submit(_remote_min_max, args=(ndim, dset_dtype, n_iter, dsspec)))

        # for future in self.work_manager.as_completed(futures):
        for future in self.work_manager.submit_as_completed(
            ((_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {}) for n_iter in range(self.iter_start, self.iter_stop)),
            self.max_queue_len,
        ):
            bounds = future.get_result(discard=True)
            for idim in range(ndim):
                current_min, current_max = data_range[idim]
                current_min = min(current_min, bounds[idim][0])
                current_max = max(current_max, bounds[idim][1])
                data_range[idim] = (current_min, current_max)
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = np.linspace(lb, ub, bins + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_int_seq(self, bins):
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub *= 1.01

            boundset = np.linspace(lb, ub, bins[idim] + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_bound_seqs(self, bins):
        self.binbounds = []
        self.midpoints = []
        for boundset in bins:
            boundset = np.asarray(boundset)
            if (np.diff(boundset) <= 0).any():
                raise ValueError('boundary set {!r} is not strictly monotonically increasing'.format(boundset))
            self.binbounds.append(boundset)
            self.midpoints.append((boundset[:-1] + boundset[1:]) / 2.0)

    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.
        The time series of histogram values is stored in ``histograms``. Each histogram in
        the time series is normalized.'''

        self.scan_data_shape()

        iter_count = self.iter_stop - self.iter_start
        histograms_ds = self.output_file.create_dataset(
            'histograms',
            dtype=np.float64,
            shape=((iter_count,) + tuple(len(bounds) - 1 for bounds in self.binbounds)),
            compression=9 if self.compress_output else None,
        )
        binbounds = [np.require(boundset, self.dset_dtype, 'C') for boundset in self.binbounds]

        self.progress.indicator.new_operation('Constructing histograms', self.iter_stop - self.iter_start)
        task_gen = (
            (
                _remote_bin_iter,
                (iiter, n_iter, self.dsspec, self.wt_dsspec, 1 if iiter > 0 else 0, binbounds, self.ignore_out_of_range),
                {},
            )
            for (iiter, n_iter) in enumerate(range(self.iter_start, self.iter_stop))
        )
        # futures = set()
        # for iiter, n_iter in enumerate(xrange(self.iter_start, self.iter_stop)):
        #     initpoint = 1 if iiter > 0 else 0
        #     futures.add(self.work_manager.submit(_remote_bin_iter,
        #                                          args=(iiter, n_iter, self.dsspec, self.wt_dsspec, initpoint, binbounds)))

        # for future in self.work_manager.as_completed(futures):
        #     future = self.work_manager.wait_any(futures)
        # for future in self.work_manager.submit_as_completed(task_gen, self.queue_size):
        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
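
# Illustrative --construct-dataset function for w_pdist, following the contract described
# above: called as function(n_iter, iter_group) and returning data indexable as
# [segment][timepoint][dimension]. The 'auxdata/rmsd' dataset is hypothetical; only
# 'pcoord' is guaranteed to exist, and ``np`` comes from this module's imports.
# A possible invocation, assuming this function lives in my_module.py:
#   w_pdist --construct-dataset my_module.rmsd_and_pcoord -b '[100,50]' -o pdist.h5
def rmsd_and_pcoord(n_iter, iter_group):
    '''Stack the first progress-coordinate dimension with a (hypothetical) RMSD auxiliary dataset.'''
    pcoord = iter_group['pcoord'][:, :, 0]  # [segment][timepoint]
    rmsd = iter_group['auxdata/rmsd'][...]  # [segment][timepoint] (hypothetical auxiliary dataset)
    return np.dstack([pcoord, rmsd])  # -> [segment][timepoint][dimension]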
class WNTopTool(WESTTool):
    prog = 'w_ntop'
    description = '''\
Select walkers from bins. An assignment file mapping walkers to bins at each
timepoint is required (see ``w_assign --help`` for further information on
generating this file). By default, high-weight walkers are selected (hence the
name ``w_ntop``: select the N top-weighted walkers from each bin); however,
minimum weight walkers and randomly-selected walkers may be selected instead.

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/nsegs`` [iteration][bin]
    *(Integer)* Number of segments in each bin/state in the given iteration.
    This will generally be the same as the number requested with
    ``-n/--count`` but may be smaller if the requested number of walkers
    does not exist.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin. For an
    iteration ``n_iter``, only the first ``nsegs[n_iter]`` entries are valid.
    For example, the full list of matching seg_ids in bin 0 in the first
    stored iteration is ``seg_ids[0][0][:nsegs[0][0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.

-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WNTopTool, self).__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.assignments_filename = None
        self.output_filename = None
        self.what = None
        self.timepoint = None
        self.count = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        igroup = parser.add_argument_group('input options')
        igroup.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Use assignments from the given ASSIGNMENTS file (default: %(default)s).''',
        )

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-n',
            '--count',
            type=int,
            default=1,
            help='''Select COUNT walkers from each iteration for each bin (default: %(default)s).''',
        )
        sgroup.add_argument(
            '-t',
            '--timepoint',
            type=int,
            default=-1,
            help='''Base selection on the given TIMEPOINT within each iteration. Default (-1)
            corresponds to the last timepoint.''',
        )
        cgroup = parser.add_mutually_exclusive_group()
        cgroup.add_argument(
            '--highweight',
            dest='select_what',
            action='store_const',
            const='highweight',
            help='''Select COUNT highest-weight walkers from each bin.''',
        )
        cgroup.add_argument(
            '--lowweight',
            dest='select_what',
            action='store_const',
            const='lowweight',
            help='''Select COUNT lowest-weight walkers from each bin.''',
        )
        cgroup.add_argument(
            '--random',
            dest='select_what',
            action='store_const',
            const='random',
            help='''Select COUNT walkers randomly from each bin.''',
        )
        parser.set_defaults(select_what='highweight')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='ntop.h5', help='''Write output to OUTPUT (default: %(default)s).''')

        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)
        self.what = args.select_what
        self.output_filename = args.output
        self.assignments_filename = args.assignments
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins'] + 1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start, iter_stop)]

        output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))

        seg_count_ds = output_file.create_dataset('nsegs', dtype=np.uint, shape=(iter_count, nbins))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, nbins, count),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, nbins, count), seg_id_dtype),
            shuffle=True,
            compression=9,
        )
        weights_ds = output_file.create_dataset(
            'weights',
            shape=(iter_count, nbins, count),
            dtype=weight_dtype,
            chunks=h5io.calc_chunksize((iter_count, nbins, count), weight_dtype),
            shuffle=True,
            compression=9,
        )
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                assignments = np.require(
                    assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + np.index_exp[:, timepoint]],
                    dtype=westpa.binning.index_dtype,
                )
                all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                # for iseg in xrange(nsegs[iiter]):
                #     segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(nsegs[iiter], nbins, assignments)
                for ibin in range(nbins):
                    segs = np.nonzero(segs_by_bin[:, ibin])[0]
                    # Never select (or store) more walkers than actually exist in the bin
                    n_sel = min(len(segs), count)
                    seg_count_ds[iiter, ibin] = n_sel

                    if n_sel:
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = np.argsort(weights)[:n_sel]
                        elif what == 'highweight':
                            indices = np.argsort(weights)[::-1][:n_sel]
                        else:
                            assert what == 'random'
                            indices = np.random.permutation(len(weights))[:n_sel]

                        matching_segs_ds[iiter, ibin, :n_sel] = segs.take(indices)
                        weights_ds[iiter, ibin, :n_sel] = weights.take(indices)
                        del weights
                    del segs

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
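
# A reader sketch for the ntop.h5 layout documented above; the dataset names match those
# created in go(). The file name and bin index are placeholders, and ``h5py`` comes from
# this module's imports.
def print_top_walkers(filename='ntop.h5', ibin=0):
    '''Print the valid seg_ids and weights in bin ``ibin`` for each stored iteration.'''
    with h5py.File(filename, 'r') as h5file:
        n_iters = h5file['n_iter'][...]
        nsegs = h5file['nsegs'][...]  # [iteration][bin]
        seg_ids = h5file['seg_ids'][...]  # [iteration][bin][segment]
        weights = h5file['weights'][...]  # [iteration][bin][segment]

    for iiter, n_iter in enumerate(n_iters):
        nvalid = nsegs[iiter, ibin]  # only the first nvalid entries are meaningful
        print(
            'iteration {}: seg_ids {}, weights {}'.format(
                n_iter, seg_ids[iiter, ibin, :nvalid], weights[iiter, ibin, :nvalid]
            )
        )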
class WCrawl(WESTParallelTool):
    prog = 'w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and
no guarantees are made about evaluation order.

-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super(WCrawl, self).__init__()

        # These are used throughout
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection(self.data_reader)

        self.crawler = None
        self.task_callable = None

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        tgroup = parser.add_argument_group('task options')
        tgroup.add_argument(
            '-c',
            '--crawler-instance',
            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
            WESTPACrawler to coordinate the calculation. Required only if initialization,
            finalization, or task result processing is required.''',
        )
        tgroup.add_argument(
            'task_callable',
            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
            Required.''',
        )
        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])
        if args.crawler_instance is not None:
            self.crawler = get_object(args.crawler_instance, path=['.'])
        else:
            self.crawler = WESTPACrawler()

    def go(self):
        iter_start = self.iter_range.iter_start
        iter_stop = self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        self.data_reader.open('r')

        pi = self.progress.indicator
        with pi:
            pi.operation = 'Initializing'
            self.crawler.initialize(iter_start, iter_stop)

            try:
                pi.new_operation('Dispatching tasks & processing results', iter_count)
                task_gen = ((_remote_task, (n_iter, self.task_callable), {}) for n_iter in range(iter_start, iter_stop))
                for future in self.work_manager.submit_as_completed(task_gen, self.max_queue_len):
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter, result)
                    pi.progress += 1
            finally:
                pi.new_operation('Finalizing')
                self.crawler.finalize()
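
# Illustrative task callable and crawler for w_crawl. The crawler methods mirror how
# WCrawl.go() invokes them above (initialize/process_iter_result/finalize); the
# (n_iter, iter_group) signature of the task callable is an assumption about how
# _remote_task dispatches it, and the output file name is a placeholder. ``np`` and
# WESTPACrawler come from this module's imports.
def mean_final_pcoord(n_iter, iter_group):
    '''Per-iteration task: weighted mean of the final progress-coordinate value.'''
    pcoord = iter_group['pcoord'][:, -1, 0]
    weights = iter_group['seg_index']['weight']
    return np.average(pcoord, weights=weights)


class MeanPcoordCrawler(WESTPACrawler):
    '''Collect per-iteration results and write them out during finalization.'''

    def initialize(self, iter_start, iter_stop):
        self.iter_start = iter_start
        self.results = np.zeros(iter_stop - iter_start)

    def process_iter_result(self, n_iter, result):
        self.results[n_iter - self.iter_start] = result

    def finalize(self):
        np.savetxt('mean_pcoord.dat', self.results)


# Possible invocation, assuming these objects live in my_module.py:
#   w_crawl my_module.mean_final_pcoord -c my_module.crawler
crawler = MeanPcoordCrawler()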
class WSelectTool(WESTParallelTool):
    prog = 'w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function. By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.

-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.

-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For an iteration
    ``n_iter``, only the first ``n_segs[n_iter]`` entries are valid. For
    example, the full list of matching seg_ids in the first stored iteration
    is ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.

-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        super().__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-p',
            '--predicate-function',
            metavar='MODULE.FUNCTION',
            help='''Use the given predicate function to match segments. This function should
            take an iteration number and the HDF5 group corresponding to that iteration and
            return a sequence of seg_ids matching the predicate, as in
            ``match_predicate(n_iter, iter_group)``.''',
        )
        sgroup.add_argument('-v', '--invert', dest='invert', action='store_true', help='''Invert the match predicate.''')
        sgroup.add_argument(
            '-a',
            '--include-ancestors',
            action='store_true',
            help='''Include ancestors of matched segments in output.''',
        )

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument('-o', '--output', default='select.h5', help='''Write output to OUTPUT (default: %(default)s).''')

        self.progress.add_args(parser)

    def process_args(self, args):
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        predicate = get_object(args.predicate_function, path=['.'])
        if not callable(predicate):
            raise TypeError('predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs', dtype=np.uint, shape=(iter_count,))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
            shuffle=True,
            compression=9,
        )
        weights_ds = output_file.create_dataset(
            'weights',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=weight_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), weight_dtype),
            shuffle=True,
            compression=9,
        )

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            # futures = set()
            # for n_iter in xrange(iter_start,iter_stop):
            #     futures.add(self.work_manager.submit(_find_matching_segments,
            #                                          args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

            # for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(
                (
                    (_find_matching_segments, (self.data_reader.we_h5filename, n_iter, self.predicate, self.invert), {})
                    for n_iter in range(iter_start, iter_stop)
                ),
                self.max_queue_len,
            ):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    seg_count_ds[n_iter - iter_start] = n_matches
                    matching_segs_ds[n_iter - iter_start, :n_matches] = matching_ids
                    weights_ds[n_iter - iter_start, :n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index'][
                        'weight'
                    ][sorted(matching_ids)]
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
                for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(matching_segs_ds[iiter, : seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        matching_segs_ds[iiter, :n_matches] = matching_ids
                        weights_ds[iiter, :n_matches] = self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][
                            sorted(matching_ids)
                        ]
                        parent_ids = self.data_reader.get_iter_group(n_iter)['seg_index']['parent_id'][sorted(matching_ids)]
                        from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0)  # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
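
# Illustrative predicate for w_select, following the documented contract: callable as
# predicate(n_iter, iter_group) and returning a collection of matching seg_ids. The 5.0
# cutoff on the final progress-coordinate value is arbitrary, and ``np`` comes from this
# module's imports. A possible invocation, assuming this function lives in my_module.py:
#   w_select -p my_module.pcoord_below_cutoff --include-ancestors -o select.h5
def pcoord_below_cutoff(n_iter, iter_group):
    '''Return seg_ids whose final progress-coordinate value falls below 5.0.'''
    final_pcoord = iter_group['pcoord'][:, -1, 0]
    return np.flatnonzero(final_pcoord < 5.0)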