Ejemplo n.º 1
0
class WESTKineticsBase(WESTSubcommand):
    '''
    Common argument processing for w_direct/w_reweight subcommands.
    Mostly limited to handling input and output from w_assign.
    '''

    def __init__(self, parent):
        '''
        Create the helper components and set every piece of state to its
        "not yet configured" value.

        ``parent`` is forwarded unchanged to the base-class initialiser.
        '''
        super(WESTKineticsBase,self).__init__(parent)

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        self.output_filename = None
        # This is actually applicable to both.
        # ``process_args`` stores the -a/--assignments value in
        # ``assignments_filename`` (plural), so that attribute must exist from
        # construction onwards. Previously only the singular spelling below
        # was initialised, so reading the plural name before ``process_args``
        # had run would raise AttributeError.
        self.assignments_filename = None
        # Singular spelling kept so any code that still reads it keeps working.
        self.assignment_filename = None

        self.output_file = None
        self.assignments_file = None

        self.evolution_mode = None

        self.mcbs_alpha = None
        self.mcbs_acalpha = None
        self.mcbs_nsets = None

        # Now we're adding in things that come from the old w_kinetics
        self.do_compression = True


    def add_args(self, parser):
        '''
        Register command-line arguments on ``parser``.

        The progress, data reader and iteration range components add their own
        arguments first (the iteration range with its ``iter_step`` option
        switched on), followed by the -a/--assignments and -o/--output options.

        The default for -o/--output is read from ``self.default_output_file``,
        which is not set anywhere in this class.
        NOTE(review): presumably subclasses provide it -- confirm.
        '''
        for component in (self.progress, self.data_reader):
            component.add_args(parser)

        # The step option must be enabled before the iteration range
        # component registers its arguments.
        self.iter_range.include_args['iter_step'] = True
        self.iter_range.add_args(parser)

        io_options = parser.add_argument_group('input/output options')
        io_options.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Bin assignments and macrostate definitions are in ASSIGNMENTS
                            (default: %(default)s).''',
        )
        io_options.add_argument(
            '-o',
            '--output',
            dest='output',
            default=self.default_output_file,
            help='''Store results in OUTPUT (default: %(default)s).''',
        )

    def process_args(self, args):
        '''
        Apply parsed command-line arguments ``args`` to this object.

        The iteration range is processed while the data reader is open. If no
        iteration step results from that, one is chosen so that the selected
        range is split into about ten blocks (at least one iteration each).
        The output and assignments file names are recorded last.
        '''
        self.progress.process_args(args)
        self.data_reader.process_args(args)

        iter_range = self.iter_range
        with self.data_reader:
            iter_range.process_args(args, default_iter_step=None)

        if iter_range.iter_step is None:
            # use about 10 blocks by default
            span = iter_range.iter_stop - iter_range.iter_start
            iter_range.iter_step = max(1, span // 10)

        self.output_filename = args.output
        self.assignments_filename = args.assignments
Ejemplo n.º 2
0
class WPDist(WESTParallelTool):
    prog = 'w_pdist'
    description = '''\
Calculate time-resolved, multi-dimensional probability distributions of WE
datasets.


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is provided either by a user-specified function
(--construct-dataset) or a list of "data set specifications" (--dsspecs).
If neither is provided, the progress coordinate dataset ''pcoord'' is used.

To use a custom function to extract or calculate data whose probability
distribution will be calculated, specify the function in standard Python
MODULE.FUNCTION syntax as the argument to --construct-dataset. This function
will be called as function(n_iter,iter_group), where n_iter is the iteration
whose data are being considered and iter_group is the corresponding group
in the main WEST HDF5 file (west.h5). The function must return data which can
be indexed as [segment][timepoint][dimension].

To use a list of data set specifications, specify --dsspecs and then list the
desired datasets one-by-one (space-separated in most shells). These data set
specifications are formatted as NAME[,file=FILENAME,slice=SLICE], which will
use the dataset called NAME in the HDF5 file FILENAME (defaulting to the main
WEST HDF5 file west.h5), and slice it with the Python slice expression SLICE
(as in [0:2] to select the first two elements of the first axis of the
dataset). The ``slice`` option is most useful for selecting one column (or
more) from a multi-column dataset, such as arises when using a progress
coordinate of multiple dimensions.


-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.

  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.

  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first dimension,
    B21, B22, B23, ... for the second dimension, and so on. These bin
    boundaries need not be uniformly spaced. These expressions will be
    evaluated with Python's ``eval`` construct, with ``np`` available for
    use [e.g. to specify bins using np.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "pdist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.

  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.

  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.

  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).


-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "pdist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.


-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------

'''

    def __init__(self):
        '''Create the tool's helper components and reset all processing state.'''
        super().__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # Components and settings used throughout the run
        self.progress = ProgressIndicatorComponent()
        self.data_reader = WESTDataReader()
        self.input_dssynth = WESTDSSynthesizer(default_dsname='pcoord')
        self.input_wdssynth = WESTWDSSynthesizer(default_dsname='seg_index')
        self.iter_range = IterRangeSelection(self.data_reader)
        self.iter_range.include_args['iter_step'] = False

        # Filled in by process_args() / go()
        self.binspec = None
        self.output_filename = None
        self.output_file = None
        self.dsspec = None
        self.wt_dsspec = None  # dsspec for weights

        # State used during histogram generation only
        self.iter_start = self.iter_stop = None
        self.ndim = self.ntimepoints = None
        self.dset_dtype = None
        self.binbounds = None  # bin boundaries for each dimension
        self.midpoints = None  # bin midpoints for each dimension
        self.data_range = None  # data range for each dimension, as the pairs (min,max)
        self.ignore_out_of_range = False
        self.compress_output = False

    def add_args(self, parser):
        '''Register all command-line arguments of this tool on ``parser``.

        Arguments are added in this order: data reader, iteration range, -b/--bins,
        -o/--output, -C/--compress, --loose, the mutually exclusive input dataset
        options (--construct-dataset / --dsspecs), --construct-wdataset, and finally
        the progress indicator arguments.
        '''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        # Binning and output options
        parser.add_argument(
            '-b',
            '--bins',
            dest='bins',
            metavar='BINEXPR',
            default='100',
            help='''Use BINEXPR for bins. This may be an integer, which will be used for each
                            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
                            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
                            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
                            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries
                            for the second dimension, and so on. (Default: 100 bins in each dimension.)''',
        )
        parser.add_argument(
            '-o',
            '--output',
            dest='output',
            default='pdist.h5',
            help='''Store results in OUTPUT (default: %(default)s).''',
        )
        parser.add_argument(
            '-C',
            '--compress',
            action='store_true',
            help='''Compress histograms. May make storage of higher-dimensional histograms
                            more tractable, at the (possible extreme) expense of increased analysis time.
                            (Default: no compression.)''',
        )
        parser.add_argument(
            '--loose',
            dest='ignore_out_of_range',
            action='store_true',
            help='''Ignore values that do not fall within bins. (Risky, as this can make buggy bin
                            boundaries appear as reasonable data. Only use if you are
                            sure of your bin boundary specification.)''',
        )

        # Source data: a constructor function or a list of dsspecs, but not both
        dataset_options = parser.add_argument_group('input dataset options')
        dataset_choice = dataset_options.add_mutually_exclusive_group(required=False)
        dataset_choice.add_argument(
            '--construct-dataset',
            help='''Use the given function (as in module.function) to extract source data.
                            This function will be called once per iteration as function(n_iter, iter_group)
                            to construct data for one iteration. Data returned must be indexable as
                            [seg_id][timepoint][dimension]''',
        )
        dataset_choice.add_argument(
            '--dsspecs',
            nargs='+',
            metavar='DSSPEC',
            help='''Construct probability distribution from one or more DSSPECs.''',
        )

        # Optional custom weights
        weight_options = parser.add_argument_group('input weight dataset options')
        weight_choice = weight_options.add_mutually_exclusive_group(required=False)
        weight_choice.add_argument(
            '--construct-wdataset',
            help='''Use the given function (as in module.function) to extract weight data.
                            This function will be called once per iteration as function(n_iter, iter_group)
                            to construct data for one iteration. Data returned must be indexable as
                            [seg_id]''',
        )

        self.progress.add_args(parser)

    def process_args(self, args):
        '''Apply parsed command-line arguments ``args`` to this tool.

        Sets ``dsspec`` and ``wt_dsspec`` from the two dataset synthesizers (both pointed
        at the data reader's HDF5 file name), processes the iteration range, and records
        the bin specification, output file name and the --loose / --compress flags.
        '''
        self.progress.process_args(args)

        reader = self.data_reader
        reader.process_args(args)

        # Source data
        data_synth = self.input_dssynth
        data_synth.h5filename = reader.we_h5filename
        data_synth.process_args(args)
        self.dsspec = data_synth.dsspec

        # Carrying an open HDF5 file across a fork() seems to corrupt the entire HDF5 library
        # Open the WEST HDF5 file just long enough to process our iteration range, then close
        # and reopen in go() [which executes after the fork]
        with reader:
            self.iter_range.process_args(args)

        # Reading potential custom weights
        weight_synth = self.input_wdssynth
        weight_synth.h5filename = reader.we_h5filename
        weight_synth.process_args(args)
        self.wt_dsspec = weight_synth.dsspec

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False

    def go(self):
        '''Run the calculation and write the results to ``self.output_filename``.

        Steps, in order: open the data reader, create the output file, build the bin
        boundaries from ``self.binspec`` and store them (with their midpoints) as
        ``binbounds_<i>`` / ``midpoints_<i>``, build the ``histograms`` dataset, and record
        the iteration numbers in ``n_iter``.

        The output file is closed when this method finishes, whether it succeeds or
        raises.
        '''
        self.data_reader.open('r')
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.output_file = h5py.File(self.output_filename, 'w')
            try:
                h5io.stamp_creator_data(self.output_file)

                self.iter_start = self.iter_range.iter_start
                self.iter_stop = self.iter_range.iter_stop

                # Construct bin boundaries
                self.construct_bins(self.parse_binspec(self.binspec))
                for idim, (binbounds, midpoints) in enumerate(zip(self.binbounds, self.midpoints)):
                    self.output_file['binbounds_{}'.format(idim)] = binbounds
                    self.output_file['midpoints_{}'.format(idim)] = midpoints

                # construct histogram
                self.construct_histogram()

                # Record iteration range
                iter_range = self.iter_range.iter_range()
                self.output_file['n_iter'] = iter_range
                self.iter_range.record_data_iter_range(self.output_file['histograms'])
            finally:
                # Previously the file was closed only on success, so an invalid bin
                # specification or a failure while histogramming left the handle open.
                self.output_file.close()

    @staticmethod
    def parse_binspec(binspec):
        '''Evaluate the bin specification ``binspec`` and return the resulting object.

        The expression is evaluated with ``numpy``/``np`` and ``inf`` available by name.
        The result is logged at DEBUG level.

        Raises ``ValueError`` (chained to the original exception) if evaluation fails.

        NOTE(security): this calls ``eval`` on text taken from the command line, so any
        Python expression supplied as the bin specification is executed. Builtins are
        not removed from the evaluation namespace. Only pass specifications from a
        trusted source.
        '''
        namespace = {'numpy': np, 'np': np, 'inf': float('inf')}

        try:
            binspec_compiled = eval(binspec, namespace)
        except Exception as e:
            # Chain explicitly so the traceback presents the underlying error as the cause.
            raise ValueError('invalid bin specification: {!r}'.format(e)) from e

        if log.isEnabledFor(logging.DEBUG):
            log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled

    def construct_bins(self, bins):
        '''
        Build bin boundaries from ``bins`` and store them on this object.

        ``bins`` is interpreted by its shape:

          * not iterable: a bin count applied to every dimension
          * iterable whose first element is not iterable: one bin count per dimension
          * otherwise: one sequence of explicit bin boundaries per dimension

        On return ``self.binbounds`` is a list of arrays of bin boundaries appropriate for
        passing to fasthist.histnd, and ``self.midpoints`` holds the matching bin midpoints.
        '''

        # Pick the builder first, then run it; only one of the three ever applies.
        if not isiterable(bins):
            build = self._construct_bins_from_scalar
        elif not isiterable(bins[0]):
            build = self._construct_bins_from_int_seq
        else:
            build = self._construct_bins_from_bound_seqs
        build(bins)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))

    def scan_data_shape(self):
        '''Record ``ntimepoints``, ``ndim`` and ``dset_dtype`` from the input data of iteration
        ``self.iter_start``. Does nothing when ``self.ndim`` has already been set.'''
        if self.ndim is not None:
            return

        first_iter_data = self.dsspec.get_iter_data(self.iter_start)
        data_shape = first_iter_data.shape
        # Axes 1 and 2 of the per-iteration data give the timepoint and dimension counts
        self.ntimepoints = data_shape[1]
        self.ndim = data_shape[2]
        self.dset_dtype = first_iter_data.dtype

    def scan_data_range(self):
        '''Determine the (min, max) of the input data in every dimension over the iterations
        ``self.iter_start`` up to (not including) ``self.iter_stop``, storing the result in
        ``self.data_range``. The number of dimensions comes from ``scan_data_shape()``.
        One ``_remote_min_max`` task per iteration is dispatched through the work manager.'''

        indicator = self.progress.indicator
        indicator.new_operation('Scanning for data range', self.iter_stop - self.iter_start)
        self.scan_data_shape()

        dset_dtype, ndim, dsspec = self.dset_dtype, self.ndim, self.dsspec

        # Extreme representable values of the dtype; finfo rejects integer dtypes
        # with ValueError, in which case iinfo is used instead.
        try:
            limits = np.finfo(dset_dtype)
        except ValueError:
            limits = np.iinfo(dset_dtype)
        minval, maxval = limits.min, limits.max

        # Start every dimension at the inverted range (largest, smallest) so that the
        # first result received replaces both ends.
        data_range = self.data_range = [(maxval, minval) for _ in range(ndim)]

        tasks = (
            (_remote_min_max, (ndim, dset_dtype, n_iter, dsspec), {})
            for n_iter in range(self.iter_start, self.iter_stop)
        )
        for future in self.work_manager.submit_as_completed(tasks, self.max_queue_len):
            bounds = future.get_result(discard=True)
            for idim in range(ndim):
                lowest, highest = data_range[idim]
                data_range[idim] = (min(lowest, bounds[idim][0]), max(highest, bounds[idim][1]))
            indicator.progress += 1

    @staticmethod
    def _advance_upper_bound(lb, ub):
        '''Return a value just above ``ub`` to use as the top bin boundary, so that the
        data maximum falls inside the last bin.

        Positive bounds are scaled by 1.01, exactly as before. Scaling a negative bound
        by 1.01 would move it *down* (excluding the maximum), and scaling zero does
        nothing, so those two cases are handled separately.
        '''
        if ub < 0:
            # Shrink the magnitude so the bound moves towards zero, i.e. upwards.
            return ub * 0.99
        if ub == 0:
            # Step by 1% of the data range; fall back to a fixed step when the range
            # is empty (all data equal to zero).
            span = ub - lb
            return 0.01 * span if span > 0 else 0.01
        return ub * 1.01

    def _construct_bins_from_scalar(self, bins):
        '''Construct ``bins`` uniformly spaced bins in every dimension, spanning the scanned
        data range. Scans the data range first if it is not known yet. Sets
        ``self.binbounds`` and ``self.midpoints``.'''
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub = self._advance_upper_bound(lb, ub)

            boundset = np.linspace(lb, ub, bins + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_int_seq(self, bins):
        '''Construct ``bins[idim]`` uniformly spaced bins in dimension ``idim``, spanning the
        scanned data range. Scans the data range first if it is not known yet. Sets
        ``self.binbounds`` and ``self.midpoints``.'''
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = []
        self.midpoints = []
        for idim in range(self.ndim):
            lb, ub = self.data_range[idim]
            # Advance just beyond the upper bound of the range, so that we catch
            # the maximum in the histogram
            ub = self._advance_upper_bound(lb, ub)

            boundset = np.linspace(lb, ub, bins[idim] + 1)
            midpoints = (boundset[:-1] + boundset[1:]) / 2.0
            self.binbounds.append(boundset)
            self.midpoints.append(midpoints)

    def _construct_bins_from_bound_seqs(self, bins):
        '''Use the explicit boundary sequences in ``bins`` (one per dimension) as bin
        boundaries. Sets ``self.binbounds`` and ``self.midpoints``.

        Raises ``ValueError`` at the first sequence that is not strictly increasing.'''
        self.binbounds = []
        self.midpoints = []
        for edges in map(np.asarray, bins):
            # Any non-positive step means a repeated or decreasing boundary
            if np.any(np.diff(edges) <= 0):
                raise ValueError('boundary set {!r} is not strictly monotonically increasing'.format(edges))
            self.binbounds.append(edges)
            centers = (edges[:-1] + edges[1:]) / 2.0
            self.midpoints.append(centers)

    def construct_histogram(self):
        '''Build the time series of histograms using the bins set up by ``construct_bins()``
        and store it in the ``histograms`` dataset of the output file, one entry per
        iteration. Each histogram in the time series is normalized. One
        ``_remote_bin_iter`` task per iteration is dispatched through the work manager.'''

        self.scan_data_shape()

        iter_start, iter_stop = self.iter_start, self.iter_stop
        iter_count = iter_stop - iter_start

        # First axis is iteration; remaining axes have one entry per bin in each dimension
        hist_shape = (iter_count,) + tuple(len(bounds) - 1 for bounds in self.binbounds)
        histograms_ds = self.output_file.create_dataset(
            'histograms',
            dtype=np.float64,
            shape=hist_shape,
            compression=9 if self.compress_output else None,
        )
        binbounds = [np.require(boundset, self.dset_dtype, 'C') for boundset in self.binbounds]

        indicator = self.progress.indicator
        indicator.new_operation('Constructing histograms', iter_count)

        def iter_tasks():
            # The fifth task argument is 0 for the first iteration processed and 1 for all later ones.
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                initpoint = 1 if iiter > 0 else 0
                task_args = (
                    iiter,
                    n_iter,
                    self.dsspec,
                    self.wt_dsspec,
                    initpoint,
                    binbounds,
                    self.ignore_out_of_range,
                )
                yield (_remote_bin_iter, task_args, {})

        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        for future in self.work_manager.submit_as_completed(iter_tasks(), self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            indicator.progress += 1

            # Results arrive in completion order; iiter says which row this one belongs to
            histograms_ds[iiter] = iter_hist
            del iter_hist, future
Ejemplo n.º 3
0
class WNTopTool(WESTTool):
    prog = 'w_ntop'
    description = '''\
Select walkers from bins . An assignment file mapping walkers to
bins at each timepoint is required (see``w_assign --help`` for further
information on generating this file). By default, high-weight walkers are
selected (hence the name ``w_ntop``: select the N top-weighted walkers from
each bin); however, minimum weight walkers and randomly-selected walkers
may be selected instead.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "ntop.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration][bin]
    *(Integer)* Number of segments in each bin/state in the given iteration.
    This will generally be the same as the number requested with
    ``--n/--count`` but may be smaller if the requested number of walkers
    does not exist.

  ``/seg_ids`` [iteration][bin][segment]
    *(Integer)* Matching segments in each iteration for each bin.
    For an iteration ``n_iter``, only the first ``n_iter`` entries are
    valid. For example, the full list of matching seg_ids in bin 0 in the
    first stored iteration is ``seg_ids[0][0][:n_segs[0]]``.

  ``/weights`` [iteration][bin][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        '''Create the helper components; every setting starts out as ``None``.'''
        super(WNTopTool, self).__init__()

        # Helper components
        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()

        # Files
        self.output_file = None
        self.assignments_filename = self.output_filename = None

        # Selection settings
        self.what = self.timepoint = self.count = None

    def add_args(self, parser):
        '''Register this tool's command-line arguments on ``parser``.

        Order: data reader, iteration range, -a/--assignments, -n/--count,
        -t/--timepoint, the mutually exclusive --highweight / --lowweight / --random
        switches (default ``highweight``), -o/--output, progress indicator.
        '''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        input_options = parser.add_argument_group('input options')
        input_options.add_argument(
            '-a',
            '--assignments',
            default='assign.h5',
            help='''Use assignments from the given ASSIGNMENTS file (default: %(default)s).''',
        )

        selection_options = parser.add_argument_group('selection options')
        selection_options.add_argument(
            '-n',
            '--count',
            type=int,
            default=1,
            help='''Select COUNT walkers from each iteration for each bin (default: %(default)s).''',
        )
        selection_options.add_argument(
            '-t',
            '--timepoint',
            type=int,
            default=-1,
            help='''Base selection on the given TIMEPOINT within each iteration. Default (-1)
                            corresponds to the last timepoint.''',
        )

        # The three selection modes all store a constant into ``select_what``
        selection_modes = (
            ('--highweight', 'highweight', '''Select COUNT highest-weight walkers from each bin.'''),
            ('--lowweight', 'lowweight', '''Select COUNT lowest-weight walkers from each bin.'''),
            ('--random', 'random', '''Select COUNT walkers randomly from each bin.'''),
        )
        mode_group = parser.add_mutually_exclusive_group()
        for flag, mode, mode_help in selection_modes:
            mode_group.add_argument(flag, dest='select_what', action='store_const', const=mode, help=mode_help)
        parser.set_defaults(select_what='highweight')

        output_options = parser.add_argument_group('output options')
        output_options.add_argument(
            '-o',
            '--output',
            default='ntop.h5',
            help='''Write output to OUTPUT (default: %(default)s).''',
        )
        self.progress.add_args(parser)

    def process_args(self, args):
        '''Apply parsed command-line arguments ``args``: let the components process
        them (the iteration range while the data reader is open), then record the
        selection mode, file names, walker count and timepoint.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        # File names
        self.output_filename = args.output
        self.assignments_filename = args.assignments

        # Selection settings
        self.what = args.select_what
        self.count = args.count
        self.timepoint = args.timepoint

    def go(self):
        '''
        Select up to ``self.count`` walkers from every bin in every iteration of the
        selected range and write them to ``self.output_filename``.

        The assignments file and the output file are both closed when this method
        finishes, whether it succeeds or raises (previously neither was closed).
        '''
        self.data_reader.open('r')
        # NOTE(review): h5io.WESTPAH5File is assumed to support the context-manager
        # protocol in the same way as h5py.File -- confirm.
        with h5py.File(self.assignments_filename, mode='r') as assignments_file:
            with h5io.WESTPAH5File(self.output_filename, mode='w') as output_file:
                self._select_walkers(assignments_file, output_file)

    @staticmethod
    def _pick_indices(weights, what, count):
        '''Return indices into ``weights`` of at most ``count`` entries, chosen
        according to the selection mode ``what``.'''
        if what == 'lowweight':
            return np.argsort(weights)[:count]
        if what == 'highweight':
            return np.argsort(weights)[::-1][:count]
        assert what == 'random'
        # Truncate to ``count`` like the other modes. The untruncated permutation
        # used previously did not fit the ``count``-wide output slot whenever a
        # bin held more than ``count`` walkers.
        return np.random.permutation(len(weights))[:count]

    def _select_walkers(self, assignments_file, output_file):
        '''Create the ``n_iter``, ``nsegs``, ``seg_ids`` and ``weights`` datasets in
        ``output_file`` and fill them iteration by iteration from ``assignments_file``
        and the WEST data file.

        NOTE(review): the tool description calls the count dataset ``/n_segs``, but
        the name actually created here is ``nsegs``.
        '''
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint
        what = self.what

        # One more than the 'nbins' attribute of the assignments file.
        # NOTE(review): presumably room for an extra "unassigned" bin index -- confirm.
        nbins = assignments_file.attrs['nbins'] + 1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(
            assignments_file['nsegs'], iter_start, iter_stop)]

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))

        seg_count_ds = output_file.create_dataset('nsegs',
                                                  dtype=np.uint,
                                                  shape=(iter_count, nbins))

        # seg_ids and weights share one shape: a slot of ``count`` entries per (iteration, bin)
        selection_shape = (iter_count, nbins, count)
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=selection_shape,
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize(selection_shape, seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset(
            'weights',
            shape=selection_shape,
            dtype=weight_dtype,
            chunks=h5io.calc_chunksize(selection_shape, weight_dtype),
            shuffle=True,
            compression=9)

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                # Bin assignment of every segment at the requested timepoint
                assignments = np.require(assignments_ds[
                    h5io.get_iteration_entry(assignments_ds, n_iter) +
                    np.index_exp[:, timepoint]],
                                         dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(
                    n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                #for iseg in xrange(nsegs[iiter]):
                #    segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(
                    nsegs[iiter], nbins, assignments)
                for ibin in range(nbins):
                    segs = np.nonzero(segs_by_bin[:, ibin])[0]

                    seg_count_ds[iiter, ibin] = min(len(segs), count)

                    if len(segs):
                        weights = all_weights.take(segs)
                        indices = self._pick_indices(weights, what, count)

                        # Write exactly as many entries as were selected, so the
                        # destination slice always matches the data being stored.
                        nselected = len(indices)
                        matching_segs_ds[iiter, ibin, :nselected] = segs.take(indices)
                        weights_ds[iiter, ibin, :nselected] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
Ejemplo n.º 4
0
class WCrawl(WESTParallelTool):
    prog = 'w_crawl'
    description = '''\
Crawl a weighted ensemble dataset, executing a function for each iteration.
This can be used for postprocessing of trajectories, cleanup of datasets,
or anything else that can be expressed as "do X for iteration N, then do
something with the result". Tasks are parallelized by iteration, and
no guarantees are made about evaluation order.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------

'''

    def __init__(self):
        '''Create the helper components; the crawler and task callable are not set
        until ``process_args`` runs.'''
        super(WCrawl, self).__init__()

        # Helper components, used throughout
        self.progress = ProgressIndicatorComponent()
        reader = WESTDataReader()
        self.data_reader = reader
        self.iter_range = IterRangeSelection(reader)

        # Resolved from the command line in process_args()
        self.crawler = self.task_callable = None

    def add_args(self, parser):
        '''Register this tool's command-line arguments on ``parser``: data reader,
        iteration range, the -c/--crawler-instance option, the positional
        ``task_callable``, and the progress indicator arguments.'''
        for component in (self.data_reader, self.iter_range):
            component.add_args(parser)

        task_options = parser.add_argument_group('task options')
        task_options.add_argument(
            '-c',
            '--crawler-instance',
            help='''Use CRAWLER_INSTANCE (specified as module.instance) as an instance of
                            WESTPACrawler to coordinate the calculation. Required only if initialization,
                            finalization, or task result processing is required.''',
        )
        task_options.add_argument(
            'task_callable',
            help='''Run TASK_CALLABLE (specified as module.function) on each iteration.
                            Required.''',
        )

        self.progress.add_args(parser)

    def process_args(self, args):
        '''Apply parsed command-line arguments ``args``.

        The task callable, and the crawler instance when one is named, are looked up
        with ``get_object`` using ``['.']`` as the path. When no crawler instance is
        named, a plain ``WESTPACrawler()`` is used.'''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        self.task_callable = get_object(args.task_callable, path=['.'])

        crawler_name = args.crawler_instance
        if crawler_name is None:
            self.crawler = WESTPACrawler()
        else:
            self.crawler = get_object(crawler_name, path=['.'])

    def go(self):
        '''Run the task callable once per iteration in the selected range.

        The crawler is initialized with the range first. One ``_remote_task`` per
        iteration is then dispatched through the work manager, and each result is
        handed to ``crawler.process_iter_result`` as it completes. The crawler is
        finalized afterwards, even if dispatching or result processing raises.'''
        first_iter = self.iter_range.iter_start
        last_iter = self.iter_range.iter_stop
        n_iters = last_iter - first_iter

        self.data_reader.open('r')
        pi = self.progress.indicator

        def iter_tasks():
            for n_iter in range(first_iter, last_iter):
                yield (_remote_task, (n_iter, self.task_callable), {})

        with pi:
            pi.operation = 'Initializing'
            self.crawler.initialize(first_iter, last_iter)

            try:
                pi.new_operation('Dispatching tasks & processing results', n_iters)
                completed = self.work_manager.submit_as_completed(iter_tasks(), self.max_queue_len)
                for future in completed:
                    n_iter, result = future.get_result(discard=True)
                    if self.crawler is not None:
                        self.crawler.process_iter_result(n_iter, result)
                    pi.progress += 1
            finally:
                pi.new_operation('Finalizing')
                self.crawler.finalize()
Ejemplo n.º 5
0
class WSelectTool(WESTParallelTool):
    '''Command-line tool that stores the segments matched by a predicate.

    For every iteration in the selected range, a user-supplied predicate is
    evaluated through the work manager; matching segment IDs and their
    weights are written to an HDF5 output file. Optionally the ancestors of
    matching segments are added as well.
    '''

    prog = 'w_select'
    description = '''\
Select dynamics segments matching various criteria. This requires a
user-provided predicate function. By default, only matching segments are
stored. If the -a/--include-ancestors option is given, then matching segments
and their ancestors will be stored.


-----------------------------------------------------------------------------
Predicate function
-----------------------------------------------------------------------------

Segments are selected based on a predicate function, which must be callable
as ``predicate(n_iter, iter_group)`` and return a collection of segment IDs
matching the predicate in that iteration.

The predicate may be inverted by specifying the -v/--invert command-line
argument.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file (-o/--output, by default "select.h5") contains the following
datasets:

  ``/n_iter`` [iteration]
    *(Integer)* Iteration numbers for each entry in other datasets.

  ``/n_segs`` [iteration]
    *(Integer)* Number of segment IDs matching the predicate (or inverted
    predicate, if -v/--invert is specified) in the given iteration.

  ``/seg_ids`` [iteration][segment]
    *(Integer)* Matching segments in each iteration. For an iteration
    ``n_iter``, only the first ``n_segs`` entries are valid. For example,
    the full list of matching seg_ids in the first stored iteration is
    ``seg_ids[0][:n_segs[0]]``.

  ``/weights`` [iteration][segment]
    *(Floating-point)* Weights for each matching segment in ``/seg_ids``.


-----------------------------------------------------------------------------
Command-line arguments
-----------------------------------------------------------------------------
'''

    def __init__(self):
        '''Create the tool's components and reset all option attributes.'''
        super().__init__()

        self.data_reader = WESTDataReader()
        self.iter_range = IterRangeSelection()
        self.progress = ProgressIndicatorComponent()
        self.output_file = None
        self.output_filename = None
        self.predicate = None
        self.invert = False
        self.include_ancestors = False

    def add_args(self, parser):
        '''Register data-reader, iteration-range, selection, output and
        progress options on ``parser``.'''
        self.data_reader.add_args(parser)
        self.iter_range.add_args(parser)

        sgroup = parser.add_argument_group('selection options')
        sgroup.add_argument(
            '-p',
            '--predicate-function',
            metavar='MODULE.FUNCTION',
            help=
            '''Use the given predicate function to match segments. This function
                             should take an iteration number and the HDF5 group corresponding to that
                             iteration and return a sequence of seg_ids matching the predicate, as in
                             ``match_predicate(n_iter, iter_group)``.''',
        )
        sgroup.add_argument('-v',
                            '--invert',
                            dest='invert',
                            action='store_true',
                            help='''Invert the match predicate.''')
        sgroup.add_argument(
            '-a',
            '--include-ancestors',
            action='store_true',
            help='''Include ancestors of matched segments in output.''')

        ogroup = parser.add_argument_group('output options')
        ogroup.add_argument(
            '-o',
            '--output',
            default='select.h5',
            help='''Write output to OUTPUT (default: %(default)s).''')
        self.progress.add_args(parser)

    def process_args(self, args):
        '''Process parsed arguments and resolve the predicate function.

        Raises:
            TypeError: if the object named by -p/--predicate-function is not
                callable.
        '''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.iter_range.process_args(args)

        # NOTE(review): -p/--predicate-function has no default and is not
        # marked required, so ``args.predicate_function`` may be None here;
        # how get_object() reacts to None is not visible from this file.
        predicate = get_object(args.predicate_function, path=['.'])
        if not callable(predicate):
            raise TypeError(
                'predicate object {!r} is not callable'.format(predicate))
        self.predicate = predicate
        self.invert = bool(args.invert)
        self.include_ancestors = bool(args.include_ancestors)
        self.output_filename = args.output

    def go(self):
        '''Find matching segments and write them to the output file.

        Creates the ``n_iter``, ``n_segs``, ``seg_ids`` and ``weights``
        datasets, fills them from the predicate results and, if requested,
        extends them with the ancestors of matching segments. The output
        file is closed when the method finishes, whether or not an error
        occurred.
        '''
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        try:
            pi = self.progress.indicator

            iter_start = self.iter_range.iter_start
            iter_stop = self.iter_range.iter_stop
            iter_count = iter_stop - iter_start

            output_file.create_dataset('n_iter',
                                       dtype=n_iter_dtype,
                                       data=list(range(iter_start, iter_stop)))
            seg_count_ds = output_file.create_dataset('n_segs',
                                                      dtype=np.uint,
                                                      shape=(iter_count, ))
            # Both per-segment datasets start with zero columns and are
            # widened on demand to the largest match count seen so far.
            matching_segs_ds = output_file.create_dataset(
                'seg_ids',
                shape=(iter_count, 0),
                maxshape=(iter_count, None),
                dtype=seg_id_dtype,
                chunks=h5io.calc_chunksize((iter_count, 1000000),
                                           seg_id_dtype),
                shuffle=True,
                compression=9,
            )
            weights_ds = output_file.create_dataset(
                'weights',
                shape=(iter_count, 0),
                maxshape=(iter_count, None),
                dtype=weight_dtype,
                chunks=h5io.calc_chunksize((iter_count, 1000000),
                                           weight_dtype),
                shuffle=True,
                compression=9,
            )

            with pi:
                self._store_matching_segments(pi, iter_start, iter_stop,
                                              seg_count_ds, matching_segs_ds,
                                              weights_ds)
                if self.include_ancestors:
                    self._store_ancestors(pi, iter_start, iter_stop,
                                          seg_count_ds, matching_segs_ds,
                                          weights_ds)
        finally:
            # Close explicitly so the datasets are flushed and the handle is
            # released instead of lingering until garbage collection.
            output_file.close()

    def _store_matching_segments(self, pi, iter_start, iter_stop,
                                 seg_count_ds, matching_segs_ds, weights_ds):
        '''Evaluate the predicate for each iteration and store the matches.'''
        iter_count = iter_stop - iter_start
        pi.new_operation('Finding matching segments', extent=iter_count)

        tasks = ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate,
                   self.invert), {})
                 for n_iter in range(iter_start, iter_stop))

        current_seg_count = 0
        for future in self.work_manager.submit_as_completed(
                tasks, self.max_queue_len):
            n_iter, matching_ids = future.get_result()
            n_matches = len(matching_ids)

            if n_matches:
                # Weights are gathered in sorted seg_id order, so store the
                # IDs in that same order to keep both datasets aligned.
                matching_ids = sorted(matching_ids)

                if n_matches > current_seg_count:
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches

                iiter = n_iter - iter_start
                seg_count_ds[iiter] = n_matches
                matching_segs_ds[iiter, :n_matches] = matching_ids
                seg_index = self.data_reader.get_iter_group(n_iter)[
                    'seg_index']
                weights_ds[iiter, :n_matches] = seg_index['weight'][
                    matching_ids]
            del matching_ids
            pi.progress += 1

    def _store_ancestors(self, pi, iter_start, iter_stop, seg_count_ds,
                         matching_segs_ds, weights_ds):
        '''Add the ancestors of stored segments, walking iterations backwards.'''
        iter_count = iter_stop - iter_start
        pi.new_operation('Tracing ancestors of matching segments',
                         extent=iter_count)

        # Parent IDs collected from the iteration processed just before
        # (i.e. the next-later iteration).
        from_previous = set()
        current_seg_count = matching_segs_ds.shape[1]
        for n_iter in range(iter_stop - 1, iter_start - 1, -1):
            iiter = n_iter - iter_start
            n_stored = seg_count_ds[iiter]
            matching_ids = set(from_previous)
            if n_stored:
                matching_ids.update(matching_segs_ds[iiter, :n_stored])
            from_previous.clear()

            n_matches = len(matching_ids)
            if n_matches > current_seg_count:
                matching_segs_ds.resize((iter_count, n_matches))
                weights_ds.resize((iter_count, n_matches))
                current_seg_count = n_matches

            if n_matches > 0:
                seg_count_ds[iiter] = n_matches
                matching_ids = sorted(matching_ids)
                matching_segs_ds[iiter, :n_matches] = matching_ids
                seg_index = self.data_reader.get_iter_group(n_iter)[
                    'seg_index']
                weights_ds[iiter, :n_matches] = seg_index['weight'][
                    matching_ids]
                parent_ids = seg_index['parent_id'][matching_ids]
                from_previous.update(
                    parent_id for parent_id in parent_ids
                    if parent_id >= 0)  # filter initial states
                del parent_ids
            del matching_ids
            pi.progress += 1
Ejemplo n.º 6
0
class CommonPloterrs(WESTSubcommand):
    '''Common plot-option handling and plotting for subcommands.

    Registers axis scale/range/label/title options, decides between the
    'text' and 'matplotlib' interfaces, and plots an expected value with its
    confidence bounds against iteration number.
    '''

    def __init__(self, parent):
        '''Create the progress component and reset all plot settings.'''
        super(CommonPloterrs, self).__init__(parent)

        self.progress = ProgressIndicatorComponent()

        self.xscale = None
        self.yscale = None
        self.xrange = None
        self.yrange = None
        self.xlabel = None
        self.ylabel = None
        self.title = None
        # Set to 'text' or 'matplotlib' by process_args().
        self.interface = None

        self.plot_options_group = None

    def add_args(self, parser):
        '''Register progress and plot options on ``parser``.'''
        self.progress.add_args(parser)

        pogroup = self.plot_options_group = parser.add_argument_group(
            'plot options')
        pogroup.add_argument(
            '--xscale',
            choices=['linear', 'log', 'symlog'],
            default='linear',
            help='''Use "linear", "log", or "symlog" scaling for the x axis.
                             (Default: %(default)s).''')
        pogroup.add_argument(
            '--yscale',
            choices=['linear', 'log', 'symlog'],
            default='linear',
            help='''Use "linear", "log", or "symlog" scaling for the y axis.
                             (Default: %(default)s).''')
        pogroup.add_argument(
            '--xrange',
            help=
            '''Restrict X range to XRANGE, which must be formatted as "xmin,xmax".
                             (Default: determined by input data.)''')
        pogroup.add_argument(
            '--yrange',
            help=
            '''Restrict Y range to YRANGE, which must be formatted as "ymin,ymax".
                             (Default: determined by input data.)''')
        pogroup.add_argument(
            '--xlabel',
            help='''Use XLABEL for the x-axis label. (Default: varies.)''')
        pogroup.add_argument(
            '--ylabel',
            help='''Use YLABEL for the y-axis label. (Default: varies.)''')
        pogroup.add_argument(
            '--title',
            help='''Use TITLE for the plot title. (Default: varies.)''')
        pogroup.add_argument('--terminal',
                             '-t',
                             dest='plotting',
                             action='store_true',
                             help='''Plot output in terminal.''')

    def process_args(self, args):
        '''Store plot settings from ``args`` and choose the plot interface.

        The 'text' interface is used when --terminal is given or when the
        DISPLAY environment variable is unset; otherwise 'matplotlib'.

        Raises:
            ValueError: if --xrange or --yrange is malformed.
        '''
        self.progress.process_args(args)

        if args.xrange:
            self.xrange = self.parse_range(args.xrange)

        if args.yrange:
            self.yrange = self.parse_range(args.yrange)

        self.xscale = args.xscale
        self.yscale = args.yscale
        self.xlabel = args.xlabel or 'Iteration'
        self.ylabel = args.ylabel
        self.title = args.title
        if args.plotting or os.environ.get('DISPLAY') is None:
            self.interface = 'text'
        else:
            # Imported only to fail early if matplotlib is unusable; the
            # name is local to this method, so do_plot() imports it itself.
            from matplotlib import pyplot  # noqa: F401
            self.interface = 'matplotlib'

    def parse_range(self, rangespec):
        '''Parse a "min,max" string into a ``(float, float)`` tuple.

        Raises:
            ValueError: if ``rangespec`` is not two comma-separated numbers.
        '''
        try:
            (lbt, ubt) = rangespec.split(',')
            return float(lbt), float(ubt)
        except (ValueError, TypeError) as e:
            raise ValueError('invalid range specification {!r}: {!s}'.format(
                rangespec, e)) from e

    def do_plot(self,
                data,
                output_filename,
                title=None,
                x_range=None,
                y_range=None,
                x_label=None,
                y_label=None):
        '''Plot expected values with confidence bounds and save the figure.

        ``data`` is indexed by the fields 'iter_stop', 'expected',
        'ci_lbound' and 'ci_ubound'; the x values are ``iter_stop - 1``.
        Nothing is done when ``output_filename`` is empty. Any of the
        keyword arguments left falsy falls back to the corresponding
        setting stored by process_args().
        '''
        if not output_filename:
            return

        # process_args() only binds pyplot locally (and only when a display
        # is available), so import it here rather than relying on a global.
        from matplotlib import pyplot

        title = title or self.title
        x_range = x_range or self.xrange
        y_range = y_range or self.yrange
        x_label = x_label or self.xlabel
        y_label = y_label or self.ylabel

        iters = data['iter_stop'] - 1

        # NOTE(review): the figure is left open after saving; confirm no
        # caller relies on that before adding pyplot.close().
        pyplot.figure()
        pyplot.plot(iters, data['expected'], color='black')
        pyplot.plot(iters, data['ci_lbound'], color='gray')
        pyplot.plot(iters, data['ci_ubound'], color='gray')

        pyplot.gca().set_xscale(self.xscale)
        pyplot.gca().set_yscale(self.yscale)

        if title:
            pyplot.title(title)

        if x_range is not None:
            pyplot.xlim(x_range)

        if y_range is not None:
            pyplot.ylim(y_range)

        if x_label:
            pyplot.xlabel(x_label)

        if y_label:
            pyplot.ylabel(y_label)

        pyplot.savefig(output_filename)
Ejemplo n.º 7
0
class WIPI(WESTParallelTool):
    '''
        Welcome to w_ipa (WESTPA Interactive Python Analysis)!
        From here, you can run traces, look at weights, progress coordinates, etc.
        This is considered a 'stateful' tool; that is, the data you are pulling is always pulled
        from the current analysis scheme and iteration.
        By default, the first analysis scheme in west.cfg is used, and you are set at iteration 1.

        ALL PROPERTIES ARE ACCESSED VIA w or west
        To see the current iteration, try:

            w.iteration
            OR
            west.iteration

        to set it, simply plug in a new value.

            w.iteration = 100

        To change/list the current analysis schemes:

            w.list_schemes
            w.scheme = OUTPUT FROM w.list_schemes

        To see the states and bins defined in the current analysis scheme:

            w.states
            w.bin_labels

        All information about the current iteration is available in an object called 'current':

            w.current
            walkers, summary, states, seg_id, weights, parents, kinavg, pcoord, bins, populations, and auxdata, if it exists.

        In addition, the function w.trace(seg_id) will run a trace over a seg_id in the current iteration and return a dictionary
        containing all pertinent information about that seg_id's history.  It's best to store this, as the trace can be expensive.

        Run help on any function or property for more information!

        Happy analyzing!

    '''
    def __init__(self):
        '''Create the tool's components and set its initial state.

        The parallel work manager is made the default work manager, the
        current iteration starts at 1 and no analysis scheme is selected.
        '''
        super().__init__()
        self.data_reader = WESTDataReader()
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager
        self.progress = ProgressIndicatorComponent()

        self._iter = 1
        self.config_required = True
        self.version = "1.0B"
        # Default plotting interface; process_args() switches it to 'text'
        # when terminal plotting is requested.
        self.interface = 'matplotlib'
        self._scheme = None
        # A stray ``global iteration`` declaration used to sit here. Nothing
        # in this method assigns ``iteration``, so it had no effect.

    def add_args(self, parser):
        '''Register progress, data-reader and runtime options on ``parser``.

        Also sets the parser-level default ``compression=True``.
        '''
        import argparse

        self.progress.add_args(parser)
        self.data_reader.add_args(parser)

        # (long option, short option, dest, help) for each boolean switch,
        # in the order they should appear in the help output.
        switches = (
            ('--analysis-only', '-ao', 'analysis_mode',
             '''Use this flag to run the analysis and return to the terminal.'''),
            ('--reanalyze', '-ra', 'reanalyze',
             '''Use this flag to delete the existing files and reanalyze.'''),
            ('--ignore-hash', '-ih', 'ignore_hash',
             '''Ignore hash and don't regenerate files.'''),
            ('--debug', '-d', 'debug_mode',
             '''Debug output largely intended for development.'''),
            ('--terminal', '-t', 'plotting',
             '''Plot output in terminal.'''),
        )

        rgroup = parser.add_argument_group('runtime options')
        for long_opt, short_opt, dest, help_text in switches:
            rgroup.add_argument(long_opt,
                                short_opt,
                                dest=dest,
                                action='store_true',
                                help=help_text)

        # Hidden option: accepted and stored as ``extra`` but kept out of
        # the help output.
        rgroup.add_argument('--f',
                            '-f',
                            dest='extra',
                            default='blah',
                            help=argparse.SUPPRESS)

        parser.set_defaults(compression=True)

    def process_args(self, args):
        '''Process parsed arguments and load the analysis configuration.

        Records the number of completed iterations (current iteration minus
        one), reads the ``west -> analysis`` section of the configuration
        and assigns ``self.scheme`` for every scheme whose ``enabled`` value
        is ``True`` or ``None``, in configuration order.
        '''
        self.progress.process_args(args)
        self.data_reader.process_args(args)
        with self.data_reader:
            self.niters = self.data_reader.current_iteration - 1

        self.__config = westpa.rc.config
        self.__settings = self.__config['west']['analysis']
        # Each qualifying scheme is assigned in turn; there is no early
        # exit, so later assignments follow earlier ones.
        for scheme in self.__settings['analysis_schemes']:
            enabled = self.__settings['analysis_schemes'][scheme]['enabled']
            if enabled is None or enabled is True:
                self.scheme = scheme

        self.data_args = args
        for option in ('analysis_mode', 'reanalyze', 'ignore_hash',
                       'debug_mode'):
            setattr(self, option, getattr(args, option))
        if args.plotting:
            self.interface = 'text'

    def hash_args(self, args, extra=None, path=None):
        '''Return an MD5 hex digest identifying ``args`` together with ``extra``.

        The digest is compared with the one stamped on an analysis file to
        decide whether the file needs regenerating.

        Args:
            args: sequence of command-line argument strings.
            extra: additional value appended to the hashed list as a single
                element (for example a list of values).
            path: if given, every argument containing ``path`` has it (and
                any remaining ``/``) removed, so the digest does not depend
                on where the files live.

        Any ``--disable-averages`` argument is ignored.
        '''
        # Build a new list instead of editing one while iterating over it:
        # removing an element mid-iteration made the loop skip the argument
        # that followed ``--disable-averages``.
        cargs = []
        for arg in args:
            if arg == '--disable-averages':
                continue
            # ``path`` defaults to None, which cannot be used with ``in``.
            if path is not None and path in arg:
                arg = arg.replace(path, '').replace('/', '')
            cargs.append(arg)
        to_hash = cargs + [extra]

        if self.debug_mode:
            # ``!s`` converts first, so None and bytes values can be padded;
            # they do not accept the ``<20`` format spec directly.
            for iarg, arg in enumerate(to_hash):
                if not isinstance(arg, list):
                    print('arg {num:02d} -- {arg!s:<20}'.format(num=iarg,
                                                                arg=arg))
                else:
                    for il, item in enumerate(arg):
                        print('arg {num:02d} -- {arg!s:<20}'.format(
                            num=il + iarg, arg=item))

        # Hash a base64 encoding of the list's string form.
        return hashlib.md5(base64.b64encode(str(to_hash).encode())).hexdigest()

    def stamp_hash(self, h5file_name, new_hash):
        '''Store ``new_hash`` on the file and return it reopened read-only.

        The file is opened in ``'r+'`` mode, its ``arg_hash`` attribute is
        set, and it is closed again (even if writing the attribute fails)
        before being reopened in ``'r'`` mode.
        '''
        h5file = h5io.WESTPAH5File(h5file_name, 'r+')
        try:
            h5file.attrs['arg_hash'] = new_hash
        finally:
            # Never leave the writable handle open if stamping fails.
            h5file.close()
        return h5io.WESTPAH5File(h5file_name, 'r')

    def analysis_structure(self):
        '''
        Run automatically on startup.  Parses through the configuration file, and loads up all the data files from the different
        analysis schematics.  If they don't exist, it creates them automatically by hooking in to existing analysis routines
        and going from there.

        It does this by calling in the make_parser_and_process function for w_{assign,reweight,direct} using a custom built list
        of args.  The user can specify everything in the configuration file that would have been specified on the command line.

        For instance, were one to call w_direct as follows:

            w_direct --evolution cumulative --step-iter 1 --disable-correl

        the west.cfg would look as follows:

        west:
          analysis:
            w_direct:
              evolution: cumulative
              step_iter: 1
              extra: ['disable-correl']

        Alternatively, if one wishes to use the same options for both w_direct and w_reweight, the key 'w_direct' can be replaced
        with 'kinetics'.
        '''
        # Make sure everything exists.
        try:
            os.mkdir(self.__settings['directory'])
        except Exception:
            pass
        # Now, check to see whether they exist, and then load them.
        self.__analysis_schemes__ = {}
        # We really need to implement some sort of default behavior if an analysis scheme isn't set.
        # Right now, we just crash.  That isn't really graceful.
        for scheme in self.__settings['analysis_schemes']:
            if self.__settings['analysis_schemes'][scheme]['enabled']:
                if self.work_manager.running is False:
                    self.work_manager.startup()
                path = os.path.join(os.getcwd(), self.__settings['directory'],
                                    scheme)
                # if 'postanalysis' in self.__settings['analysis_schemes'][scheme] and 'postanalysis' in self.__settings['postanalysis']:
                # Should clean this up.  But it uses the default global setting if a by-scheme one isn't set.
                if 'postanalysis' in self.__settings:
                    if 'postanalysis' in self.__settings['analysis_schemes'][
                            scheme]:
                        pass
                    else:
                        self.__settings['analysis_schemes'][scheme][
                            'postanalysis'] = self.__settings['postanalysis']
                try:
                    os.mkdir(path)
                except Exception:
                    pass
                self.__analysis_schemes__[scheme] = {}
                try:
                    if (self.__settings['analysis_schemes'][scheme]
                        ['postanalysis'] is True
                            or self.__settings['postanalysis'] is True):
                        analysis_files = ['assign', 'direct', 'reweight']
                    else:
                        analysis_files = ['assign', 'direct']
                except Exception:
                    analysis_files = ['assign', 'direct']
                    self.__settings['analysis_schemes'][scheme][
                        'postanalysis'] = False
                reanalyze_kinetics = False
                assign_hash = None
                for name in analysis_files:
                    arg_hash = None
                    if self.reanalyze is True:
                        reanalyze_kinetics = True
                        try:
                            os.remove(os.path.join(path, '{}.h5'.format(name)))
                        except Exception:
                            pass
                    else:
                        try:
                            # Try to load the hash.  If we fail to load the hash or the file, we need to reload.
                            # if self.reanalyze == True:
                            #    raise ValueError('Reanalyze set to true.')
                            self.__analysis_schemes__[scheme][
                                name] = h5io.WESTPAH5File(
                                    os.path.join(path, '{}.h5'.format(name)),
                                    'r')
                            arg_hash = self.__analysis_schemes__[scheme][
                                name].attrs['arg_hash']
                            if name == 'assign':
                                assign_hash = arg_hash
                        except Exception:
                            pass
                            # We shouldn't rely on this.
                            # self.reanalyze = True
                    if True:
                        if name == 'assign':
                            assign = w_assign.WAssign()

                            w_assign_config = {
                                'output': os.path.join(path,
                                                       '{}.h5'.format(name))
                            }
                            try:
                                w_assign_config.update(
                                    self.__settings['w_assign'])
                            except Exception:
                                pass
                            try:
                                w_assign_config.update(
                                    self.__settings['analysis_schemes'][scheme]
                                    ['w_assign'])
                            except Exception:
                                pass
                            args = []
                            for key, value in w_assign_config.items():
                                if key != 'extra':
                                    args.append(
                                        str('--') + str(key).replace('_', '-'))
                                    args.append(str(value))
                            # This is for stuff like disabling correlation analysis, etc.
                            if 'extra' in list(w_assign_config.keys()):
                                # We're sorting to ensure that the order doesn't matter.
                                for value in sorted(w_assign_config['extra']):
                                    args.append(
                                        str('--') +
                                        str(value).replace('_', '-'))
                            # We're just calling the built in function.
                            # This is a lot cleaner than what we had in before, and far more workable.
                            args.append('--config-from-file')
                            args.append('--scheme-name')
                            args.append('{}'.format(scheme))
                            # Why are we calling this if we're not sure we're remaking the file?
                            # We need to load up the bin mapper and states and see if they're the same.
                            assign.make_parser_and_process(args=args)
                            import pickle

                            # new_hash = self.hash_args(args=args, path=path, extra=[self.niters, pickle.dumps(assign.binning.mapper), assign.states])
                            # We need to encode it properly to ensure that some OS specific thing doesn't kill us.  Same goes for the args, ultimately.
                            # Mostly, we just need to ensure that we're consistent.
                            new_hash = self.hash_args(
                                args=args,
                                path=path,
                                extra=[
                                    int(self.niters),
                                    codecs.encode(
                                        pickle.dumps(assign.binning.mapper),
                                        "base64"),
                                    base64.b64encode(
                                        str(assign.states).encode()),
                                ],
                            )
                            # Let's check the hash.  If the hash is the same, we don't need to reload.
                            if self.debug_mode is True:
                                print('{:<10}: old hash, new hash -- {}, {}'.
                                      format(name, arg_hash, new_hash))
                            if self.ignore_hash is False and (
                                    arg_hash != new_hash
                                    or self.reanalyze is True):
                                # If the hashes are different, or we need to reanalyze, delete the file.
                                try:
                                    os.remove(
                                        os.path.join(path,
                                                     '{}.h5'.format(name)))
                                except Exception:
                                    pass
                                print('Reanalyzing file {}.h5 for scheme {}.'.
                                      format(name, scheme))
                                # reanalyze_kinetics = True
                                # We want to use the work manager we have here.  Otherwise, just let the tool sort out what it needs, honestly.
                                assign.work_manager = self.work_manager

                                assign.go()
                                assign.data_reader.close()

                                # Stamp w/ hash, then reload as read only.
                                self.__analysis_schemes__[scheme][
                                    name] = self.stamp_hash(
                                        os.path.join(path,
                                                     '{}.h5'.format(name)),
                                        new_hash)
                            del assign
                            # Update the assignment hash.
                            assign_hash = new_hash

                        # Since these are all contained within one tool, now, we want it to just... load everything.
                        if name == 'direct' or name == 'reweight':
                            if name == 'direct':
                                analysis = w_direct.WDirect()
                            if name == 'reweight':
                                analysis = w_reweight.WReweight()

                            analysis_config = {
                                'assignments':
                                os.path.join(path, '{}.h5'.format('assign')),
                                'output':
                                os.path.join(path, '{}.h5'.format(name)),
                                'kinetics':
                                os.path.join(path, '{}.h5'.format(name)),
                            }

                            # Pull from general analysis options, then general SPECIFIC options for each analysis,
                            # then general options for that analysis scheme, then specific options for the analysis type in the scheme.

                            try:
                                analysis_config.update(
                                    self.__settings['kinetics'])
                            except Exception:
                                pass
                            try:
                                analysis_config.update(
                                    self.__settings['w_{}'.format(name)])
                            except Exception:
                                pass
                            try:
                                analysis_config.update(
                                    self.__settings['analysis_schemes'][scheme]
                                    ['kinetics'])
                            except Exception:
                                pass
                            try:
                                analysis_config.update(
                                    self.__settings['analysis_schemes'][scheme]
                                    ['w_{}'.format(name)])
                            except Exception:
                                pass

                            # We're pulling in a default set of arguments, then updating them with arguments from the west.cfg file, if appropriate, after setting the appropriate command
                            # Then, we call the magic function 'make_parser_and_process' with the arguments we've pulled in.
                            # The tool has no real idea it's being called outside of its actual function, and we're good to go.
                            args = ['all']
                            for key, value in analysis_config.items():
                                if key != 'extra':
                                    args.append(
                                        str('--') + str(key).replace('_', '-'))
                                    args.append(str(value))
                            # This is for stuff like disabling correlation analysis, etc.
                            if 'extra' in list(analysis_config.keys()):
                                for value in sorted(analysis_config['extra']):
                                    args.append(
                                        str('--') +
                                        str(value).replace('_', '-'))
                            # We want to not display the averages, so...
                            args.append('--disable-averages')
                            new_hash = self.hash_args(
                                args=args,
                                path=path,
                                extra=[int(self.niters), assign_hash])
                            # if arg_hash != new_hash or self.reanalyze == True or reanalyze_kinetics == True:
                            if self.debug_mode is True:
                                print('{:<10}: old hash, new hash -- {}, {}'.
                                      format(name, arg_hash, new_hash))
                            if self.ignore_hash is False and (
                                    arg_hash != new_hash
                                    or reanalyze_kinetics is True):
                                try:
                                    os.remove(
                                        os.path.join(path,
                                                     '{}.h5'.format(name)))
                                except Exception:
                                    pass
                                print('Reanalyzing file {}.h5 for scheme {}.'.
                                      format(name, scheme))
                                analysis.make_parser_and_process(args=args)
                                # We want to hook into the existing work manager.
                                analysis.work_manager = self.work_manager

                                analysis.go()

                                # Open!
                                self.__analysis_schemes__[scheme][
                                    name] = self.stamp_hash(
                                        os.path.join(path,
                                                     '{}.h5'.format(name)),
                                        new_hash)
                            del analysis

        # Make sure this doesn't get too far out, here.  We need to keep it alive as long as we're actually analyzing things.
        # self.work_manager.shutdown()
        print("")
        print("Complete!")

    @property
    def assign(self):
        """Return the ``'assign'`` entry stored for the currently selected scheme."""
        scheme_key = str(self.scheme)
        return self.__analysis_schemes__[scheme_key]['assign']

    @property
    def direct(self):
        """
        Return the ``'direct'`` entry stored for the currently selected scheme.

        NOTE(review): the previous docstring attributed this data to
        w_kinavg.py; confirm which tool actually produces it.
        """
        scheme_key = str(self.scheme)
        return self.__analysis_schemes__[scheme_key]['direct']

    @property
    def state_labels(self):
        """
        Print every state label with its index, followed by an 'Unknown' entry.

        The labels are read from ``self.assign['state_labels']``.  The trailing
        'Unknown' line is numbered one past the last label, which is 0 when
        there are no labels at all.  Nothing is returned.
        """
        print("State labels and definitions!")
        # Count from 1 so that, after the loop, n_states is the number of labels
        # seen; it stays 0 for an empty collection instead of being unbound.
        n_states = 0
        for n_states, state in enumerate(self.assign['state_labels'], start=1):
            print('{}: {}'.format(n_states - 1, state))
        print('{}: {}'.format(n_states, 'Unknown'))

    @property
    def bin_labels(self):
        """Print every entry of ``self.assign['bin_labels']`` with its index."""
        print("Bin definitions! ")
        numbered = enumerate(self.assign['bin_labels'])
        for index, label in numbered:
            line = '{}: {}'.format(index, label)
            print(line)

    @property
    def west(self):
        """Return the ``we_h5file`` attribute of the data reader's data manager."""
        manager = self.data_reader.data_manager
        return manager.we_h5file

    @property
    def reweight(self):
        """
        Return the ``'reweight'`` entry of the current scheme when its
        ``postanalysis`` setting is exactly ``True``.

        Otherwise return a plain dict that maps each expected dataset name to
        an explanatory message.
        """
        scheme_key = str(self.scheme)
        scheme_settings = self.__settings['analysis_schemes'][scheme_key]
        if scheme_settings['postanalysis'] is True:
            return self.__analysis_schemes__[scheme_key]['reweight']

        placeholder = "This sort of analysis has not been enabled."
        dataset_names = (
            'bin_prob_evolution',
            'color_prob_evolution',
            'conditional_flux_evolution',
            'rate_evolution',
            'state_labels',
            'state_prob_evolution',
            'bin_populations',
            'iterations',
        )
        return {name: placeholder for name in dataset_names}

    @property
    def scheme(self):
        '''
        Returns and sets what scheme is currently in use.
        To see what schemes are available, run:

            w.list_schemes

        '''
        # The scheme wrapper is built lazily, the first time it is asked for.
        if self._scheme is None:
            wrapper_kwargs = dict(scheme=self.__analysis_schemes__,
                                  name=self._schemename,
                                  parent=self,
                                  settings=self.__settings)
            self._scheme = WIPIScheme(**wrapper_kwargs)

        # Hand the wrapper back with its name cleared on every access.
        wrapper = self._scheme
        wrapper.name = None
        return wrapper

    @scheme.setter
    def scheme(self, scheme):
        """
        Select a scheme by name, or by its position among the configured schemes.

        The cached future/current/past objects are cleared first.  The scheme
        name is only stored when its ``enabled`` setting is ``True`` or
        ``None``; otherwise a message is printed and the selection is unchanged.
        """
        self._future = self._current = self._past = None

        configured = self.__settings['analysis_schemes']
        if scheme not in configured:
            # Not a known name: treat the value as an index into the schemes.
            for position, candidate in enumerate(configured):
                if position == scheme:
                    scheme = candidate

        enabled = configured[scheme]['enabled']
        if enabled is True or enabled is None:
            self._schemename = scheme
            return
        print("Scheme cannot be changed to scheme: {}; it is not enabled!".
              format(scheme))

    @property
    def list_schemes(self):
        '''
        Lists what schemes are configured in west.cfg file.
        Schemes should be structured as follows, in west.cfg:

        west:
          system:
            analysis:
              directory: analysis
              analysis_schemes:
                scheme.1:
                  enabled: True
                  states:
                    - label: unbound
                      coords: [[7.0]]
                    - label: bound
                      coords: [[2.7]]
                  bins:
                    - type: RectilinearBinMapper
                      boundaries: [[0.0, 2.80, 7, 10000]]
        '''
        # Go through the ``scheme`` property rather than ``self._scheme``: the
        # property builds the scheme wrapper on first use, whereas the raw
        # attribute is still None until something has accessed the property.
        self.scheme.list_schemes

    @property
    def iteration(self):
        '''
        Returns/sets the current iteration.
        '''
        current_iter = self._iter
        return current_iter

    @iteration.setter
    def iteration(self, value):
        """
        Set the current iteration.

        Values at or below zero are raised to 1, and the result is then capped
        at ``self.niters``; each adjustment is reported on stdout.
        """
        print("Setting iteration to iter {}.".format(value))
        if value <= 0:
            print("Iteration must begin at 1.")
            value = 1
        upper = self.niters
        if value > upper:
            print("Cannot go beyond {} iterations!".format(upper))
            print("Setting to {}".format(upper))
            value = upper
        # Storing the iteration also drops the cached future object so it is
        # rebuilt on next access.
        self._iter = value
        self._future = None
        return self._iter

    @property
    def current(self):
        '''
        The current iteration.  See help for __get_data_for_iteration__
        '''
        active = self.scheme
        selected = active[active.scheme]
        return selected.current

    @property
    def past(self):
        '''
        The previous iteration.  See help for __get_data_for_iteration__
        '''
        active = self.scheme
        selected = active[active.scheme]
        return selected.past

    def trace(self, seg_id):
        '''
        Runs a trace on a seg_id within the current iteration, all the way back to the beginning,
        returning a dataset containing all interesting information:

            seg_id, pcoord, states, bins, weights, iteration, auxdata (optional)

        sorted in chronological order.

        Call with a seg_id.  If seg_id is not smaller than the walker count of the
        current iteration, a message is printed and 1 is returned instead.
        '''
        if seg_id >= self.current.walkers:
            print("Walker seg_id # {} is beyond the max count of {} walkers.".
                  format(seg_id, self.current.walkers))
            return 1
        pi = self.progress.indicator
        with pi:
            pi.new_operation(
                'Tracing scheme:iter:seg_id {}:{}:{}'.format(
                    self.scheme, self.iteration, seg_id), self.iteration)
            current = {
                'seg_id': [],
                'pcoord': [],
                'states': [],
                'weights': [],
                'iteration': [],
                'bins': []
            }
            # Names of the auxiliary datasets to follow along the trace.
            # Auxdata is optional, so this probe is deliberately best-effort.
            keys = []
            try:
                current['auxdata'] = {}
                for key in list(self.current['auxdata'].keys()):
                    current['auxdata'][key] = []
                    keys.append(key)
            except Exception:
                pass
            # Walk from the current iteration back towards iteration 1,
            # following parent ids.
            for n_iter in range(self.iteration, 0, -1):
                iter_group = self.data_reader.get_iter_group(n_iter)
                current['pcoord'].append(iter_group['pcoord'][seg_id, :, :])
                current['states'].append(
                    self.assign['trajlabels'][n_iter - 1, seg_id, :])
                current['bins'].append(
                    self.assign['assignments'][n_iter - 1, seg_id, :])
                current['seg_id'].append(seg_id)
                current['weights'].append(
                    iter_group['seg_index']['weight'][seg_id])
                current['iteration'].append(n_iter)
                try:
                    for key in keys:
                        current['auxdata'][key].append(
                            iter_group['auxdata'][key][seg_id])
                except Exception:
                    # Best-effort: a missing aux dataset must not abort the trace.
                    pass
                seg_id = iter_group['seg_index']['parent_id'][seg_id]
                if seg_id < 0:
                    # Necessary for steady state simulations.  This means they started in that iteration.
                    break
                pi.progress += 1
        # Everything was collected newest-first; flip it into chronological order.
        current['seg_id'] = current['seg_id'][::-1]
        current['iteration'] = current['iteration'][::-1]
        current['states'] = np.concatenate(current['states'][::-1])
        current['bins'] = np.concatenate(current['bins'][::-1])
        current['weights'] = np.array(current['weights'][::-1])
        current['pcoord'] = np.concatenate(current['pcoord'][::-1])
        try:
            for key in keys:
                current['auxdata'][key] = np.concatenate(
                    current['auxdata'][key][::-1])
        except Exception:
            # Best-effort: leave aux data as the collected lists if it cannot be joined.
            pass
        current['state_labels'] = self.assign['state_labels']
        for i in ['pcoord', 'states', 'bins', 'weights']:
            current[i] = WIPIDataset(raw=current[i], key=i)
            if i == 'weights':
                # Weights are plotted on a log10 scale.
                plot_data = np.log10(current[i].raw)
                plot_label = str('log10 of ' + str(i))
            else:
                plot_data = current[i].raw
                plot_label = i
            current[i].plotter = Plotter(plot_data,
                                         plot_label,
                                         iteration=current[i].raw.shape[0],
                                         interface=self.interface)
            current[i].plot = current[i].plotter.plot
        # NOTE(review): seg_id has been overwritten by the last parent id read in
        # the loop above, so the key below is not the seg_id passed in.  Kept
        # as-is; confirm whether the original seg_id was intended.
        return WIPIDataset(raw=current, key=seg_id)

    @property
    def future(self, value=None):
        '''
        Similar to current/past, but keyed differently and returns different datasets.
        See help for Future.
        '''
        # Reuse the cached object when there is one; it is rebuilt only after
        # something has reset self._future to None.
        if self._future is not None:
            return self._future
        children = self.__get_children__()
        self._future = self.Future(raw=children, key=None)
        self._future.iteration = self.iteration + 1
        return self._future

    class Future(WIPIDataset):
        """
        Dataset wrapper used for the ``future`` property.

        Indexing with a string returns that entry of the raw data; indexing
        with an integer returns a new WIPIDataset holding the entries stored
        at that index.
        """

        # Raw entries that are indexed per segment when an integer is given.
        _PER_SEGMENT_FIELDS = ('pcoord', 'states', 'bins', 'parents',
                               'seg_id', 'weights')

        def __getitem__(self, value):
            """
            Look up ``value`` in the raw data.

            A string is treated as a key of the raw data; if the lookup fails a
            message is printed and None is returned.  An integer (builtin or
            any NumPy integer type) is treated as an index into each
            per-segment entry.  Any other type returns None.
            """
            if isinstance(value, str):
                try:
                    return self.__dict__['raw'][value]
                except (LookupError, TypeError):
                    print('{} is not a valid data structure.'.format(value))
                    return None
            if isinstance(value, (int, np.integer)):
                # Otherwise, we assume they're trying to index for a seg_id.
                raw = self.__dict__['raw']
                current = {
                    field: raw[field][value]
                    for field in self._PER_SEGMENT_FIELDS
                }
                current['auxdata'] = {}
                try:
                    for key in list(raw['auxdata'].keys()):
                        current['auxdata'][key] = raw['auxdata'][key][value]
                except (LookupError, TypeError, AttributeError):
                    # Auxiliary data is optional; leave whatever was gathered.
                    pass
                return WIPIDataset(
                    current,
                    'Segment {} in Iter {}'.format(value, self.iteration))
            return None

    def __get_children__(self):
        '''
        Gather, for each walker of the current iteration, the data of its children
        in the next iteration.  Used to build the future object.

        Returns 0 (after printing a message) when the current iteration is already
        the last one; otherwise returns a dict of per-walker lists plus the next
        iteration's 'summary' and 'walkers' entries.
        '''
        if self.iteration == self.niters:
            message = "Currently at iteration {}, which is the max.  There are no children!"
            print(message.format(self.iteration))
            return 0

        next_iter = self.iteration + 1
        iter_data = __get_data_for_iteration__(value=next_iter, parent=self)
        future = {
            'weights': [],
            'pcoord': [],
            'parents': [],
            'summary': iter_data['summary'],
            'seg_id': [],
            'walkers': iter_data['walkers'],
            'states': [],
            'bins': [],
        }
        per_walker_fields = ('weights', 'pcoord', 'parents', 'seg_id',
                             'states', 'bins')
        for seg_id in range(0, self.current.walkers):
            children = np.where(iter_data['parents'] == seg_id)[0]
            if len(children) == 0:
                # Keep every list aligned with seg_id by storing a message
                # in place of the missing data.
                error = "No children for seg_id {}.".format(seg_id)
                for field in per_walker_fields:
                    future[field].append(error)
                continue

            future['weights'].append(iter_data['weights'][children])
            future['pcoord'].append(iter_data['pcoord'][...][children, :, :])
            # NOTE(review): this stores under 'aux_data', while Future.__getitem__
            # reads 'auxdata'; and because 'aux_data' is first created as a bare
            # value (not a list), later appends may fail and overwrite it.
            # Behaviour kept as-is -- confirm the intent.
            try:
                aux_data = iter_data['auxdata'][...][children, :, :]
            except Exception:
                pass
            else:
                try:
                    future['aux_data'].append(aux_data)
                except Exception:
                    future['aux_data'] = aux_data
            future['parents'].append(iter_data['parents'][children])
            future['seg_id'].append(iter_data['seg_id'][children])
            future['states'].append(
                self.assign['trajlabels'][next_iter - 1, children, :])
            future['bins'].append(
                self.assign['assignments'][next_iter - 1, children, :])
        return future

    def go(self):
        '''
        Function automatically called by main() when launched via the command line interface.
        Generally, call main, not this function.
        '''
        w = self

        banner = "Welcome to w_ipa (WESTPA Interactive Python Analysis) v. {}!"
        print("")
        print(banner.format(w.version))
        print(
            "Run w.introduction for a more thorough introduction, or w.help to see a list of options."
        )
        print("Running analysis & loading files.")

        self.data_reader.open()
        self.analysis_structure()
        # The reader is opened a second time before the iteration count is read.
        # Seems to be consistent with other tools, such as w_assign.
        self.data_reader.open()
        self.niters = self.data_reader.current_iteration - 1
        self.iteration = self.niters

        summary = 'Your current scheme, system and iteration are : {}, {}, {}'
        try:
            print(summary.format(w.scheme, os.getcwd(), w.iteration))
        except Exception:
            pass

    @property
    def introduction(self):
        '''
        Just spits out an introduction, in case someone doesn't call help.
        '''
        help_string = '''
        Call as a dictionary item or a .attribute:

        w.past, w.current, w.future:

            {current}

        Raw schemes can be accessed as follows:

            w.scheme.{scheme_keys}

            and contain mostly the same datasets associated with w.

        The following give raw access to the h5 files associated with the current scheme

        w.west
        w.assign
        w.direct
        w.reweight

        OTHER:

        {w}

        '''.format(
            current=self.__format_keys__(self.current.__dir__(),
                                         split=' ',
                                         offset=12),
            scheme_keys=self.__format_keys__(list(self._scheme.raw.keys())),
            w=self.__format_keys__(self.__dir__(),
                                   offset=8,
                                   max_length=0,
                                   split='',
                                   prepend='w.'),
        )
        print(help_string)

    # Just a little function to be used with the introduction.
    def __format_keys__(self,
                        keys,
                        split='/',
                        offset=0,
                        max_length=80,
                        prepend=''):
        """
        Join ``keys`` into one string for display.

        Each key is written as ``prepend + str(key) + split``.  A running count
        of the key lengths is kept; once it reaches ``max_length`` a newline
        followed by ``offset`` spaces is inserted and the count restarts at
        ``offset``.  One trailing ``split`` is removed from the result.

        An empty ``keys`` yields an empty string.
        """
        pieces = []
        run_length = 0
        for key in keys:
            text = str(key)
            pieces.append(prepend + text + split)
            run_length += len(text)
            if run_length >= max_length:
                run_length = offset
                pieces.append('\n' + ' ' * offset)
        rtn = ''.join(pieces)
        # Guard on ``split``: every string ends with '', and slicing with
        # [:-0] would wipe out the whole result.
        if split and rtn.endswith(split):
            return rtn[:-len(split)]
        return rtn

    @property
    def help(self):
        ''' Convenience shortcut: runs the builtin help() on this object. '''
        target = self
        help(target)

    def _repr_pretty_(self, p, cycle):
        """Print the introduction and return a single space; ``p`` and ``cycle`` are unused."""
        _ = self.introduction  # property access; evaluated for the text it prints
        return " "

    def __dir__(self):
        """Return the sorted list of names advertised for this object."""
        # For the moment, don't expose direct, reweight, or assign, as these are scheme dependent files.
        # They do exist, and always link to the current scheme, however.
        exposed = {
            'past', 'current', 'future',
            'iteration', 'niters', 'scheme', 'list_schemes',
            'bin_labels', 'state_labels', 'west', 'trace',
        }
        return sorted(exposed)
Ejemplo n.º 8
0
class WEDDist(WESTParallelTool):
    prog = 'w_eddist'
    description = '''\
Calculate time-resolved transition-event duration distribution from kinetics results


-----------------------------------------------------------------------------
Source data
-----------------------------------------------------------------------------

Source data is collected from the results of 'w_kinetics trace' (see w_kinetics trace --help for
more information on generating this dataset).


-----------------------------------------------------------------------------
Histogram binning
-----------------------------------------------------------------------------

By default, histograms are constructed with 100 bins in each dimension. This
can be overridden by specifying -b/--bins, which accepts a number of different
kinds of arguments:

  a single integer N
    N uniformly spaced bins will be used in each dimension.

  a sequence of integers N1,N2,... (comma-separated)
    N1 uniformly spaced bins will be used for the first dimension, N2 for the
    second, and so on.

  a list of lists [[B11, B12, B13, ...], [B21, B22, B23, ...], ...]
    The bin boundaries B11, B12, B13, ... will be used for the first dimension,
    B21, B22, B23, ... for the second dimension, and so on. These bin
    boundaries need not be uniformly spaced. These expressions will be
    evaluated with Python's ``eval`` construct, with ``np`` available for
    use [e.g. to specify bins using np.arange()].

The first two forms (integer, list of integers) will trigger a scan of all
data in each dimension in order to determine the minimum and maximum values,
which may be very expensive for large datasets. This can be avoided by
explicitly providing bin boundaries using the list-of-lists form.

Note that these bins are *NOT* at all related to the bins used to drive WE
sampling.


-----------------------------------------------------------------------------
Output format
-----------------------------------------------------------------------------

The output file produced (specified by -o/--output, defaulting to "eddist.h5")
may be fed to plothist to generate plots (or appropriately processed text or
HDF5 files) from this data. In short, the following datasets are created:

  ``histograms``
    Normalized histograms. The first axis corresponds to iteration, and
    remaining axes correspond to dimensions of the input dataset.

  ``/binbounds_0``
    Vector of bin boundaries for the first (index 0) dimension. Additional
    datasets similarly named (/binbounds_1, /binbounds_2, ...) are created
    for additional dimensions.

  ``/midpoints_0``
    Vector of bin midpoints for the first (index 0) dimension. Additional
    datasets similarly named are created for additional dimensions.

  ``n_iter``
    Vector of iteration numbers corresponding to the stored histograms (i.e.
    the first axis of the ``histograms`` dataset).


-----------------------------------------------------------------------------
Subsequent processing
-----------------------------------------------------------------------------

The output generated by this program (-o/--output, default "eddist.h5") may be
plotted by the ``plothist`` program. See ``plothist --help`` for more
information.


-----------------------------------------------------------------------------
Parallelization
-----------------------------------------------------------------------------

This tool supports parallelized binning, including reading of input data.
Parallel processing is the default. For simple cases (reading pre-computed
input data, modest numbers of segments), serial processing (--serial) may be
more efficient.


-----------------------------------------------------------------------------
Command-line options
-----------------------------------------------------------------------------

'''

    def __init__(self):
        """Set up the tool with parallel processing as the default and all state unset."""
        super().__init__()

        # Parallel processing by default (this is not actually necessary, but it is
        # informative!)
        self.wm_env.default_work_manager = self.wm_env.default_parallel_work_manager

        # State used throughout the tool
        self.progress = ProgressIndicatorComponent()
        self.default_kinetics_file = 'kintrace.h5'
        self.kinetics_filename = self.kinetics_file = None
        self.istate = self.fstate = None
        # Duration and weight dsspecs
        self.duration_dsspec = self.wt_dsspec = None
        self.binspec = None
        self.output_filename = self.output_file = None

        # State used during histogram generation only
        self.iter_start = self.iter_stop = None
        self.ndim = self.dset_dtype = None
        # Per-dimension bin boundaries, bin midpoints and (min, max) data ranges
        self.binbounds = self.midpoints = self.data_range = None
        self.ignore_out_of_range = self.compress_output = False

    def add_args(self, parser):
        """
        Register this tool's command-line options on ``parser``: binning and
        compression switches, the required initial/final state indices, the
        iteration range, the input/output files and the progress options.
        """
        add_option = parser.add_argument

        add_option('-b', '--bins', dest='bins', metavar='BINEXPR', default='100',
                   help='''Use BINEXPR for bins. This may be an integer, which will be used for each
                            dimension of the progress coordinate; a list of integers (formatted as [n1,n2,...])
                            which will use n1 bins for the first dimension, n2 for the second dimension, and so on;
                            or a list of lists of boundaries (formatted as [[a1, a2, ...], [b1, b2, ...], ... ]), which
                            will use [a1, a2, ...] as bin boundaries for the first dimension, [b1, b2, ...] as bin boundaries
                            for the second dimension, and so on. (Default: 100 bins in each dimension.)''')

        add_option('-C', '--compress', action='store_true',
                   help='''Compress histograms. May make storage of higher-dimensional histograms
                            more tractable, at the (possible extreme) expense of increased analysis time.
                            (Default: no compression.)''')

        add_option('--loose', dest='ignore_out_of_range', action='store_true',
                   help='''Ignore values that do not fall within bins. (Risky, as this can make buggy bin
                            boundaries appear as reasonable data. Only use if you are
                            sure of your bin boundary specification.)''')

        # Both state indices are mandatory.
        add_option('--istate', type=int, required=True, dest='istate',
                   help='''Initial state defining transition event''')
        add_option('--fstate', type=int, required=True, dest='fstate',
                   help='''Final state defining transition event''')

        iter_group = parser.add_argument_group('iteration range options')
        iter_group.add_argument('--first-iter', default=1, dest='iter_start', type=int,
                                help='''Iteration to begin analysis (default: 1)''')
        iter_group.add_argument('--last-iter', dest='iter_stop', type=int,
                                help='''Iteration to end analysis''')

        io_group = parser.add_argument_group('input/output options')
        # The default kinetics file name is read from self.default_kinetics_file.
        io_group.add_argument('-k', '--kinetics', default=self.default_kinetics_file,
                              help='''Populations and transition rates (including evolution) are stored in KINETICS
                            (default: %(default)s).''')
        io_group.add_argument('-o', '--output', dest='output', default='eddist.h5',
                              help='''Store results in OUTPUT (default: %(default)s).''')

        self.progress.add_args(parser)

    def process_args(self, args):
        """
        Copy the parsed options onto the instance and open the kinetics file
        read-only.

        When no last iteration is given, ``iter_stop`` is taken from the
        kinetics file's ``iter_stop`` attribute; otherwise it is the requested
        last iteration plus one.
        """
        self.progress.process_args(args)

        self.kinetics_filename = args.kinetics
        self.istate, self.fstate = args.istate, args.fstate
        self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')

        self.iter_start = args.iter_start
        self.iter_stop = (self.kinetics_file.attrs['iter_stop']
                          if args.iter_stop is None else args.iter_stop + 1)

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False

    def go(self):
        """
        Build the duration histograms and write them to the output file.

        Reads the ``durations`` dataset of the kinetics file for the selected
        iteration range, keeps only the events going from ``istate`` to
        ``fstate``, constructs the bins and the histogram, and records the bin
        boundaries, midpoints and iteration range in the output file.  The
        output file is closed even if one of these steps raises.
        """
        pi = self.progress.indicator
        pi.operation = 'Initializing'
        with pi:
            self.duration = self.kinetics_file['durations'][
                self.iter_start - 1:self.iter_stop - 1]

            # Only select transition events from specified istate to fstate
            mask = ((self.duration['istate'] == self.istate)
                    & (self.duration['fstate'] == self.fstate))

            self.duration_dsspec = DurationDataset(
                self.kinetics_file['durations']['duration'], mask,
                self.iter_start)
            self.wt_dsspec = DurationDataset(
                self.kinetics_file['durations']['weight'], mask,
                self.iter_start)

            self.output_file = h5py.File(self.output_filename, 'w')
            try:
                h5io.stamp_creator_data(self.output_file)

                # Construct bin boundaries
                self.construct_bins(self.parse_binspec(self.binspec))
                for idim, (binbounds, midpoints) in enumerate(
                        zip(self.binbounds, self.midpoints)):
                    self.output_file['binbounds_{}'.format(idim)] = binbounds
                    self.output_file['midpoints_{}'.format(idim)] = midpoints

                # construct histogram
                self.construct_histogram()

                # Record iteration range
                iter_range = np.arange(
                    self.iter_start,
                    self.iter_stop,
                    1,
                    dtype=(np.min_scalar_type(self.iter_stop)))
                self.output_file['n_iter'] = iter_range
                self.output_file['histograms'].attrs[
                    'iter_start'] = self.iter_start
                self.output_file['histograms'].attrs[
                    'iter_stop'] = self.iter_stop
            finally:
                # Release the HDF5 handle on failure as well as on success.
                self.output_file.close()

    @staticmethod
    def parse_binspec(binspec):
        '''Evaluate a bin specification string and return the resulting object.

        ``binspec`` is evaluated as a Python expression in a namespace that
        provides ``numpy``, ``np`` and ``inf``. The evaluated object is
        returned unchanged and is also logged at DEBUG level.

        Raises ``ValueError`` if evaluation fails for any reason; the original
        exception is attached as ``__cause__``.
        '''

        # SECURITY NOTE: eval() executes arbitrary Python code. This is only
        # acceptable while ``binspec`` comes from a trusted source (e.g. the
        # invoking user's own command line); never pass untrusted text here.
        namespace = {'numpy': np, 'np': np, 'inf': float('inf')}

        try:
            binspec_compiled = eval(binspec, namespace)
        except Exception as e:
            # Chain explicitly so the traceback shows the underlying failure
            # as the direct cause of the ValueError.
            raise ValueError(
                'invalid bin specification: {!r}'.format(e)) from e
        else:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('bin specs: {!r}'.format(binspec_compiled))
        return binspec_compiled

    def construct_bins(self, bins):
        '''
        Build the bin boundaries described by ``bins`` and store them on the
        instance. ``bins`` may take one of three forms:

          1) a single non-iterable value: that many bins in every dimension;
          2) a sequence whose first element is not iterable: one bin count per
             dimension;
          3) a sequence of sequences: explicit bin boundaries per dimension.

        The form is recognised by inspecting ``bins`` and, for sequences,
        ``bins[0]``. On return ``self.binbounds`` and ``self.midpoints`` have
        been populated by the matching ``_construct_bins_from_*`` helper.
        '''

        if isiterable(bins):
            # The first element tells bin counts apart from boundary lists.
            if isiterable(bins[0]):
                builder = self._construct_bins_from_bound_seqs
            else:
                builder = self._construct_bins_from_int_seq
        else:
            builder = self._construct_bins_from_scalar
        builder(bins)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('binbounds: {!r}'.format(self.binbounds))

    def scan_data_shape(self):
        '''Record the dimensionality and dtype of the duration data.

        Does nothing when ``self.ndim`` is already set. Otherwise ``self.ndim``
        is fixed at 1 (it is not read from the dataset's shape) and
        ``self.dset_dtype`` is taken from ``self.duration_dsspec.dtype``.
        '''
        if self.ndim is not None:
            return

        source = self.duration_dsspec
        self.ndim = 1
        self.dset_dtype = source.dtype

    def scan_data_range(self):
        '''Find the overall (min, max) of the duration data in each dimension.

        One ``_remote_min_max`` task per iteration in
        ``[self.iter_start, self.iter_stop)`` is submitted through the work
        manager. Each result is indexed as ``bounds[idim][0]`` (minimum) and
        ``bounds[idim][1]`` (maximum) and folded into ``self.data_range``, a
        list holding one ``(min, max)`` tuple per dimension. The number of
        dimensions and the dtype come from ``scan_data_shape()``.
        '''

        n_iters = self.iter_stop - self.iter_start
        self.progress.indicator.new_operation('Scanning for data range',
                                              n_iters)
        self.scan_data_shape()

        dtype = self.dset_dtype
        ndim = self.ndim
        durations = self.duration_dsspec

        # Extreme representable values of the dtype; np.finfo rejects
        # non-floating dtypes with ValueError, so fall back to np.iinfo.
        try:
            limits = np.finfo(dtype)
        except ValueError:
            limits = np.iinfo(dtype)
        lowest, highest = limits.min, limits.max

        # Start with an inverted range (min at the largest value, max at the
        # smallest) so that the first real result replaces both ends.
        data_range = self.data_range = [(highest, lowest)] * self.ndim

        tasks = ((_remote_min_max, (ndim, dtype, n_iter, durations), {})
                 for n_iter in range(self.iter_start, self.iter_stop))
        for future in self.work_manager.submit_as_completed(
                tasks, self.max_queue_len):
            bounds = future.get_result(discard=True)
            for idim, (seen_min, seen_max) in enumerate(data_range):
                data_range[idim] = (min(seen_min, bounds[idim][0]),
                                    max(seen_max, bounds[idim][1]))
            self.progress.indicator.progress += 1

    def _construct_bins_from_scalar(self, bins):
        '''Create ``bins`` equal-width bins in every dimension.

        The bins span the scanned data range; ``scan_data_range()`` is run
        first when ``self.data_range`` has not been determined yet. Results are
        stored as lists of arrays in ``self.binbounds`` (``bins + 1`` edges per
        dimension) and ``self.midpoints`` (bin centres).
        '''
        if self.data_range is None:
            self.scan_data_range()

        edges_by_dim = self.binbounds = []
        centers_by_dim = self.midpoints = []
        for idim in range(self.ndim):
            lower, upper = self.data_range[idim]
            # Stretch the upper edge by 1% so that the maximum value falls
            # inside the last bin. NOTE(review): this only moves the edge
            # upward when the maximum is positive.
            upper *= 1.01

            edges = np.linspace(lower, upper, bins + 1)
            edges_by_dim.append(edges)
            centers_by_dim.append((edges[:-1] + edges[1:]) / 2.0)

    def _construct_bins_from_int_seq(self, bins):
        '''Create equal-width bins with a separate bin count per dimension.

        ``bins[idim]`` is the number of bins for dimension ``idim``. The bins
        span the scanned data range; ``scan_data_range()`` is run first when
        ``self.data_range`` has not been determined yet. Results are stored as
        lists of arrays in ``self.binbounds`` and ``self.midpoints``.
        '''
        if self.data_range is None:
            self.scan_data_range()

        self.binbounds = all_edges = []
        self.midpoints = all_centers = []
        for idim in range(self.ndim):
            range_min, range_max = self.data_range[idim]
            # Push the top edge 1% past the maximum so the largest value is
            # counted. NOTE(review): this only moves the edge upward when the
            # maximum is positive.
            range_max *= 1.01

            n_edges = bins[idim] + 1
            edges = np.linspace(range_min, range_max, n_edges)
            centers = (edges[:-1] + edges[1:]) / 2.0
            all_edges.append(edges)
            all_centers.append(centers)

    def _construct_bins_from_bound_seqs(self, bins):
        '''Adopt explicitly specified bin boundaries.

        ``bins`` is an iterable with one sequence of boundaries per dimension.
        Each sequence is converted to an array and must be strictly
        increasing. The arrays are stored in ``self.binbounds`` and the
        centres of adjacent boundaries in ``self.midpoints``.

        Raises ``ValueError`` for a boundary set that is not strictly
        increasing.
        '''
        self.binbounds = []
        self.midpoints = []
        for raw_bounds in bins:
            edges = np.asarray(raw_bounds)
            # Any non-positive step means a repeated or decreasing boundary.
            if np.any(np.diff(edges) <= 0):
                message = 'boundary set {!r} is not strictly monotonically increasing'
                raise ValueError(message.format(edges))
            centers = (edges[:-1] + edges[1:]) / 2.0
            self.binbounds.append(edges)
            self.midpoints.append(centers)

    def construct_histogram(self):
        '''Construct a histogram using bins previously constructed with ``construct_bins()``.

        Creates the ``histograms`` dataset in ``self.output_file`` with one row
        per iteration in ``[self.iter_start, self.iter_stop)`` and one axis per
        entry of ``self.binbounds`` (``len(bounds) - 1`` bins each), then fills
        each row with the result of a ``_remote_bin_iter`` task dispatched
        through the work manager.

        NOTE(review): the original docstring states that each histogram in the
        time series is normalized; any normalization would happen inside
        ``_remote_bin_iter``, which is not visible here -- confirm.
        '''

        self.scan_data_shape()

        iter_count = self.iter_stop - self.iter_start
        # float64 dataset; compression level 9 is requested only when
        # self.compress_output is truthy.
        histograms_ds = self.output_file.create_dataset(
            'histograms',
            dtype=np.float64,
            shape=((iter_count, ) +
                   tuple(len(bounds) - 1 for bounds in self.binbounds)),
            compression=9 if self.compress_output else None,
        )
        # Boundaries as C-contiguous arrays of the data's dtype, as handed to
        # the binning tasks.
        binbounds = [
            np.require(boundset, self.dset_dtype, 'C')
            for boundset in self.binbounds
        ]

        self.progress.indicator.new_operation('Constructing histograms',
                                              self.iter_stop - self.iter_start)
        # Lazily generated (callable, args, kwargs) task tuples. ``iiter`` is
        # the zero-based row in the output dataset, ``n_iter`` the iteration
        # number. An earlier, commented-out version of this code passed a
        # variable named ``initpoint`` in the position of the constant 0.
        task_gen = (
            (_remote_bin_iter,
             (iiter, n_iter, self.duration_dsspec, self.wt_dsspec, 0,
              binbounds, self.ignore_out_of_range), {})
            for (iiter,
                 n_iter) in enumerate(range(self.iter_start, self.iter_stop)))

        log.debug('max queue length: {!r}'.format(self.max_queue_len))
        # Results are consumed as they are yielded by submit_as_completed;
        # each result carries its own row index, so arrival order is not
        # relied upon.
        for future in self.work_manager.submit_as_completed(
                task_gen, self.max_queue_len):
            iiter, n_iter, iter_hist = future.get_result(discard=True)
            self.progress.indicator.progress += 1

            # store histogram
            histograms_ds[iiter] = iter_hist
            # Drop the references before the next result is fetched.
            del iter_hist, future