Example 1
 def stamp_hash(self, h5file_name, new_hash):
     '''Loads a file, stamps it with the given hash, and returns the file reopened read-only.'''
     h5file = h5io.WESTPAH5File(h5file_name, 'r+')
     h5file.attrs['arg_hash'] = new_hash
     h5file.close()
     h5file = h5io.WESTPAH5File(h5file_name, 'r')
     return h5file
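
For reference, a minimal standalone sketch of the same stamp-then-reopen pattern, assuming westpa is installed and an analysis.h5 file already exists; the hashlib digest of an argument list stands in for whatever hash the caller actually computes:

    import hashlib

    from westpa import h5io

    def stamp_and_reopen(h5file_name, new_hash):
        # Write the hash attribute in read/write mode, then hand back a read-only handle.
        with h5io.WESTPAH5File(h5file_name, 'r+') as h5file:
            h5file.attrs['arg_hash'] = new_hash
        return h5io.WESTPAH5File(h5file_name, 'r')

    args = ['--evolution', 'cumulative', '--step-iter', '1']
    new_hash = hashlib.sha256(' '.join(args).encode()).hexdigest()
    analysis_file = stamp_and_reopen('analysis.h5', new_hash)  # 'analysis.h5' is an illustrative name
    assert analysis_file.attrs['arg_hash'] == new_hash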
Example 2
 def open_files(self):
     self.output_file = h5io.WESTPAH5File(self.output_filename, 'a', creating_program=True)
     h5io.stamp_creator_data(self.output_file)
     self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')#, driver='core', backing_store=False)
     self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')#, driver='core', backing_store=False)
     if not self.iter_range.check_data_iter_range_least(self.assignments_file):
         raise ValueError('assignments data do not span the requested iterations')
Example 3
 def call_analysis(self, name):
     # This works for either analysis routine.  We tend to get slightly different answers from the reweighting routine (which is okay!), but they're usually the same.
     # u_to_b and b_to_u should be 'known good values' for each tool.
     analysis_config = {
         'west': 'odld/{}.h5'.format('west'),
         'assignments': os.path.join(self.path, '{}.h5'.format('assign')),
         'output': os.path.join(self.path, '{}.h5'.format(name)),
         'kinetics': os.path.join(self.path, '{}.h5'.format(name))
     }
     analysis_config.update({'step_iter': 1, 'e': 'cumulative'})
     if name == 'direct':
         analysis = w_direct.WDirect()
     if name == 'reweight':
         analysis = w_reweight.WReweight()
     args = ['all']
     for key, value in analysis_config.items():
         args.append(str('--') + str(key).replace('_', '-'))
         args.append(str(value))
     # Don't print averages, and don't run bootstrap.  We're not testing the bootstrap, after all, just the estimator routines.
     args.append('--disable-averages')
     args.append('-db')
     analysis.make_parser_and_process(args=args)
     analysis.work_manager = work_managers.SerialWorkManager()
     analysis.go()
     # Load the output and test it against the known good file.
     output = h5io.WESTPAH5File(
         os.path.join(self.path, '{}.h5'.format(name)), 'r')
     test = h5io.WESTPAH5File(os.path.join('odld/{}.h5'.format(name)), 'r')
     # check rate evolution!
     # If our rates agree, we can generally assume that our other values are good, too.
     orates = output['rate_evolution'][-1, :, :]['expected']
     trates = test['rate_evolution'][-1, :, :]['expected']
     assert abs(orates[0, 1] - trates[0, 1]) <= EPS
     assert abs(orates[1, 0] - trates[1, 0]) <= EPS
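
For clarity on what the argument-building loop produces, a small sketch under the same conventions (keys become '--key' flags, underscores become dashes); the dictionary is a trimmed-down version of the config defined above:

    analysis_config = {'west': 'odld/west.h5', 'step_iter': 1, 'e': 'cumulative'}

    args = ['all']
    for key, value in analysis_config.items():
        args.append('--' + str(key).replace('_', '-'))
        args.append(str(value))
    args.extend(['--disable-averages', '-db'])

    print(' '.join(args))
    # -> all --west odld/west.h5 --step-iter 1 --e cumulative --disable-averages -db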
Example 4
 def process_args(self, args):
     self.output_file = h5io.WESTPAH5File(args.output,
                                          'w',
                                          creating_program=True)
     self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
     # Force a build of the transition matrix at the iteration level.
     self.sampling_frequency = ('iteration' if self.assignments_file.attrs['subsampled'] == True
                                else args.sampling_frequency)
Example 5
 def process_args(self, args):
     self.progress.process_args(args)
     self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
     self.data_reader.process_args(args)
     with self.data_reader:
         self.iter_range.process_args(args)
     self.output_file = h5io.WESTPAH5File(args.output,
                                          'w',
                                          creating_program=True)
     h5io.stamp_creator_data(self.output_file)
     if not self.iter_range.check_data_iter_range_least(
             self.assignments_file):
         raise ValueError(
             'assignments do not span the requested iterations')
     self.do_compression = args.compression
Example 6
    def open_files(self):
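        # self.output_file holds the output filename at this point; it is replaced below by the open file object.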
        self.output_file = h5io.WESTPAH5File(self.output_file,
                                             'w',
                                             creating_program=True)
        h5io.stamp_creator_data(self.output_file)

        opened_files = self.generate_file_list([self.west])
        self.westH5 = opened_files[self.west]
Example 7
    def process_args(self, args):
        self.progress.process_args(args)
        self.kinetics_filename = args.kinetics
        self.istate = args.istate
        self.fstate = args.fstate
        self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')

        self.iter_start = args.iter_start
        if args.iter_stop is None:
            self.iter_stop = self.kinetics_file.attrs['iter_stop']
        else:
            self.iter_stop = args.iter_stop + 1

        self.binspec = args.bins
        self.output_filename = args.output
        self.ignore_out_of_range = bool(args.ignore_out_of_range)
        self.compress_output = args.compress or False
Example 8
    def generate_file_list(self, key_list):
        '''A convenience function that takes a list of filenames and returns a nested dictionary
        of the opened files, keyed first by filename and then by trial number.'''
        return_dict = {}
        if self.ntrials == 0:
            raise self.NoSimulationsException(
                'You must specify the number of simulations.')

        for key in key_list:
            return_dict[key] = {}
        for i in range(1, self.ntrials + 1):
            # The two-digit, zero-padded trial directory name is hard-coded for now.
            for key in key_list:
                return_dict[key][i] = h5io.WESTPAH5File(
                    os.path.join(self.master,
                                 str(i).zfill(2), key), 'r')
        return return_dict
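
A sketch of a standalone equivalent and how it might be called, assuming a parent directory whose numbered subdirectories (01/, 02/, ...) each hold the named files; the directory layout and names are illustrative:

    import os

    from westpa import h5io

    def open_trial_files(master, ntrials, key_list):
        # {filename: {trial_number: open read-only file}}, mirroring generate_file_list above.
        return {
            key: {
                i: h5io.WESTPAH5File(os.path.join(master, str(i).zfill(2), key), 'r')
                for i in range(1, ntrials + 1)
            }
            for key in key_list
        }

    files = open_trial_files('multi_sims', ntrials=2, key_list=['west.h5'])
    print(files['west.h5'][1].filename)  # e.g. multi_sims/01/west.h5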
Example 9
def _find_matching_segments(west_datafile_name,
                            n_iter,
                            predicate,
                            invert=False):
    '''Find all segments in iteration ``n_iter`` that match (or do not match, if
    ``invert`` is true) the given ``predicate``. Returns a sequence of matching
    seg_ids.'''

    with h5io.WESTPAH5File(west_datafile_name, 'r') as west_datafile:
        iter_group = west_datafile.get_iter_group(n_iter)
        nsegs = iter_group['seg_index'].shape[0]
        matching_ids = set(map(int, predicate(n_iter, iter_group)))

        if invert:
            matching_ids = set(range(nsegs)) - matching_ids

        matchvec = sorted(
            numpy.fromiter(matching_ids,
                           dtype=seg_id_dtype,
                           count=len(matching_ids)))
        return n_iter, matchvec
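
A minimal sketch of a predicate compatible with this helper, assuming the standard per-iteration 'seg_index' table with a 'weight' field; the function name and threshold are illustrative:

    import numpy

    def heavy_segments(n_iter, iter_group, threshold=1e-3):
        # Return the seg_ids whose statistical weight exceeds the cutoff.
        weights = iter_group['seg_index']['weight']
        return numpy.flatnonzero(weights > threshold)

    # n_iter, seg_ids = _find_matching_segments('west.h5', 10, heavy_segments)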
Example 10
    def go(self):
        self.data_reader.open('r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))
        current_seg_count = 0
        seg_count_ds = output_file.create_dataset('n_segs',
                                                  dtype=numpy.uint,
                                                  shape=(iter_count, ))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, 0),
            maxshape=(iter_count, None),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, 0),
                                                maxshape=(iter_count, None),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, 1000000),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            #             futures = set()
            #             for n_iter in xrange(iter_start,iter_stop):
            #                 futures.add(self.work_manager.submit(_find_matching_segments,
            #                                                      args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))

            #             for future in self.work_manager.as_completed(futures):
            for future in self.work_manager.submit_as_completed(
                ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate,
                   self.invert), {})
                 for n_iter in range(iter_start, iter_stop)),
                    self.max_queue_len):
                n_iter, matching_ids = future.get_result()
                n_matches = len(matching_ids)

                if n_matches:
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    seg_count_ds[n_iter - iter_start] = n_matches
                    matching_segs_ds[n_iter - iter_start, :n_matches] = matching_ids
                    weights_ds[n_iter - iter_start, :n_matches] = self.data_reader.get_iter_group(
                        n_iter)['seg_index']['weight'][sorted(matching_ids)]
                del matching_ids
                pi.progress += 1

            if self.include_ancestors:
                pi.new_operation('Tracing ancestors of matching segments',
                                 extent=iter_count)
                from_previous = set()
                current_seg_count = matching_segs_ds.shape[1]
                for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                    iiter = n_iter - iter_start
                    n_matches = seg_count_ds[iiter]
                    matching_ids = set(from_previous)
                    if n_matches:
                        matching_ids.update(
                            matching_segs_ds[iiter, :seg_count_ds[iiter]])
                    from_previous.clear()

                    n_matches = len(matching_ids)
                    if n_matches > current_seg_count:
                        matching_segs_ds.resize((iter_count, n_matches))
                        weights_ds.resize((iter_count, n_matches))
                        current_seg_count = n_matches

                    if n_matches > 0:
                        seg_count_ds[iiter] = n_matches
                        matching_ids = sorted(matching_ids)
                        iter_seg_index = self.data_reader.get_iter_group(n_iter)['seg_index']
                        matching_segs_ds[iiter, :n_matches] = matching_ids
                        weights_ds[iiter, :n_matches] = iter_seg_index['weight'][matching_ids]
                        parent_ids = iter_seg_index['parent_id'][matching_ids]
                        from_previous.update(
                            parent_id for parent_id in parent_ids
                            if parent_id >= 0)  # filter initial states
                        del parent_ids
                    del matching_ids
                    pi.progress += 1
Example 11
    def analysis_structure(self):
        '''
        Run automatically on startup.  Parses the configuration file and loads the data files for the different
        analysis schemes.  If they don't exist, it creates them automatically by hooking into the existing analysis
        routines and going from there.

        It does this by calling the make_parser_and_process function for w_{assign,reweight,direct} with a custom-built
        list of args.  The user can specify everything in the configuration file that would have been specified on the command line.

        For instance, were one to call w_direct as follows:

            w_direct --evolution cumulative --step-iter 1 --disable-correl

        the west.cfg would look as follows:

        west:
          analysis:
            w_direct:
              evolution: cumulative
              step_iter: 1
              extra: ['disable-correl']

        Alternatively, if one wishes to use the same options for both w_direct and w_reweight, the key 'w_direct' can be replaced
        with 'kinetics'.
        '''
        # Make sure everything exists.
        try:
            os.mkdir(self.__settings['directory'])
        except:
            pass
        # Now, check to see whether they exist, and then load them.
        self.__analysis_schemes__ = {}
        # We really need to implement some sort of default behavior if an analysis scheme isn't set.
        # Right now, we just crash.  That isn't really graceful.
        for scheme in self.__settings['analysis_schemes']:
            if self.__settings['analysis_schemes'][scheme]['enabled']:
                if self.work_manager.running == False:
                    self.work_manager.startup()
                path = os.path.join(os.getcwd(), self.__settings['directory'], scheme)
                #if 'postanalysis' in self.__settings['analysis_schemes'][scheme] and 'postanalysis' in self.__settings['postanalysis']:
                # Should clean this up.  But it uses the default global setting if a by-scheme one isn't set.
                if 'postanalysis' in self.__settings:
                    if 'postanalysis' in self.__settings['analysis_schemes'][scheme]:
                        pass
                    else:
                        self.__settings['analysis_schemes'][scheme]['postanalysis'] = self.__settings['postanalysis']
                try:
                    os.mkdir(path)
                except:
                    pass
                self.__analysis_schemes__[scheme] = {}
                try:
                    if self.__settings['analysis_schemes'][scheme]['postanalysis'] == True or self.__settings['postanalysis'] == True:
                        analysis_files = ['assign', 'direct', 'reweight']
                    else:
                        analysis_files = ['assign', 'direct']
                except:
                    analysis_files = ['assign', 'direct']
                    self.__settings['analysis_schemes'][scheme]['postanalysis'] = False
                reanalyze_kinetics = False
                assign_hash = None
                for name in analysis_files:
                    arg_hash = None
                    if self.reanalyze == True:
                        reanalyze_kinetics = True
                        try:
                            os.remove(os.path.join(path, '{}.h5'.format(name)))
                        except:
                            pass
                    else:
                        try:
                            # Try to load the hash.  If we fail to load the hash or the file, we need to reload.
                            #if self.reanalyze == True:
                            #    raise ValueError('Reanalyze set to true.')
                            self.__analysis_schemes__[scheme][name] = h5io.WESTPAH5File(os.path.join(path, '{}.h5'.format(name)), 'r')
                            arg_hash = self.__analysis_schemes__[scheme][name].attrs['arg_hash']
                            if name == 'assign':
                                assign_hash = arg_hash
                        except:
                            pass
                            # We shouldn't rely on this.
                            # self.reanalyze = True
                    if True:
                        if name == 'assign':
                            assign = w_assign.WAssign()

                            w_assign_config = { 'output': os.path.join(path, '{}.h5'.format(name))}
                            try:
                                w_assign_config.update(self.__settings['w_assign'])
                            except:
                                pass
                            try:
                                w_assign_config.update(self.__settings['analysis_schemes'][scheme]['w_assign'])
                            except:
                                pass
                            args = []
                            for key,value in w_assign_config.items():
                                if key != 'extra':
                                    args.append(str('--') + str(key).replace('_', '-'))
                                    args.append(str(value))
                            # This is for stuff like disabling correlation analysis, etc.
                            if 'extra' in list(w_assign_config.keys()):
                                # We're sorting to ensure that the order doesn't matter.
                                for value in sorted(w_assign_config['extra']):
                                    args.append(str('--') + str(value).replace('_', '-'))
                            # We're just calling the built in function.
                            # This is a lot cleaner than what we had in before, and far more workable.
                            args.append('--config-from-file')
                            args.append('--scheme-name')
                            args.append('{}'.format(scheme))
                            # Why are we calling this if we're not sure we're remaking the file?
                            # We need to load up the bin mapper and states and see if they're the same.
                            assign.make_parser_and_process(args=args)
                            import pickle
                            #new_hash = self.hash_args(args=args, path=path, extra=[self.niters, pickle.dumps(assign.binning.mapper), assign.states])
                            # We need to encode it properly to ensure that some OS specific thing doesn't kill us.  Same goes for the args, ultimately.
                            # Mostly, we just need to ensure that we're consistent.
                            new_hash = self.hash_args(args=args, path=path,
                                                      extra=[int(self.niters),
                                                      codecs.encode(pickle.dumps(assign.binning.mapper), "base64"),
                                                      base64.b64encode(str(assign.states).encode())])
                            # Let's check the hash.  If the hash is the same, we don't need to reload.
                            if self.debug_mode == True:
                                print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                            if self.ignore_hash == False and (arg_hash != new_hash or self.reanalyze == True):
                                # If the hashes are different, or we need to reanalyze, delete the file.
                                try:
                                    os.remove(os.path.join(path, '{}.h5'.format(name)))
                                except:
                                    pass
                                print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                                #reanalyze_kinetics = True
                                # We want to use the work manager we have here.  Otherwise, just let the tool sort out what it needs, honestly.
                                assign.work_manager = self.work_manager

                                assign.go()
                                assign.data_reader.close()

                                # Stamp w/ hash, then reload as read only.
                                self.__analysis_schemes__[scheme][name] = self.stamp_hash(os.path.join(path, '{}.h5'.format(name)), new_hash)
                            del(assign)
                            # Update the assignment hash.
                            assign_hash = new_hash

                        # Since these are all contained within one tool, now, we want it to just... load everything.
                        if name == 'direct' or name == 'reweight':
                            assignment_file = self.__analysis_schemes__[scheme]['assign']
                            if name == 'direct':
                                analysis = w_direct.WDirect()
                            if name == 'reweight':
                                analysis = w_reweight.WReweight()
                            
                            analysis_config = { 'assignments': os.path.join(path, '{}.h5'.format('assign')), 'output': os.path.join(path, '{}.h5'.format(name)), 'kinetics': os.path.join(path, '{}.h5'.format(name))}

                            # Pull from general analysis options, then general SPECIFIC options for each analysis,
                            # then general options for that analysis scheme, then specific options for the analysis type in the scheme.

                            try:
                                analysis_config.update(self.__settings['kinetics'])
                            except:
                                pass
                            try:
                                analysis_config.update(self.__settings['w_{}'.format(name)])
                            except:
                                pass
                            try:
                                analysis_config.update(self.__settings['analysis_schemes'][scheme]['kinetics'])
                            except:
                                pass
                            try:
                                analysis_config.update(self.__settings['analysis_schemes'][scheme]['w_{}'.format(name)])
                            except:
                                pass

                            # We're pulling in a default set of arguments, then updating them with arguments from the west.cfg file, if appropriate, after setting the appropriate command
                            # Then, we call the magic function 'make_parser_and_process' with the arguments we've pulled in.
                            # The tool has no real idea it's being called outside of its actual function, and we're good to go.
                            args = ['all']
                            for key,value in analysis_config.items():
                                if key != 'extra':
                                    args.append(str('--') + str(key).replace('_', '-'))
                                    args.append(str(value))
                            # This is for stuff like disabling correlation analysis, etc.
                            if 'extra' in list(analysis_config.keys()):
                                for value in sorted(analysis_config['extra']):
                                    args.append(str('--') + str(value).replace('_', '-'))
                            # We want to not display the averages, so...
                            args.append('--disable-averages')
                            new_hash = self.hash_args(args=args, path=path, extra=[int(self.niters), assign_hash])
                            #if arg_hash != new_hash or self.reanalyze == True or reanalyze_kinetics == True:
                            if self.debug_mode == True:
                                print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                            if self.ignore_hash == False and (arg_hash != new_hash or reanalyze_kinetics == True):
                                try:
                                    os.remove(os.path.join(path, '{}.h5'.format(name)))
                                except:
                                    pass
                                print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                                analysis.make_parser_and_process(args=args)
                                # We want to hook into the existing work manager.
                                analysis.work_manager = self.work_manager

                                analysis.go()

                                # Open!
                                self.__analysis_schemes__[scheme][name] = self.stamp_hash(os.path.join(path, '{}.h5'.format(name)), new_hash)
                            del(analysis)

        # Make sure this doesn't get too far out, here.  We need to keep it alive as long as we're actually analyzing things.
        # self.work_manager.shutdown()
        print("")
        print("Complete!")
Example 12
    def go(self):
        self.data_reader.open('r')
        assignments_file = h5py.File(self.assignments_filename, mode='r')
        output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
        pi = self.progress.indicator
        count = self.count
        timepoint = self.timepoint

        nbins = assignments_file.attrs['nbins'] + 1
        assignments_ds = assignments_file['assignments']

        iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
        iter_count = iter_stop - iter_start
        h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
        nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(
            assignments_file['nsegs'], iter_start, iter_stop)]

        output_file.create_dataset('n_iter',
                                   dtype=n_iter_dtype,
                                   data=list(range(iter_start, iter_stop)))

        seg_count_ds = output_file.create_dataset('nsegs',
                                                  dtype=numpy.uint,
                                                  shape=(iter_count, nbins))
        matching_segs_ds = output_file.create_dataset(
            'seg_ids',
            shape=(iter_count, nbins, count),
            dtype=seg_id_dtype,
            chunks=h5io.calc_chunksize((iter_count, nbins, count),
                                       seg_id_dtype),
            shuffle=True,
            compression=9)
        weights_ds = output_file.create_dataset('weights',
                                                shape=(iter_count, nbins,
                                                       count),
                                                dtype=weight_dtype,
                                                chunks=h5io.calc_chunksize(
                                                    (iter_count, nbins, count),
                                                    weight_dtype),
                                                shuffle=True,
                                                compression=9)
        what = self.what

        with pi:
            pi.new_operation('Finding matching segments', extent=iter_count)
            for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
                assignments = numpy.require(
                    assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + numpy.index_exp[:, timepoint]],
                    dtype=westpa.binning.index_dtype)
                all_weights = self.data_reader.get_iter_group(
                    n_iter)['seg_index']['weight']

                # the following Cython function just executes this loop:
                #for iseg in xrange(nsegs[iiter]):
                #    segs_by_bin[iseg,assignments[iseg]] = True
                segs_by_bin = assignments_list_to_table(
                    nsegs[iiter], nbins, assignments)
                for ibin in range(nbins):
                    segs = numpy.nonzero(segs_by_bin[:, ibin])[0]

                    seg_count_ds[iiter, ibin] = min(len(segs), count)

                    if len(segs):
                        weights = all_weights.take(segs)

                        if what == 'lowweight':
                            indices = numpy.argsort(weights)[:count]
                        elif what == 'highweight':
                            indices = numpy.argsort(weights)[::-1][:count]
                        else:
                            assert what == 'random'
                            # Keep at most `count` random picks so the writes below fit the (iter, bin, count) datasets.
                            indices = numpy.random.permutation(len(weights))[:count]

                        matching_segs_ds[iiter, ibin, :len(segs)] = segs.take(indices)
                        weights_ds[iiter, ibin, :len(segs)] = weights.take(indices)
                        del segs, weights

                del assignments, segs_by_bin, all_weights
                pi.progress += 1
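
A short sketch of reading back the datasets written by this routine, assuming an output file produced with the dataset names above; the filename is illustrative:

    from westpa import h5io

    with h5io.WESTPAH5File('output.h5', 'r') as f:  # 'output.h5' is an illustrative name
        n_iters = f['n_iter'][:]       # iteration numbers covered by the analysis
        counts = f['nsegs'][:]         # (iteration, bin) -> number of segments stored
        seg_ids = f['seg_ids'][0]      # per-bin seg_ids for the first stored iteration
        weights = f['weights'][0]
        print(n_iters[0], int(counts[0].sum()))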
Example 13
from westpa import rc, h5io

data_manager = rc.get_data_manager()

# Store the west.h5 file in RAM for testing
west_file_name = 'west.h5'
west_file = h5io.WESTPAH5File(west_file_name,
                              driver='core',
                              backing_store=False)

data_manager.we_h5file = west_file
data_manager.we_h5file_version = int(
    west_file['/'].attrs['west_file_format_version'])