def stamp_hash(self, h5file_name, new_hash):
    '''Open ``h5file_name`` read/write, record ``new_hash`` in its
    ``arg_hash`` attribute, then reopen and return the file read-only.'''
    writable = h5io.WESTPAH5File(h5file_name, 'r+')
    writable.attrs['arg_hash'] = new_hash
    writable.close()
    # Hand the caller a read-only handle so the stamped file cannot be
    # modified further downstream.
    return h5io.WESTPAH5File(h5file_name, 'r')
def open_files(self):
    '''Open the output file for appending (stamped with creator data) and
    the assignments/kinetics inputs read-only, then verify the assignments
    cover the requested iteration range.'''
    self.output_file = h5io.WESTPAH5File(self.output_filename, 'a', creating_program=True)
    h5io.stamp_creator_data(self.output_file)
    # In-core (RAM-backed) driver variants are left disabled here.
    self.assignments_file = h5io.WESTPAH5File(self.assignments_filename, 'r')  # , driver='core', backing_store=False)
    self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')  # , driver='core', backing_store=False)
    if not self.iter_range.check_data_iter_range_least(self.assignments_file):
        raise ValueError('assignments data do not span the requested iterations')
def call_analysis(self, name):
    '''Run the kinetics tool named by ``name`` ('direct' or 'reweight') over
    the ODLD test data, then compare its final rate-evolution estimates
    against the stored 'known good' reference file.

    We tend to get slightly different answers from the reweighting routine
    (which is okay!), but they're usually the same.
    '''
    analysis_config = {
        'west': 'odld/{}.h5'.format('west'),
        'assignments': os.path.join(self.path, '{}.h5'.format('assign')),
        'output': os.path.join(self.path, '{}.h5'.format(name)),
        'kinetics': os.path.join(self.path, '{}.h5'.format(name)),
    }
    analysis_config.update({'step_iter': 1, 'e': 'cumulative'})
    if name == 'direct':
        analysis = w_direct.WDirect()
    if name == 'reweight':
        analysis = w_reweight.WReweight()
    args = ['all']
    # BUG FIX: dict.iteritems() is Python 2 only and raises AttributeError on
    # Python 3; use items() (the rest of this file is Python 3).
    for key, value in analysis_config.items():
        args.append('--' + str(key).replace('_', '-'))
        args.append(str(value))
    # Don't print averages, and don't run bootstrap. We're not testing the
    # bootstrap, after all, just the estimator routines.
    args.append('--disable-averages')
    args.append('-db')
    analysis.make_parser_and_process(args=args)
    analysis.work_manager = work_managers.SerialWorkManager()
    analysis.go()
    # Load the output and test it against the known good file.
    output = h5io.WESTPAH5File(os.path.join(self.path, '{}.h5'.format(name)), 'r')
    test = h5io.WESTPAH5File(os.path.join('odld/{}.h5'.format(name)), 'r')
    # Check rate evolution!  If our rates agree, we can generally assume that
    # our other values are good, too.
    orates = output['rate_evolution'][-1, :, :]['expected']
    # BUG FIX: trates was read from `output`, so the test compared the fresh
    # output against itself and could never fail; read the reference file.
    trates = test['rate_evolution'][-1, :, :]['expected']
    assert abs(orates[0, 1] - trates[0, 1]) <= EPS
    assert abs(orates[1, 0] - trates[1, 0]) <= EPS
def process_args(self, args):
    '''Open the output file (truncating) and the assignments file
    (read-only), then pick the transition-matrix sampling frequency.'''
    self.output_file = h5io.WESTPAH5File(args.output, 'w', creating_program=True)
    self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
    # A subsampled assignments file forces building the transition matrix at
    # the iteration level; otherwise honor the user's requested frequency.
    if self.assignments_file.attrs['subsampled'] == True:
        self.sampling_frequency = 'iteration'
    else:
        self.sampling_frequency = args.sampling_frequency
def process_args(self, args):
    '''Wire up progress reporting, open the assignments (read-only) and
    output (truncated) files, validate the iteration range, and record the
    compression flag.'''
    self.progress.process_args(args)
    self.assignments_file = h5io.WESTPAH5File(args.assignments, 'r')
    self.data_reader.process_args(args)
    # The iteration range needs an open data reader to resolve defaults.
    with self.data_reader:
        self.iter_range.process_args(args)
    self.output_file = h5io.WESTPAH5File(args.output, 'w', creating_program=True)
    h5io.stamp_creator_data(self.output_file)
    if not self.iter_range.check_data_iter_range_least(self.assignments_file):
        raise ValueError('assignments do not span the requested iterations')
    self.do_compression = args.compression
def open_files(self):
    '''Create the output file and open every per-trial west.h5 input.'''
    # NOTE(review): self.output_file appears to hold the output *filename*
    # on entry and is rebound to the opened file object here — confirm the
    # attribute is set to a path upstream.
    self.output_file = h5io.WESTPAH5File(self.output_file, 'w', creating_program=True)
    h5io.stamp_creator_data(self.output_file)
    opened_files = self.generate_file_list([self.west])
    self.westH5 = opened_files[self.west]
def process_args(self, args):
    '''Stash command-line options and open the kinetics file read-only,
    deriving the stop iteration from the file when not given explicitly.'''
    self.progress.process_args(args)
    self.kinetics_filename = args.kinetics
    self.istate = args.istate
    self.fstate = args.fstate
    self.kinetics_file = h5io.WESTPAH5File(self.kinetics_filename, 'r')
    self.iter_start = args.iter_start
    # Default iter_stop to the last iteration recorded in the kinetics file;
    # an explicit --iter-stop is made inclusive by adding one.
    if args.iter_stop is None:
        self.iter_stop = self.kinetics_file.attrs['iter_stop']
    else:
        self.iter_stop = args.iter_stop + 1
    self.binspec = args.bins
    self.output_filename = args.output
    self.ignore_out_of_range = bool(args.ignore_out_of_range)
    self.compress_output = args.compress or False
def generate_file_list(self, key_list):
    '''A convenience function which takes in a list of keys that are
    filenames, and returns a dictionary which contains all the individual
    files loaded inside of a dictionary keyed to the filename.'''
    if self.ntrials == 0:
        raise self.NoSimulationsException('You must specify the number of simulations.')
    return_dict = {key: {} for key in key_list}
    # Trial directories are named 01, 02, ... (zero-padded to two digits).
    # Need to not make this hard coded, but who cares for now.
    for trial in range(1, self.ntrials + 1):
        for key in key_list:
            return_dict[key][trial] = h5io.WESTPAH5File(
                os.path.join(self.master, str(trial).zfill(2), key), 'r')
    return return_dict
def _find_matching_segments(west_datafile_name, n_iter, predicate, invert=False):
    '''Find all segments in iteration ``n_iter`` that match (or do not
    match, if ``invert`` is true) the given ``predicate``.  Returns a
    sequence of matching seg_ids.'''
    with h5io.WESTPAH5File(west_datafile_name, 'r') as west_datafile:
        iter_group = west_datafile.get_iter_group(n_iter)
        n_segments = iter_group['seg_index'].shape[0]
        selected = {int(seg_id) for seg_id in predicate(n_iter, iter_group)}
        if invert:
            # Complement of the predicate's selection over all segments.
            selected = set(range(n_segments)) - selected
        matchvec = sorted(
            numpy.fromiter(selected, dtype=seg_id_dtype, count=len(selected)))
        return n_iter, matchvec
def go(self):
    '''Locate segments matching the predicate across the iteration range,
    writing seg_ids and weights per iteration to the output HDF5 file, and
    optionally trace ancestors of the matches back through earlier
    iterations.'''
    self.data_reader.open('r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator
    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start
    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))
    # Widest second dimension seen so far; datasets start at width 0 and are
    # grown (resize) as iterations with more matches arrive.
    current_seg_count = 0
    seg_count_ds = output_file.create_dataset('n_segs', dtype=numpy.uint, shape=(iter_count,))
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, 0),
        maxshape=(iter_count, None),
        dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, 1000000), seg_id_dtype),
        shuffle=True,
        compression=9)
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, 0),
        maxshape=(iter_count, None),
        dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, 1000000), weight_dtype),
        shuffle=True,
        compression=9)
    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        # futures = set()
        # for n_iter in xrange(iter_start,iter_stop):
        #     futures.add(self.work_manager.submit(_find_matching_segments,
        #                 args=(self.data_reader.we_h5filename,n_iter,self.predicate,self.invert)))
        # for future in self.work_manager.as_completed(futures):
        # Fan the per-iteration searches out through the work manager;
        # results may arrive in any order, so each is indexed by its n_iter.
        for future in self.work_manager.submit_as_completed(
                ((_find_matching_segments,
                  (self.data_reader.we_h5filename, n_iter, self.predicate, self.invert),
                  {}) for n_iter in range(iter_start, iter_stop)),
                self.max_queue_len):
            n_iter, matching_ids = future.get_result()
            n_matches = len(matching_ids)
            if n_matches:
                if n_matches > current_seg_count:
                    # Grow the column dimension to fit this iteration's matches.
                    current_seg_count = len(matching_ids)
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches
                seg_count_ds[n_iter - iter_start] = n_matches
                matching_segs_ds[n_iter - iter_start, :n_matches] = matching_ids
                # Weights are looked up in sorted seg_id order from seg_index.
                weights_ds[n_iter - iter_start, :n_matches] = \
                    self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][sorted(matching_ids)]
            del matching_ids
            pi.progress += 1
        if self.include_ancestors:
            pi.new_operation('Tracing ancestors of matching segments', extent=iter_count)
            # Walk backwards through the iterations, carrying the parent ids
            # of the previous (later) iteration's matches into this one.
            from_previous = set()
            current_seg_count = matching_segs_ds.shape[1]
            for n_iter in range(iter_stop - 1, iter_start - 1, -1):
                iiter = n_iter - iter_start
                n_matches = seg_count_ds[iiter]
                matching_ids = set(from_previous)
                if n_matches:
                    matching_ids.update(matching_segs_ds[iiter, :seg_count_ds[iiter]])
                from_previous.clear()
                n_matches = len(matching_ids)
                if n_matches > current_seg_count:
                    matching_segs_ds.resize((iter_count, n_matches))
                    weights_ds.resize((iter_count, n_matches))
                    current_seg_count = n_matches
                if n_matches > 0:
                    seg_count_ds[iiter] = n_matches
                    matching_ids = sorted(matching_ids)
                    matching_segs_ds[iiter, :n_matches] = matching_ids
                    weights_ds[iiter, :n_matches] = \
                        self.data_reader.get_iter_group(n_iter)['seg_index']['weight'][sorted(matching_ids)]
                    parent_ids = self.data_reader.get_iter_group(n_iter)['seg_index']['parent_id'][sorted(matching_ids)]
                    # Negative parent ids denote initial states, not segments,
                    # so they are dropped from the ancestry walk.
                    from_previous.update(parent_id for parent_id in parent_ids if parent_id >= 0)  # filter initial states
                    del parent_ids
                del matching_ids
                pi.progress += 1
def analysis_structure(self):
    '''
    Run automatically on startup.  Parses through the configuration file,
    and loads up all the data files from the different analysis schematics.
    If they don't exist, it creates them automatically by hooking in to
    existing analysis routines and going from there.

    It does this by calling in the make_parser_and_process function for
    w_{assign,reweight,direct} using a custom built list of args.  The user
    can specify everything in the configuration file that would have been
    specified on the command line.

    For instance, were one to call w_direct as follows:

        w_direct --evolution cumulative --step-iter 1 --disable-correl

    the west.cfg would look as follows:

    west:
      analysis:
        w_direct:
          evolution: cumulative
          step_iter: 1
          extra: ['disable-correl']

    Alternatively, if one wishes to use the same options for both w_direct
    and w_reweight, the key 'w_direct' can be replaced with 'kinetics'.
    '''
    # Make sure everything exists.
    try:
        os.mkdir(self.__settings['directory'])
    except:
        pass
    # Now, check to see whether they exist, and then load them.
    self.__analysis_schemes__ = {}
    # We really need to implement some sort of default behavior if an analysis scheme isn't set.
    # Right now, we just crash.  That isn't really graceful.
    for scheme in self.__settings['analysis_schemes']:
        if self.__settings['analysis_schemes'][scheme]['enabled']:
            if self.work_manager.running == False:
                self.work_manager.startup()
            path = os.path.join(os.getcwd(), self.__settings['directory'], scheme)
            # if 'postanalysis' in self.__settings['analysis_schemes'][scheme] and 'postanalysis' in self.__settings['postanalysis']:
            # Should clean this up.  But it uses the default global setting if a by-scheme one isn't set.
            if 'postanalysis' in self.__settings:
                if 'postanalysis' in self.__settings['analysis_schemes'][scheme]:
                    pass
                else:
                    self.__settings['analysis_schemes'][scheme]['postanalysis'] = self.__settings['postanalysis']
            try:
                os.mkdir(path)
            except:
                pass
            self.__analysis_schemes__[scheme] = {}
            # Decide which analysis files this scheme needs: reweighting output
            # is only produced when postanalysis is enabled (per-scheme or global).
            try:
                if self.__settings['analysis_schemes'][scheme]['postanalysis'] == True or self.__settings['postanalysis'] == True:
                    analysis_files = ['assign', 'direct', 'reweight']
                else:
                    analysis_files = ['assign', 'direct']
            except:
                analysis_files = ['assign', 'direct']
                self.__settings['analysis_schemes'][scheme]['postanalysis'] = False
            reanalyze_kinetics = False
            assign_hash = None
            for name in analysis_files:
                arg_hash = None
                if self.reanalyze == True:
                    # Forced reanalysis: discard any existing output file.
                    reanalyze_kinetics = True
                    try:
                        os.remove(os.path.join(path, '{}.h5'.format(name)))
                    except:
                        pass
                else:
                    try:
                        # Try to load the hash.  If we fail to load the hash or the file, we need to reload.
                        # if self.reanalyze == True:
                        #     raise ValueError('Reanalyze set to true.')
                        self.__analysis_schemes__[scheme][name] = h5io.WESTPAH5File(os.path.join(path, '{}.h5'.format(name)), 'r')
                        arg_hash = self.__analysis_schemes__[scheme][name].attrs['arg_hash']
                        if name == 'assign':
                            assign_hash = arg_hash
                    except:
                        pass
                        # We shouldn't rely on this.
                        # self.reanalyze = True
                if True:
                    if name == 'assign':
                        assign = w_assign.WAssign()
                        w_assign_config = {'output': os.path.join(path, '{}.h5'.format(name))}
                        # Layer config: global w_assign options, then per-scheme
                        # w_assign options override them.
                        try:
                            w_assign_config.update(self.__settings['w_assign'])
                        except:
                            pass
                        try:
                            w_assign_config.update(self.__settings['analysis_schemes'][scheme]['w_assign'])
                        except:
                            pass
                        args = []
                        for key, value in w_assign_config.items():
                            if key != 'extra':
                                args.append(str('--') + str(key).replace('_', '-'))
                                args.append(str(value))
                        # This is for stuff like disabling correlation analysis, etc.
                        if 'extra' in list(w_assign_config.keys()):
                            # We're sorting to ensure that the order doesn't matter.
                            for value in sorted(w_assign_config['extra']):
                                args.append(str('--') + str(value).replace('_', '-'))
                        # We're just calling the built in function.
                        # This is a lot cleaner than what we had in before, and far more workable.
                        args.append('--config-from-file')
                        args.append('--scheme-name')
                        args.append('{}'.format(scheme))
                        # Why are we calling this if we're not sure we're remaking the file?
                        # We need to load up the bin mapper and states and see if they're the same.
                        assign.make_parser_and_process(args=args)
                        import pickle
                        # new_hash = self.hash_args(args=args, path=path, extra=[self.niters, pickle.dumps(assign.binning.mapper), assign.states])
                        # We need to encode it properly to ensure that some OS specific thing doesn't kill us.  Same goes for the args, ultimately.
                        # Mostly, we just need to ensure that we're consistent.
                        new_hash = self.hash_args(
                            args=args,
                            path=path,
                            extra=[int(self.niters),
                                   codecs.encode(pickle.dumps(assign.binning.mapper), "base64"),
                                   base64.b64encode(str(assign.states).encode())])
                        # Let's check the hash.  If the hash is the same, we don't need to reload.
                        if self.debug_mode == True:
                            print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                        if self.ignore_hash == False and (arg_hash != new_hash or self.reanalyze == True):
                            # If the hashes are different, or we need to reanalyze, delete the file.
                            try:
                                os.remove(os.path.join(path, '{}.h5'.format(name)))
                            except:
                                pass
                            print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                            # reanalyze_kinetics = True
                            # We want to use the work manager we have here.  Otherwise, just let the tool sort out what it needs, honestly.
                            assign.work_manager = self.work_manager
                            assign.go()
                            assign.data_reader.close()
                            # Stamp w/ hash, then reload as read only.
                            self.__analysis_schemes__[scheme][name] = self.stamp_hash(os.path.join(path, '{}.h5'.format(name)), new_hash)
                        del(assign)
                        # Update the assignment hash.
                        assign_hash = new_hash
                    # Since these are all contained within one tool, now, we want it to just... load everything.
                    if name == 'direct' or name == 'reweight':
                        assignment_file = self.__analysis_schemes__[scheme]['assign']
                        if name == 'direct':
                            analysis = w_direct.WDirect()
                        if name == 'reweight':
                            analysis = w_reweight.WReweight()
                        analysis_config = {
                            'assignments': os.path.join(path, '{}.h5'.format('assign')),
                            'output': os.path.join(path, '{}.h5'.format(name)),
                            'kinetics': os.path.join(path, '{}.h5'.format(name))}
                        # Pull from general analysis options, then general SPECIFIC options for each analysis,
                        # then general options for that analysis scheme, then specific options for the analysis type in the scheme.
                        try:
                            analysis_config.update(self.__settings['kinetics'])
                        except:
                            pass
                        try:
                            analysis_config.update(self.__settings['w_{}'.format(name)])
                        except:
                            pass
                        try:
                            analysis_config.update(self.__settings['analysis_schemes'][scheme]['kinetics'])
                        except:
                            pass
                        try:
                            analysis_config.update(self.__settings['analysis_schemes'][scheme]['w_{}'.format(name)])
                        except:
                            pass
                        # We're pulling in a default set of arguments, then updating them with arguments from the west.cfg file, if appropriate, after setting the appropriate command
                        # Then, we call the magic function 'make_parser_and_process' with the arguments we've pulled in.
                        # The tool has no real idea it's being called outside of its actual function, and we're good to go.
                        args = ['all']
                        for key, value in analysis_config.items():
                            if key != 'extra':
                                args.append(str('--') + str(key).replace('_', '-'))
                                args.append(str(value))
                        # This is for stuff like disabling correlation analysis, etc.
                        if 'extra' in list(analysis_config.keys()):
                            for value in sorted(analysis_config['extra']):
                                args.append(str('--') + str(value).replace('_', '-'))
                        # We want to not display the averages, so...
                        args.append('--disable-averages')
                        # The kinetics hash also folds in the assignment hash so
                        # a rebuilt assignment file forces a kinetics rebuild.
                        new_hash = self.hash_args(args=args, path=path, extra=[int(self.niters), assign_hash])
                        # if arg_hash != new_hash or self.reanalyze == True or reanalyze_kinetics == True:
                        if self.debug_mode == True:
                            print('{:<10}: old hash, new hash -- {}, {}'.format(name, arg_hash, new_hash))
                        if self.ignore_hash == False and (arg_hash != new_hash or reanalyze_kinetics == True):
                            try:
                                os.remove(os.path.join(path, '{}.h5'.format(name)))
                            except:
                                pass
                            print('Reanalyzing file {}.h5 for scheme {}.'.format(name, scheme))
                            analysis.make_parser_and_process(args=args)
                            # We want to hook into the existing work manager.
                            analysis.work_manager = self.work_manager
                            analysis.go()
                            # Open!
                            self.__analysis_schemes__[scheme][name] = self.stamp_hash(os.path.join(path, '{}.h5'.format(name)), new_hash)
                        del(analysis)
    # Make sure this doesn't get too far out, here.  We need to keep it alive as long as we're actually analyzing things.
    # self.work_manager.shutdown()
    print("")
    print("Complete!")
def go(self):
    '''For each iteration and bin, select up to ``count`` segments (lowest
    weight, highest weight, or random, per ``self.what``) and write their
    seg_ids and weights to the output HDF5 file.'''
    self.data_reader.open('r')
    assignments_file = h5py.File(self.assignments_filename, mode='r')
    output_file = h5io.WESTPAH5File(self.output_filename, mode='w')
    pi = self.progress.indicator
    count = self.count
    timepoint = self.timepoint
    # +1 accounts for the catch-all "unknown" bin appended by assignment.
    # NOTE(review): assumed from the +1 convention — confirm against w_assign.
    nbins = assignments_file.attrs['nbins'] + 1
    assignments_ds = assignments_file['assignments']
    iter_start, iter_stop = self.iter_range.iter_start, self.iter_range.iter_stop
    iter_count = iter_stop - iter_start
    h5io.check_iter_range_least(assignments_ds, iter_start, iter_stop)
    nsegs = assignments_file['nsegs'][h5io.get_iteration_slice(assignments_file['nsegs'], iter_start, iter_stop)]
    output_file.create_dataset('n_iter', dtype=n_iter_dtype, data=list(range(iter_start, iter_stop)))
    seg_count_ds = output_file.create_dataset('nsegs', dtype=numpy.uint, shape=(iter_count, nbins))
    matching_segs_ds = output_file.create_dataset(
        'seg_ids',
        shape=(iter_count, nbins, count),
        dtype=seg_id_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), seg_id_dtype),
        shuffle=True,
        compression=9)
    weights_ds = output_file.create_dataset(
        'weights',
        shape=(iter_count, nbins, count),
        dtype=weight_dtype,
        chunks=h5io.calc_chunksize((iter_count, nbins, count), weight_dtype),
        shuffle=True,
        compression=9)
    what = self.what
    with pi:
        pi.new_operation('Finding matching segments', extent=iter_count)
        for iiter, n_iter in enumerate(range(iter_start, iter_stop)):
            # Bin assignment of every segment at the chosen timepoint.
            assignments = numpy.require(
                assignments_ds[h5io.get_iteration_entry(assignments_ds, n_iter) + numpy.index_exp[:, timepoint]],
                dtype=westpa.binning.index_dtype)
            all_weights = self.data_reader.get_iter_group(n_iter)['seg_index']['weight']

            # the following Cython function just executes this loop:
            # for iseg in xrange(nsegs[iiter]):
            #     segs_by_bin[iseg,assignments[iseg]] = True
            segs_by_bin = assignments_list_to_table(nsegs[iiter], nbins, assignments)
            for ibin in range(nbins):
                segs = numpy.nonzero(segs_by_bin[:, ibin])[0]
                seg_count_ds[iiter, ibin] = min(len(segs), count)
                if len(segs):
                    weights = all_weights.take(segs)
                    # Selection strategy: argsort ascending for lowweight,
                    # descending for highweight, or a random permutation.
                    if what == 'lowweight':
                        indices = numpy.argsort(weights)[:count]
                    elif what == 'highweight':
                        indices = numpy.argsort(weights)[::-1][:count]
                    else:
                        assert what == 'random'
                        indices = numpy.random.permutation(len(weights))
                    matching_segs_ds[iiter, ibin, :len(segs)] = segs.take(indices)
                    weights_ds[iiter, ibin, :len(segs)] = weights.take(indices)
                    del segs, weights
            del assignments, segs_by_bin, all_weights
            pi.progress += 1
from westpa import rc, h5io

# Keep the entire west.h5 file in RAM (HDF5 'core' driver, no write-back to
# disk) so testing never modifies the on-disk copy.
west_file_name = 'west.h5'
west_file = h5io.WESTPAH5File(west_file_name, driver='core', backing_store=False)

data_manager = rc.get_data_manager()
data_manager.we_h5file = west_file
data_manager.we_h5file_version = int(west_file['/'].attrs['west_file_format_version'])