def load_memory_state(self, projectinfo_file):
    """ Loads the 'memory state' from a serialized file on disk.

    Parameters
    ----------
    projectinfo_file : str
        The file on disk from which to read.

    Notes
    -----
    When reading the memory state, we have to decode base16, and also
    remove leading 'a' characters.

    See Also
    --------
    save_memory_state
    """

    logger.info("Loading memory state from: %s", projectinfo_file)

    project_info = Project.load_from_hdf(projectinfo_file)
    self.memory = cPickle.loads(project_info["Memory"])

    return
def save_memory_state(self):
    """ Saves the 'memory state' to disk in a serialized format.

    Notes
    -----
    When saving, we encode the keys into base16 with a leading 'a',
    because the HDF5 serializer doesn't like '/' characters and is
    super picky in general.

    See Also
    --------
    load_memory_state
    """

    project_info = Project.load_from_hdf(self.projectinfo_file)
    project_info["Memory"] = cPickle.dumps(self.memory)
    project_info.save_to_hdf(self.projectinfo_file, do_file_check=False)

    return
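
# The Notes in load_memory_state/save_memory_state describe a key-encoding
# scheme, but the helpers themselves are not shown in this excerpt. Below is
# a minimal sketch of what that scheme could look like; the function names
# are hypothetical (not part of this class), a single prepended 'a' sentinel
# is assumed, and the code is Python 2 flavored to match the cPickle usage
# above.

import binascii

def _encode_memory_key(key):
    """Base16 (hex) encode `key` and prepend 'a', so the stored key
    contains no '/' characters, which the HDF5 serializer rejects."""
    return 'a' + binascii.hexlify(key)

def _decode_memory_key(encoded_key):
    """Reverse of _encode_memory_key: drop the leading 'a' sentinel,
    then base16-decode the remainder."""
    return binascii.unhexlify(encoded_key[1:])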
def write_all_trajectories(self, input_dir, output_dir, stride, max_rmsd,
                           min_gens, center_conformations, num_proc,
                           input_style, update=False):
    """ Convert all of the trajectories in the FAH project in `input_dir`
    to lh5 trajectory files, which will be placed in `output_dir`.

    If the 'update' flag is set, the memory object is used to check for
    previously converted data, which is then added to (rather than
    reconverting everything). This functionality can be more cleanly
    called through the update_trajectories() method.

    Parameters
    ----------
    input_dir : str
        The directory to look for XTC/DCD files in.

    output_dir : str
        The place to write the converted lh5s.

    stride : int
        The size of the stride to employ. E.g., if stride = 3, the script
        keeps every 3rd MD snapshot from the original data. Useful to
        throw away highly correlated data if snapshots were saved
        frequently.

    max_rmsd : float
        Throw away any data that is further than `max_rmsd` (in nm) from
        the pdb file associated with the project. This is used as a
        sanity check to prevent including, e.g., data from a simulation
        that is blowing up.

    min_gens : int
        Discard trajectories with fewer than `min_gens` generations.

    center_conformations : bool
        Whether to center the converted (lh5) conformations.

    num_proc : int
        Number of processors to employ. Note that this function is
        typically I/O limited, so parallelism is unlikely to yield much
        gain.

    input_style : {'FAH', 'FILE'}
        If you use input_style = 'FAH', this code uses knowledge of the
        RUN*/CLONE* directory structure to yield all the CLONE
        directories. If you use input_style = 'FILE', this code uses
        os.walk(), which is A LOT slower because it has to stat every
        file, but is capable of recursively searching for xtc files to
        arbitrary depths.

    update : bool
        If `True`, tries to figure out what data has already been
        converted by reading the "memory state" in the provided
        ProjectInfo file, and only converts new data. If `False`, does a
        fresh re-convert.

    Notes
    -----
    Since a conversion sometimes fails, we collect all trajectories at
    the end and renumber them such that they are contiguously numbered.
    """

    if update:
        assert os.path.exists(output_dir)
    else:
        try:
            os.mkdir(output_dir)
        except OSError:
            logger.error('Error: The directory %s already exists', output_dir)
            sys.exit(1)

    intermediate_filename_root = '_trj'  # a placeholder name

    # dtm does not play nicely with OpenMP
    use_parallel_rmsd = (num_proc != 'use_dtm_instead')

    jobs = []
    for i, clone_dir in enumerate(self.yield_xtc_directories(input_dir, input_style)):

        job = {'clone_dir': clone_dir,
               'output_dir': output_dir,
               'pdb_file': self.pdb_topology,
               'trajectory_number': i,
               'stride': stride,
               'max_rmsd': max_rmsd,
               'min_gens': min_gens,
               'center_conformations': center_conformations,
               'memory_check': update,
               'omp_parallel_rmsd': use_parallel_rmsd}

        jobs.append(job)

    if len(jobs) == 0:
        raise RuntimeError('No conversion jobs found!')

    if num_proc == 'use_dtm_instead':
        # use DTM mpi parallel map
        dtm.map(self.write_trajectory_mapper, jobs)
    elif num_proc > 1:
        # use multiprocessing
        pool = Pool(processes=num_proc)
        pool.map(self.write_trajectory_mapper, jobs)
    else:
        # use regular serial execution
        map(self.write_trajectory_mapper, jobs)

    # Rename trajectory files such that they have contiguous numbering
    logger.info("Finished generating trajectories. Renaming them now in contiguous order")

    mapping = {}  # document the directory changes, allowing us to update memory
    for i, filename in enumerate(sorted(os.listdir(output_dir), key=keynat)):
        path = os.path.join(output_dir, filename)
        new_path = os.path.join(output_dir, "trj%d.lh5" % i)
        os.rename(path, new_path)
        mapping[path] = new_path

    # update the memory hash to account for our renumbering
    for key in self.memory.keys():
        if key not in ['convert_parameters', 'SerializerFilename']:
            logger.info("%s --> %s", self.memory[key][0], mapping[self.memory[key][0]])
            self.memory[key][0] = mapping[self.memory[key][0]]

    # save the parameters used for this run in the memory file, and write to disk
    logger.info("Generating Project File: %s", self.projectinfo_file)

    if update:
        try:
            # if we are updating, just start with a fresh slate
            os.remove(self.projectinfo_file)
        except OSError:
            pass

    self.memory['convert_parameters'] = (input_dir, output_dir, stride, max_rmsd,
                                         min_gens, center_conformations, num_proc,
                                         self.projectinfo_file, input_style)

    Project.CreateProjectFromDir(Filename=self.projectinfo_file,
                                 TrajFilePath=output_dir,
                                 TrajFileBaseName='trj',
                                 TrajFileType='.lh5',
                                 ConfFilename=self.pdb_topology,
                                 initial_memory=cPickle.dumps(self.memory))

    logger.info("Data converted properly.")

    return
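
# write_all_trajectories() leans on two helpers whose behavior the docstring
# describes but whose bodies are not shown in this excerpt. The sketches
# below are assumptions about their shape, not the project's actual
# implementations.

import glob
import os
import re

# With input_style='FAH' the generator exploits the RUN*/CLONE* layout
# directly; with 'FILE' it falls back to os.walk(), which stats every file
# but can recurse to arbitrary depth looking for xtc data.
def yield_xtc_directories_sketch(input_dir, input_style):
    if input_style == 'FAH':
        for clone_dir in sorted(glob.glob(os.path.join(input_dir, 'RUN*', 'CLONE*'))):
            yield clone_dir
    elif input_style == 'FILE':
        for dirpath, _, filenames in os.walk(input_dir):
            if any(f.endswith('.xtc') for f in filenames):
                yield dirpath
    else:
        raise ValueError("input_style must be 'FAH' or 'FILE', got %r" % input_style)

# The renaming loop sorts with `keynat`, a natural-sort key so that
# 'trj10.lh5' sorts after 'trj9.lh5' rather than lexicographically.
# A minimal version could look like:
def keynat_sketch(string):
    # split into digit and non-digit runs, comparing the digit runs as ints
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', string)]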
to use GetRandomConfs.py""") parser.add_argument('project') parser.add_argument('assignments', default='Data/Assignments.Fixed.h5') parser.add_argument( 'conformations_per_state', default=5, type=int, help='Number of conformations to sample from each state') parser.add_argument( 'states', nargs='+', type=int, help='''Which states to sample from. Pass a list of integers, separated by whitespace. To specify ALL of the states (Although the script GetRandomConfs.py is more efficient for this purpose), pass the integer -1.''') parser.add_argument('output_dir', default='PDBs') args = parser.parse_args() if -1 in args.states: logger.info("Ripping PDBs for all states") args.states = 'all' try: assignments = io.loadh(args.assignments, 'arr_0') except KeyError: assignments = io.loadh(args.assignments, 'Data') project = Project.load_from(args.project) run(project, assignments, args.conformations_per_state, args.states, args.output_dir)