Example 1
    def write_all_trajectories(self,
                               input_dir,
                               output_dir,
                               stride,
                               max_rmsd,
                               min_gens,
                               center_conformations,
                               num_proc,
                               input_style,
                               update=False):
        """
        Convert all of the trajectories in the FAH project in input_dir to
        lh5 trajectory files which will be placed in output dir.

        If the `update` flag is set, the memory object is used to check for
        previously converted data, and new data is added to it (rather than
        reconverting everything). This functionality is more cleanly accessed
        through the update_trajectories() method.

        Parameters
        ----------
        input_dir : str
            The directory to look for XTC/DCD files in.

        output_dir : str
            The place to write the converted lh5s

        stride : int
            The size of the stride to employ. E.g., if stride = 3, the script
            keeps every 3rd MD snapshot from the original data. Useful to throw
            away highly correlated data if snapshots were saved frequently.

        max_rmsd : float
            Throw away any data that is further than `max_rmsd` (in nm) from the
            pdb file associated with the project. This is used as a sanity check
            to prevent including, e.g., data from a simulation that is blowing up.

        min_gens : int
            Discard trajectories with fewer than `min_gens` generations.

        center_conformations : bool
            Whether to center the converted (lh5) conformations.

        num_proc : int
            Number of processors to employ. Note that this function is typically
            I/O limited, so parallelism is unlikely to yield much gain.

        input_style : {'FAH', 'FILE'}
            If input_style = 'FAH', this code uses knowledge of the RUN*/CLONE*
            directory structure to yield all the CLONE directories directly. If
            input_style = 'FILE', this code uses os.walk(), which is much slower
            because it has to stat every file, but is capable of recursively
            searching for xtc files to arbitrary depths. (A sketch of both modes
            appears after this function.)

        update : bool
            If `True`, tries to figure out what data has already been converted
            by reading the "memory state" in the provided ProjectInfo file, and
            converts only the new data. If `False`, does a fresh conversion of
            everything.


        Notes
        -----
        Since individual conversions sometimes fail, we collect all trajectories
        at the end and renumber them so that the output files are contiguously
        numbered. (A sketch of the natural-sort key used for this renumbering
        appears after this function.)
        """

        if update:
            assert os.path.exists(output_dir), \
                'output_dir must already exist when updating'
        else:
            try:
                os.mkdir(output_dir)
            except OSError:
                logger.error('The directory %s already exists', output_dir)
                sys.exit(1)

        intermediate_filename_root = '_trj'  # A placeholder name

        # dtm does not play nicely with OpenMP, so disable the OpenMP-parallel
        # RMSD computation when jobs are dispatched through dtm
        use_parallel_rmsd = (num_proc != 'use_dtm_instead')

        jobs = []
        for i, clone_dir in enumerate(
                self.yield_xtc_directories(input_dir, input_style)):

            job = {
                'clone_dir': clone_dir,
                'output_dir': output_dir,
                'pdb_file': self.pdb_topology,
                'trajectory_number': i,
                'stride': stride,
                'max_rmsd': max_rmsd,
                'min_gens': min_gens,
                'center_conformations': center_conformations,
                'memory_check': update,
                'omp_parallel_rmsd': use_parallel_rmsd
            }
            jobs.append(job)

        if len(jobs) == 0:
            raise RuntimeError('No conversion jobs found!')

        if num_proc == 'use_dtm_instead':
            # use DTM mpi parallel map
            dtm.map(self.write_trajectory_mapper, jobs)
        elif num_proc > 1:
            # use a multiprocessing pool, waiting for all jobs to finish
            pool = Pool(processes=num_proc)
            pool.map(self.write_trajectory_mapper, jobs)
            pool.close()
            pool.join()
        else:
            # regular serial execution (an explicit loop; map() would be
            # lazy under Python 3)
            for job in jobs:
                self.write_trajectory_mapper(job)

        # Rename trajectory files so that they have contiguous numbering
        logger.info("Finished generating trajectories; renaming them into "
                    "contiguous order")
        # document the renames so we can update the memory hash below
        mapping = {}
        for i, filename in enumerate(sorted(os.listdir(output_dir),
                                            key=keynat)):
            path = os.path.join(output_dir, filename)
            new_path = os.path.join(output_dir, "trj%d.lh5" % i)
            os.rename(path, new_path)
            mapping[path] = new_path

        # update the memory hash to account for our renumbering
        for key in self.memory.keys():
            if key not in ['convert_parameters', 'SerializerFilename']:
                logger.info("%s --> %s", self.memory[key][0],
                            mapping[self.memory[key][0]])
                self.memory[key][0] = mapping[self.memory[key][0]]

        # save the parameters used for this run in the memory file, and write to disk
        logger.info("Generating Project File: %s", self.projectinfo_file)
        if update:
            try:
                # when updating, start with a fresh slate
                os.remove(self.projectinfo_file)
            except OSError:
                pass

        self.memory['convert_parameters'] = (input_dir, output_dir, stride,
                                             max_rmsd, min_gens,
                                             center_conformations, num_proc,
                                             self.projectinfo_file,
                                             input_style)

        Project.CreateProjectFromDir(Filename=self.projectinfo_file,
                                     TrajFilePath=output_dir,
                                     TrajFileBaseName='trj',
                                     TrajFileType='.lh5',
                                     ConfFilename=self.pdb_topology,
                                     initial_memory=cPickle.dumps(self.memory))

        logger.info("Data converted properly.")

        return
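
The docstring's two input_style modes suggest roughly the following shape for yield_xtc_directories. The real implementation is not shown in this example, so this is only a sketch, assuming a RUN*/CLONE* layout for 'FAH' mode and a full os.walk() scan for 'FILE' mode:

import glob
import os


def yield_xtc_directories(input_dir, input_style):
    # Sketch only: the actual method lives on the converter class and may
    # differ in detail.
    if input_style == 'FAH':
        # Exploit the known RUN*/CLONE* layout: glob locates the CLONE
        # directories directly, without stat-ing every file in the tree.
        for clone_dir in sorted(glob.glob(
                os.path.join(input_dir, 'RUN*', 'CLONE*'))):
            yield clone_dir
    elif input_style == 'FILE':
        # Recursive search to arbitrary depth: much slower, since os.walk()
        # has to stat every file, but it finds xtc files anywhere.
        for dirpath, _dirnames, filenames in os.walk(input_dir):
            if any(f.endswith('.xtc') for f in filenames):
                yield dirpath
    else:
        raise ValueError("input_style must be 'FAH' or 'FILE'")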
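
keynat, the sort key used in the renaming pass above, is imported from elsewhere in the project. A minimal natural-sort key with the behavior the code relies on (so that 'trj10.lh5' sorts after 'trj2.lh5') might look like this sketch:

import re


def keynat(string):
    # Split the string into digit and non-digit runs, converting digit runs
    # to ints, so filenames sort in natural (human) order rather than
    # lexicographically. The project's real keynat may differ.
    return [int(tok) if tok.isdigit() else tok
            for tok in re.split(r'(\d+)', string)]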
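
A typical call might look like the following. The enclosing class is not shown in this example, so FahConverter and its constructor arguments are hypothetical stand-ins; only the write_all_trajectories keyword arguments mirror the signature above:

# Hypothetical converter object; the class name and constructor arguments
# are assumptions, since the enclosing class is not shown here.
converter = FahConverter(pdb_topology='native.pdb',
                         projectinfo_file='ProjectInfo.h5')

# Fresh conversion: keep every 5th snapshot, discard frames more than
# 1.5 nm RMSD from the topology PDB, drop short trajectories, and run
# serially (the function is typically I/O bound anyway).
converter.write_all_trajectories(input_dir='PROJ1234',
                                 output_dir='Trajectories',
                                 stride=5,
                                 max_rmsd=1.5,
                                 min_gens=3,
                                 center_conformations=True,
                                 num_proc=1,
                                 input_style='FAH')

# Later, pick up only newly arrived data (equivalently, via the
# update_trajectories() convenience method mentioned in the docstring):
converter.write_all_trajectories(input_dir='PROJ1234',
                                 output_dir='Trajectories',
                                 stride=5,
                                 max_rmsd=1.5,
                                 min_gens=3,
                                 center_conformations=True,
                                 num_proc=1,
                                 input_style='FAH',
                                 update=True)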