Example #1
0
    def write_trajectory(self, clone_dir, output_dir, trajectory_number, stride,
                         max_rmsd, min_gens, center_conformations, memory_check,
                         omp_parallel_rmsd=True):
        """
        This function takes in a path to a CLONE and merges all the XTC files
        it finds into a H5 trajectory:

        Parameters
        ----------
        clone_dir : str
            the directory in which the xtc files are found. All of the xtc files
            in this directory are joined together to make a single trajectory
            (.h5) output file

        output_dir : str
            directory where the outputted files will be placed

        trajectory_number : int
            A unique number for this trajectory. This number is used in
            constructing the filename to write the outputted .h5 trajectory to,
            and thus must be unique

        stride : int
            Subsample by only considering every Nth snapshot.

        max_rmsd : {int, None}
            if this value is not None, calculate the RMSD to the pdb_file from
            each snapshot and reject trajectories which have snapshots with RMSD
            greater than max_rmsd. If None, no check is performed

        min_gens : int
            Discard trajectories that contain `min_gens` or fewer XTC files.

        center_conformations : bool
            center conformations before saving.

        memory_check : bool
            if yes, uses the memory dictionary to do an update rather than a
            complete re-convert.

        omp_parallel_rmsd : bool
            If true, use OpenMP accelerated RMSD calculation for max_rmsd check
        """

        xtc_files = self.list_xtcs_in_dir(clone_dir)

        # Ensure that we're only joining contiguously numbered xtc files -- starting
        # at 0 -- into a trajectory. If there are gaps in the numbering, keep only
        # the leading contiguous run.
        for i, filename in enumerate(xtc_files):
            if self.integer_component(filename) != i:
                logger.error("Found discontinuity in xtc numbering - check data in %s", clone_dir)
                xtc_files = xtc_files[0:i]
                break

        # Check the memory object to see which xtc files have already been
        # converted, and exclude those from this conversion.
        previous_convert_exists = False
        if memory_check and clone_dir in self.memory:
            previous_convert_exists = True
            num_xtcs_converted = self.memory[clone_dir][1]
            if len(xtc_files) == num_xtcs_converted:
                # everything was already converted -- nothing to do
                logger.info("Already converted all files in %s, skipping...", clone_dir)
                return
            xtc_files = xtc_files[num_xtcs_converted:]

        xtc_file_paths = [os.path.join(clone_dir, f) for f in xtc_files]

        logger.info("Processing %d xtc files in clone_dir = %s", len(xtc_files), clone_dir)

        if len(xtc_files) <= min_gens:
            logger.info("Skipping trajectory in clone_dir = %s", clone_dir)
            logger.info("Too few xtc files (generations).")
            return

        try:
            # [this should check for and discard overlapping snapshots]
            trajectory = Trajectory.load_from_xtc(xtc_file_paths, PDBFilename=self.pdb_topology,
                                                  discard_overlapping_frames=True)
        except IOError as e:
            logger.error("IOError (%s) when processing trajectory in clone_dir = %s", e, clone_dir)
            logger.error("Attempting rescue by disregarding final frame, which is often")
            logger.error("the first/only frame to be corrupted")

            if len(xtc_file_paths) == 1:
                logger.error("Didn't find any other frames in %s, continuing...", clone_dir)
                return

            try:
                # NOTE(review): unlike the first attempt, this retry does not pass
                # discard_overlapping_frames=True -- confirm overlaps cannot occur here
                trajectory = Trajectory.load_from_xtc(xtc_file_paths[0:-1], PDBFilename=self.pdb_topology)
            except IOError:
                logger.error("Unfortunately, the error remained even after ignoring the final frame.")
                logger.error("Skipping the trajectory in clone_dir = %s", clone_dir)
                return
            else:
                logger.error("Sucessfully recovered from IOError by disregarding final frame.")

        # Optionally reject the whole trajectory if any snapshot deviates too far
        # in RMSD from the reference topology.
        if max_rmsd is not None:
            atomindices = [int(i) - 1 for i in trajectory['AtomID']]
            rmsdmetric = RMSD(atomindices, omp_parallel=omp_parallel_rmsd)
            ppdb = rmsdmetric.prepare_trajectory(Trajectory.load_trajectory_file(self.pdb_topology))
            ptraj = rmsdmetric.prepare_trajectory(trajectory)
            rmsds = rmsdmetric.one_to_all(ppdb, ptraj, 0)

            if max(rmsds) > max_rmsd:
                logger.warning("Snapshot %d RMSD %f > the %f cutoff", argmax(rmsds), max(rmsds), max_rmsd)
                logger.warning("Dropping trajectory")
                return

        if center_conformations:
            RMSD.TheoData.centerConformations(trajectory["XYZList"])

        # if we are adding to a previous trajectory, we have to load that traj up and extend it
        if previous_convert_exists:
            output_filename = self.memory[clone_dir][0]
            output_file_path = output_filename
            logger.info("Extending: %s", output_filename)
            assert os.path.exists(output_filename)

            # load the traj and extend it [this should check for and discard overlapping snapshots]
            Trajectory.append_frames_to_file(output_filename,
                                             trajectory['XYZList'][::stride],
                                             discard_overlapping_frames=True)

            # NOTE(review): if the rescue path above dropped a corrupted final xtc,
            # it is still counted as processed here and will never be retried
            num_xtcs_processed = len(xtc_file_paths) + self.memory[clone_dir][1]

        # if we are not adding to a traj, then we create a new one
        else:
            output_filename = 'trj%s.h5' % trajectory_number
            output_file_path = os.path.join(output_dir, output_filename)

            if os.path.exists(output_file_path):
                logger.info("The file name %s already exists. Skipping it.", output_file_path)
                return

            # stride and subsample the snapshots before saving
            trajectory['XYZList'] = trajectory['XYZList'][::stride]
            trajectory.save(output_file_path)

            num_xtcs_processed = len(xtc_file_paths)

        # log what we did into the memory object
        self.memory[clone_dir] = [output_file_path, num_xtcs_processed]

        return
Example #2
0
    def write_trajectory(self,
                         clone_dir,
                         output_dir,
                         trajectory_number,
                         stride,
                         max_rmsd,
                         min_gens,
                         center_conformations,
                         memory_check,
                         omp_parallel_rmsd=True):
        """
        This function takes in a path to a CLONE and merges all the XTC files
        it finds into a LH5 trajectory:

        Parameters
        ----------
        clone_dir : str
            the directory in which the xtc files are found. All of the xtc files
            in this directory are joined together to make a single trajectory
            (.lh5) output file

        output_dir : str
            directory where the outputted files will be placed

        trajectory_number : int
            A unique number for this trajectory. This number is used in
            constructing the filename to write the outputted .lh5 trajectory to,
            and thus must be unique

        stride : int
            Subsample by only considering every Nth snapshot.

        max_rmsd : {int, None}
            if this value is not None, calculate the RMSD to the pdb_file from
            each snapshot and reject trajectories which have snapshots with RMSD
            greater than max_rmsd. If None, no check is performed

        min_gens : int
            Discard trajectories that contain `min_gens` or fewer XTC files.

        center_conformations : bool
            center conformations before saving.

        memory_check : bool
            if yes, uses the memory dictionary to do an update rather than a
            complete re-convert.

        omp_parallel_rmsd : bool
            If true, use OpenMP accelerated RMSD calculation for max_rmsd check
        """

        xtc_files = self.list_xtcs_in_dir(clone_dir)

        # Ensure that we're only joining contiguously numbered xtc files -- starting at 0 --
        # into a trajectory. If there are gaps in the xtc files in the directory, we only
        # want to use the ones that are contiguously numbered
        i = 0
        for i, filename in enumerate(xtc_files):
            if self.integer_component(filename) != i:
                logger.error(
                    "Found discontinuity in xtc numbering - check data in %s",
                    clone_dir)
                xtc_files = xtc_files[0:i]
                break

        # check the memory object to see which xtc files have already been converted, and
        # exclude those from this conversion
        if memory_check:
            if clone_dir in self.memory.keys():
                previous_convert_exists = True
                num_xtcs_converted = self.memory[clone_dir][1]
                if len(
                        xtc_files
                ) == num_xtcs_converted:  # if we have converted everything,
                    logger.info(
                        "Already converted all files in %s, skipping...",
                        clone_dir)
                    return  # just bail out
                else:
                    xtc_files = xtc_files[num_xtcs_converted:]
            else:
                previous_convert_exists = False
        else:
            previous_convert_exists = False

        xtc_file_paths = [os.path.join(clone_dir, f) for f in xtc_files]

        logger.info("Processing %d xtc files in clone_dir = %s",
                    len(xtc_files), clone_dir)

        if len(xtc_files) <= min_gens:
            logger.info("Skipping trajectory in clone_dir = %s", clone_dir)
            logger.info("Too few xtc files (generations).")
            return

        try:
            # [this should check for and discard overlapping snapshots]
            trajectory = Trajectory.load_from_xtc(
                xtc_file_paths,
                PDBFilename=self.pdb_topology,
                discard_overlapping_frames=True)
        except IOError as e:
            logger.error(
                "IOError (%s) when processing trajectory in clone_dir = %s", e,
                clone_dir)
            logger.error(
                "Attempting rescue by disregarding final frame, which is often"
            )
            logger.error("the first/only frame to be corrupted")

            if len(xtc_file_paths) == 1:
                logger.error(
                    "Didn't find any other frames in %s, continuing...",
                    clone_dir)
                return

            try:
                # NOTE(review): unlike the first attempt, this retry does not pass
                # discard_overlapping_frames=True -- confirm overlaps cannot occur here
                trajectory = Trajectory.load_from_xtc(
                    xtc_file_paths[0:-1], PDBFilename=self.pdb_topology)
            except IOError:
                logger.error(
                    "Unfortunately, the error remained even after ignoring the final frame."
                )
                logger.error("Skipping the trajectory in clone_dir = %s",
                             clone_dir)
                return
            else:
                logger.error(
                    "Sucessfully recovered from IOError by disregarding final frame."
                )

        # optionally reject the whole trajectory if any snapshot's RMSD to the
        # reference topology exceeds the cutoff
        if max_rmsd is not None:
            atomindices = [int(i) - 1 for i in trajectory['AtomID']]
            rmsdmetric = RMSD(atomindices, omp_parallel=omp_parallel_rmsd)
            ppdb = rmsdmetric.prepare_trajectory(
                Trajectory.load_trajectory_file(self.pdb_topology))
            ptraj = rmsdmetric.prepare_trajectory(trajectory)
            rmsds = rmsdmetric.one_to_all(ppdb, ptraj, 0)

            if max(rmsds) > max_rmsd:
                logger.warning("Snapshot %d RMSD %f > the %f cutoff",
                               argmax(rmsds), max(rmsds), max_rmsd)
                logger.warning("Dropping trajectory")
                return

        if center_conformations:
            RMSD.TheoData.centerConformations(trajectory["XYZList"])

        # if we are adding to a previous trajectory, we have to load that traj up and extend it
        if previous_convert_exists:
            output_filename = self.memory[clone_dir][0]
            output_file_path = output_filename
            logger.info("Extending: %s", output_filename)
            assert os.path.exists(output_filename)

            # load the traj and extend it [this should check for and discard overlapping snapshots]
            Trajectory.append_frames_to_file(output_filename,
                                             trajectory['XYZList'][::stride],
                                             discard_overlapping_frames=True)

            # NOTE(review): if the rescue path above dropped a corrupted final xtc,
            # it is still counted as processed here and will never be retried
            num_xtcs_processed = len(
                xtc_file_paths) + self.memory[clone_dir][1]

        # if we are not adding to a traj, then we create a new one
        else:
            output_filename = 'trj%s.lh5' % trajectory_number
            output_file_path = os.path.join(output_dir, output_filename)

            if os.path.exists(output_file_path):
                logger.info("The file name %s already exists. Skipping it.",
                            output_file_path)
                return

            # stride and subsample the snapshots before saving
            trajectory['XYZList'] = trajectory['XYZList'][::stride]
            trajectory.save(output_file_path)

            num_xtcs_processed = len(xtc_file_paths)

        # log what we did into the memory object
        self.memory[clone_dir] = [output_file_path, num_xtcs_processed]

        return