コード例 #1
0
class _Manager(object):
    """
    A class to manage processing book-keeping
    """
    def __init__(self, experiments, reflections, params):
        """
        Initialise the manager.

        :param experiments: The list of experiments
        :param reflections: The list of reflections
        :param params: The phil parameters
        """

        # Initialise the callbacks
        self.executor = None

        # Save some data
        self.experiments = experiments
        self.reflections = reflections

        # Other data
        self.data = {}

        # Save some parameters
        self.params = params

        # Set the finalized flag to False
        self.finalized = False

        # Initialise the timing information
        self.time = dials.algorithms.integration.TimingInfo()

    def initialize(self):
        """
        Initialise the processing
        """
        # Get the start time
        start_time = time()

        # Ensure the reflections contain bounding boxes
        assert "bbox" in self.reflections, "Reflections have no bbox"

        if self.params.mp.nproc is libtbx.Auto:
            self.params.mp.nproc = available_cores()
            logger.info("Setting nproc={}".format(self.params.mp.nproc))

        # Compute the block size and processors
        self.compute_jobs()
        self.split_reflections()
        self.compute_processors()

        # Create the reflection manager
        self.manager = ReflectionManager(self.jobs, self.reflections)

        # Parallel reading of HDF5 from the same handle is not allowed. Python
        # multiprocessing is a bit messed up and used fork on linux so need to
        # close and reopen file.
        for exp in self.experiments:
            if exp.imageset.reader().is_single_file_reader():
                exp.imageset.reader().nullify_format_instance()

        # Set the initialization time
        self.time.initialize = time() - start_time

    def task(self, index):
        """
        Get a task.
        """
        job = self.manager.job(index)
        frames = job.frames()
        expr_id = job.expr()
        assert expr_id[1] > expr_id[0], "Invalid experiment id"
        assert expr_id[0] >= 0, "Invalid experiment id"
        assert expr_id[1] <= len(self.experiments), "Invalid experiment id"
        experiments = self.experiments  # [expr_id[0]:expr_id[1]]
        reflections = self.manager.split(index)
        if len(reflections) == 0:
            logger.warning("No reflections in job %d ***", index)
            task = NullTask(index=index, reflections=reflections)
        else:
            task = Task(
                index=index,
                job=frames,
                experiments=experiments,
                reflections=reflections,
                params=self.params,
                executor=self.executor,
            )
        return task

    def tasks(self):
        """
        Iterate through the tasks.
        """
        for i in range(len(self)):
            yield self.task(i)

    def accumulate(self, result):
        """Accumulate the results."""
        self.data[result.index] = result.data
        self.manager.accumulate(result.index, result.reflections)
        self.time.read += result.read_time
        self.time.extract += result.extract_time
        self.time.process += result.process_time
        self.time.total += result.total_time

    def finalize(self):
        """
        Finalize the processing and finish.
        """
        # Get the start time
        start_time = time()

        # Check manager is finished
        assert self.manager.finished(), "Manager is not finished"

        # Update the time and finalized flag
        self.time.finalize = time() - start_time
        self.finalized = True

    def result(self):
        """
        Return the result.

        :return: The result
        """
        assert self.finalized, "Manager is not finalized"
        return self.manager.data(), self.data

    def finished(self):
        """
        Return if all tasks have finished.

        :return: True/False all tasks have finished
        """
        return self.finalized and self.manager.finished()

    def __len__(self):
        """
        Return the number of tasks.

        :return: the number of tasks
        """
        return len(self.manager)

    def compute_jobs(self):
        """
        Sets up a JobList() object in self.jobs
        """

        if self.params.block.size == libtbx.Auto:
            if (self.params.mp.nproc * self.params.mp.njobs == 1
                    and not self.params.debug.output
                    and not self.params.block.force):
                self.params.block.size = None

        # calculate the block overlap based on the size of bboxes in the data
        # calculate once here rather than repeated in the loop below
        block_overlap = 0
        if self.params.block.size is not None:
            assert self.params.block.threshold > 0, "Threshold must be > 0"
            assert self.params.block.threshold <= 1.0, "Threshold must be < 1"
            frames_per_refl = sorted(
                [b[5] - b[4] for b in self.reflections["bbox"]])
            cutoff = int(self.params.block.threshold * len(frames_per_refl))
            block_overlap = frames_per_refl[cutoff]

        groups = itertools.groupby(
            range(len(self.experiments)),
            lambda x:
            (id(self.experiments[x].imageset), id(self.experiments[x].scan)),
        )
        self.jobs = JobList()
        for key, indices in groups:
            indices = list(indices)
            i0 = indices[0]
            i1 = indices[-1] + 1
            expr = self.experiments[i0]
            scan = expr.scan
            imgs = expr.imageset
            array_range = (0, len(imgs))
            if scan is not None:
                assert len(imgs) >= len(scan), "Invalid scan range"
                array_range = scan.get_array_range()

            if self.params.block.size is None:
                block_size_frames = array_range[1] - array_range[0]
            elif self.params.block.size == libtbx.Auto:
                # auto determine based on nframes and overlap
                nframes = array_range[1] - array_range[0]
                nblocks = self.params.mp.nproc * self.params.mp.njobs
                # want data to be split into n blocks with overlaps
                # i.e. [x, overlap, y, overlap, y, overlap, ....,y,  overlap, x]
                # blocks are x + overlap, or overlap + y + overlap.
                x = (nframes - block_overlap) / nblocks
                block_size = int(math.ceil(x + block_overlap))
                # increase the block size to be at least twice the overlap, in
                # case the overlap is large e.g. if high mosaicity.
                block_size_frames = max(block_size, 2 * block_overlap)
            elif self.params.block.units == "radians":
                _, dphi = scan.get_oscillation(deg=False)
                block_size_frames = int(
                    math.ceil(self.params.block.size / dphi))
                # if the specified block size is lower than the overlap,
                # reduce the overlap to be half of the block size.
                block_overlap = min(block_overlap, int(block_size_frames // 2))
            elif self.params.block.units == "degrees":
                _, dphi = scan.get_oscillation()
                block_size_frames = int(
                    math.ceil(self.params.block.size / dphi))
                # if the specified block size is lower than the overlap,
                # reduce the overlap to be half of the block size.
                block_overlap = min(block_overlap, int(block_size_frames // 2))
            elif self.params.block.units == "frames":
                block_size_frames = int(math.ceil(self.params.block.size))
                block_overlap = min(block_overlap, int(block_size_frames // 2))
            else:
                raise RuntimeError("Unknown block_size units %r" %
                                   self.params.block.units)
            self.jobs.add(
                (i0, i1),
                array_range,
                block_size_frames,
                block_overlap,
            )
        assert len(self.jobs) > 0, "Invalid number of jobs"

    def split_reflections(self):
        """
        Split the reflections into partials or over job boundaries
        """

        # Optionally split the reflection table into partials, otherwise,
        # split over job boundaries
        if self.params.shoebox.partials:
            num_full = len(self.reflections)
            self.reflections.split_partials()
            num_partial = len(self.reflections)
            assert num_partial >= num_full, "Invalid number of partials"
            if num_partial > num_full:
                logger.info(
                    " Split %d reflections into %d partial reflections\n" %
                    (num_full, num_partial))
        else:
            num_full = len(self.reflections)
            self.jobs.split(self.reflections)
            num_partial = len(self.reflections)
            assert num_partial >= num_full, "Invalid number of partials"
            if num_partial > num_full:
                num_split = num_partial - num_full
                logger.info(
                    " Split %d reflections overlapping job boundaries\n" %
                    num_split)

        # Compute the partiality
        self.reflections.compute_partiality(self.experiments)

    def compute_processors(self):
        """
        Compute the number of processors
        """

        # Get the maximum shoebox memory to estimate memory use for one process
        memory_required_per_process = flex.max(
            self.jobs.shoebox_memory(self.reflections,
                                     self.params.shoebox.flatten))

        # Obtain information about system memory
        available_memory = psutil.virtual_memory().available
        available_swap = psutil.swap_memory().free
        available_incl_swap = available_memory + available_swap
        available_limit = available_incl_swap * self.params.block.max_memory_usage
        available_immediate_limit = (available_memory *
                                     self.params.block.max_memory_usage)

        # Compile a memory report
        report = [
            "Memory situation report:",
        ]

        def _report(description, value):
            report.append("  %-50s:%5.1f GB" % (description, value))

        _report("Available system memory (excluding swap)",
                available_memory / 1e9)
        _report("Available swap memory", available_swap / 1e9)
        _report("Available system memory (including swap)",
                available_incl_swap / 1e9)
        _report(
            "Maximum memory for processing (including swap)",
            available_limit / 1e9,
        )
        _report(
            "Maximum memory for processing (excluding swap)",
            available_immediate_limit / 1e9,
        )
        _report("Memory required per process",
                memory_required_per_process / 1e9)

        # Check if a ulimit applies
        # Note that resource may be None on non-Linux platforms.
        # We can't use psutil as platform-independent solution in this instance due to
        # https://github.com/conda-forge/psutil-feedstock/issues/47
        rlimit = getattr(resource, "RLIMIT_VMEM",
                         getattr(resource, "RLIMIT_AS", None))
        if rlimit:
            try:
                ulimit = resource.getrlimit(rlimit)[0]
                if ulimit <= 0 or ulimit > (2**62):
                    report.append("  no memory ulimit set")
                else:
                    ulimit_used = psutil.Process().memory_info().rss
                    _report("Memory ulimit detected", ulimit / 1e9)
                    _report("Memory ulimit in use", ulimit_used / 1e9)
                    available_memory = max(
                        0, min(available_memory, ulimit - ulimit_used))
                    available_incl_swap = max(
                        0, min(available_incl_swap, ulimit - ulimit_used))
                    available_immediate_limit = (
                        available_memory * self.params.block.max_memory_usage)
                    _report("Available system memory (limited)",
                            available_memory / 1e9)
                    _report(
                        "Available system memory (incl. swap; limited)",
                        available_incl_swap / 1e9,
                    )
                    _report(
                        "Maximum memory for processing (exc. swap; limited)",
                        available_immediate_limit / 1e9,
                    )
            except Exception as e:
                logger.debug("Could not obtain ulimit values due to %s",
                             str(e),
                             exc_info=True)

        output_level = logging.INFO

        # Limit the number of parallel processes by amount of available memory
        if self.params.mp.method == "multiprocessing" and self.params.mp.nproc > 1:

            # Compute expected memory usage and warn if not enough
            njobs = available_immediate_limit / memory_required_per_process
            if njobs >= self.params.mp.nproc:
                # There is enough memory. Take no action
                pass
            elif njobs >= 1:
                # There is enough memory to run, but not as many processes as requested
                output_level = logging.WARNING
                report.append(
                    "Reducing number of processes from %d to %d due to memory constraints."
                    % (self.params.mp.nproc, int(njobs)))
                self.params.mp.nproc = int(njobs)
            elif (available_incl_swap * self.params.block.max_memory_usage >=
                  memory_required_per_process):
                # There is enough memory to run, but only if we count swap.
                output_level = logging.WARNING
                report.append(
                    "Reducing number of processes from %d to 1 due to memory constraints."
                    % self.params.mp.nproc, )
                report.append("Running this process will rely on swap usage!")
                self.params.mp.nproc = 1
            else:
                # There is not enough memory to run
                output_level = logging.ERROR

        report.append("")
        logger.log(output_level, "\n".join(report))

        if output_level >= logging.ERROR:
            raise MemoryError("""
          Not enough memory to run integration jobs.  This could be caused by a
          highly mosaic crystal model.  Possible solutions include increasing the
          percentage of memory allowed for shoeboxes or decreasing the block size.
          The average shoebox size is %d x %d pixels x %d images - is your crystal
          really this mosaic?
          """ % _average_bbox_size(self.reflections))

    def summary(self):
        """
        Get a summary of the processing
        """
        # Compute the task table
        if self.experiments.all_stills():
            rows = [["#", "Group", "Frame From", "Frame To", "# Reflections"]]
            for i in range(len(self)):
                job = self.manager.job(i)
                group = job.index()
                f0, f1 = job.frames()
                n = self.manager.num_reflections(i)
                rows.append([str(i), str(group), str(f0), str(f1), str(n)])
        elif self.experiments.all_sequences():
            rows = [[
                "#",
                "Group",
                "Frame From",
                "Frame To",
                "Angle From",
                "Angle To",
                "# Reflections",
            ]]
            for i in range(len(self)):
                job = self.manager.job(i)
                group = job.index()
                expr = job.expr()
                f0, f1 = job.frames()
                scan = self.experiments[expr[0]].scan
                p0 = scan.get_angle_from_array_index(f0)
                p1 = scan.get_angle_from_array_index(f1)
                n = self.manager.num_reflections(i)
                rows.append([
                    str(i),
                    str(group),
                    str(f0 + 1),
                    str(f1),
                    str(p0),
                    str(p1),
                    str(n)
                ])
        else:
            raise RuntimeError(
                "Experiments must be all sequences or all stills")

        # The job table
        task_table = tabulate(rows, headers="firstrow")

        # The format string
        if self.params.block.size is None:
            block_size = "auto"
        else:
            block_size = str(self.params.block.size)
        return ("Processing reflections in the following blocks of images:\n\n"
                " block_size: {} {}\n\n{}\n").format(
                    block_size,
                    "" if block_size in ("auto",
                                         "Auto") else self.params.block.units,
                    task_table,
                )
コード例 #2
0
ファイル: processor.py プロジェクト: jbeilstenedmands/dials
class Manager(object):
    """
    A class to manage processing book-keeping
    """
    def __init__(self, experiments, reflections, params):
        """
        Initialise the manager.

        :param experiments: The list of experiments
        :param reflections: The list of reflections
        :param params: The phil parameters
        """

        # Initialise the callbacks
        self.executor = None

        # Save some data
        self.experiments = experiments
        self.reflections = reflections

        # Other data
        self.data = {}

        # Save some parameters
        self.params = params

        # Set the finalized flag to False
        self.finalized = False

        # Initialise the timing information
        self.time = dials.algorithms.integration.TimingInfo()

    def initialize(self):
        """
        Initialise the processing
        """
        # Get the start time
        start_time = time()

        # Ensure the reflections contain bounding boxes
        assert "bbox" in self.reflections, "Reflections have no bbox"

        # Compute the block size and processors
        self.compute_blocks()
        self.compute_jobs()
        self.split_reflections()
        self.compute_processors()

        # Create the reflection manager
        self.manager = ReflectionManager(self.jobs, self.reflections)

        # Parallel reading of HDF5 from the same handle is not allowed. Python
        # multiprocessing is a bit messed up and used fork on linux so need to
        # close and reopen file.
        for exp in self.experiments:
            if exp.imageset.reader().is_single_file_reader():
                exp.imageset.reader().nullify_format_instance()

        # Set the initialization time
        self.time.initialize = time() - start_time

    def task(self, index):
        """
        Get a task.
        """
        job = self.manager.job(index)
        frames = job.frames()
        expr_id = job.expr()
        assert expr_id[1] > expr_id[0], "Invalid experiment id"
        assert expr_id[0] >= 0, "Invalid experiment id"
        assert expr_id[1] <= len(self.experiments), "Invalid experiment id"
        experiments = self.experiments  # [expr_id[0]:expr_id[1]]
        reflections = self.manager.split(index)
        if len(reflections) == 0:
            logger.warning("No reflections in job %d ***", index)
            task = NullTask(index=index, reflections=reflections)
        else:
            task = Task(
                index=index,
                job=frames,
                experiments=experiments,
                reflections=reflections,
                params=self.params,
                executor=self.executor,
            )
        return task

    def tasks(self):
        """
        Iterate through the tasks.
        """
        for i in range(len(self)):
            yield self.task(i)

    def accumulate(self, result):
        """Accumulate the results."""
        self.data[result.index] = result.data
        self.manager.accumulate(result.index, result.reflections)
        self.time.read += result.read_time
        self.time.extract += result.extract_time
        self.time.process += result.process_time
        self.time.total += result.total_time

    def finalize(self):
        """
        Finalize the processing and finish.
        """
        # Get the start time
        start_time = time()

        # Check manager is finished
        assert self.manager.finished(), "Manager is not finished"

        # Update the time and finalized flag
        self.time.finalize = time() - start_time
        self.finalized = True

    def result(self):
        """
        Return the result.

        :return: The result
        """
        assert self.finalized, "Manager is not finalized"
        return self.manager.data(), self.data

    def finished(self):
        """
        Return if all tasks have finished.

        :return: True/False all tasks have finished
        """
        return self.finalized and self.manager.finished()

    def __len__(self):
        """
        Return the number of tasks.

        :return: the number of tasks
        """
        return len(self.manager)

    def compute_blocks(self):
        """
        Compute the processing block size.
        """

        if self.params.block.size == libtbx.Auto:
            if (self.params.mp.nproc * self.params.mp.njobs == 1
                    and not self.params.debug.output
                    and not self.params.block.force):
                self.params.block.size = None
            else:
                assert self.params.block.threshold > 0, "Threshold must be > 0"
                assert self.params.block.threshold <= 1.0, "Threshold must be < 1"
                nframes = sorted(
                    [b[5] - b[4] for b in self.reflections["bbox"]])
                cutoff = int(self.params.block.threshold * len(nframes))
                block_size = nframes[cutoff] * 2
                self.params.block.size = block_size
                self.params.block.units = "frames"

    def compute_jobs(self):
        """
        Compute the jobs
        """
        groups = itertools.groupby(
            range(len(self.experiments)),
            lambda x:
            (id(self.experiments[x].imageset), id(self.experiments[x].scan)),
        )
        self.jobs = JobList()
        for key, indices in groups:
            indices = list(indices)
            i0 = indices[0]
            i1 = indices[-1] + 1
            expr = self.experiments[i0]
            scan = expr.scan
            imgs = expr.imageset
            array_range = (0, len(imgs))
            if scan is not None:
                assert len(imgs) >= len(scan), "Invalid scan range"
                array_range = scan.get_array_range()
            if self.params.block.size is None:
                block_size_frames = array_range[1] - array_range[0]
            elif self.params.block.units == "radians":
                phi0, dphi = scan.get_oscillation(deg=False)
                block_size_frames = int(
                    math.ceil(self.params.block.size / dphi))
            elif self.params.block.units == "degrees":
                phi0, dphi = scan.get_oscillation()
                block_size_frames = int(
                    math.ceil(self.params.block.size / dphi))
            elif self.params.block.units == "frames":
                block_size_frames = int(math.ceil(self.params.block.size))
            else:
                raise RuntimeError("Unknown block_size units %r" %
                                   self.params.block.units)
            self.jobs.add((i0, i1), array_range, block_size_frames)
        assert len(self.jobs) > 0, "Invalid number of jobs"

    def split_reflections(self):
        """
        Split the reflections into partials or over job boundaries
        """

        # Optionally split the reflection table into partials, otherwise,
        # split over job boundaries
        if self.params.shoebox.partials:
            num_full = len(self.reflections)
            self.reflections.split_partials()
            num_partial = len(self.reflections)
            assert num_partial >= num_full, "Invalid number of partials"
            if num_partial > num_full:
                logger.info(
                    " Split %d reflections into %d partial reflections\n" %
                    (num_full, num_partial))
        else:
            num_full = len(self.reflections)
            self.jobs.split(self.reflections)
            num_partial = len(self.reflections)
            assert num_partial >= num_full, "Invalid number of partials"
            if num_partial > num_full:
                num_split = num_partial - num_full
                logger.info(
                    " Split %d reflections overlapping job boundaries\n" %
                    num_split)

        # Compute the partiality
        self.reflections.compute_partiality(self.experiments)

    def compute_processors(self):
        """
        Compute the number of processors
        """

        # Set the memory usage per processor
        if self.params.mp.method == "multiprocessing" and self.params.mp.nproc > 1:

            # Get the maximum shoebox memory
            max_memory = flex.max(
                self.jobs.shoebox_memory(self.reflections,
                                         self.params.shoebox.flatten))

            # Compute expected memory usage and warn if not enough
            total_memory = psutil.virtual_memory().total
            assert (total_memory is not None and total_memory > 0
                    ), "psutil call appears to have given unexpected output"
            limit_memory = total_memory * self.params.block.max_memory_usage
            njobs = int(math.floor(limit_memory / max_memory))
            if njobs < 1:

                xsize, ysize, zsize = _average_bbox_size(self.reflections)
                raise RuntimeError("""
        Not enough memory to run integration jobs.  This could be caused by a
        highly mosaic crystal model.  Possible solutions include increasing the
        percentage of memory allowed for shoeboxes or decreasing the block size.
        The average shoebox size is %d x %d pixels x %d images - is your crystal
        really this mosaic?
            Total system memory: %.1f GB
            Shoebox memory limit: %.1f GB
            Required shoebox memory: %.1f GB
        """ % (
                    xsize,
                    ysize,
                    zsize,
                    total_memory / 1e9,
                    limit_memory / 1e9,
                    max_memory / 1e9,
                ))
            else:
                self.params.mp.nproc = min(self.params.mp.nproc, njobs)
                self.params.block.max_memory_usage /= self.params.mp.nproc

    def summary(self):
        """
        Get a summary of the processing
        """
        # Compute the task table
        if self.experiments.all_stills():
            rows = [["#", "Group", "Frame From", "Frame To", "# Reflections"]]
            for i in range(len(self)):
                job = self.manager.job(i)
                group = job.index()
                f0, f1 = job.frames()
                n = self.manager.num_reflections(i)
                rows.append([str(i), str(group), str(f0), str(f1), str(n)])
        elif self.experiments.all_sequences():
            rows = [[
                "#",
                "Group",
                "Frame From",
                "Frame To",
                "Angle From",
                "Angle To",
                "# Reflections",
            ]]
            for i in range(len(self)):
                job = self.manager.job(i)
                group = job.index()
                expr = job.expr()
                f0, f1 = job.frames()
                scan = self.experiments[expr[0]].scan
                p0 = scan.get_angle_from_array_index(f0)
                p1 = scan.get_angle_from_array_index(f1)
                n = self.manager.num_reflections(i)
                rows.append([
                    str(i),
                    str(group),
                    str(f0),
                    str(f1),
                    str(p0),
                    str(p1),
                    str(n)
                ])
        else:
            raise RuntimeError(
                "Experiments must be all sequences or all stills")

        # The job table
        task_table = tabulate(rows, headers="firstrow")

        # The format string
        if self.params.block.size is None:
            block_size = "auto"
        else:
            block_size = str(self.params.block.size)
        fmt = ("Processing reflections in the following blocks of images:\n"
               "\n"
               " block_size: %s %s\n"
               "\n"
               "%s\n")
        return fmt % (block_size, self.params.block.units, task_table)