Beispiel #1
0
    def process(self, item: LocationTask) -> Iterator[HueLocationInfo]:
        """
        Locate entities in a TeX file by colorizing them in batches and diffing rendered pages.

        For each batch, the entities are colorized in the TeX, the TeX is compiled, the output
        is rastered and diffed, and entities are located by hue in the diffs. When a batch
        fails (compilation error, shifted text), suspected culprits are re-queued, either at
        the front of the shared queue or in a queue of entities to be processed alone.

        Args:
            item: the task to process. This method reads its 'entities', 'file_contents',
                'tex_path', 'arxiv_id', and 'group' attributes.

        Yields:
            One 'HueLocationInfo' for each bounding box found for each entity that remains
            in a batch once all checks for that batch have passed.
        """

        # Filter out entities that are empty (i.e., have nothing to color)
        # A '-1' in the 'start' or 'end' field indicates that the entity does not occur in a
        # specific place in the TeX, but rather a custom coloring technique based on other
        # entity properties will be used. So entities that have a '-1' for their start and
        # end should still be processed even though they appear to be zero-length.
        entities_filtered = [
            e for e in item.entities
            if e.start == -1 or e.end == -1 or e.start != e.end
        ]

        # Sort entities by the order in which they appear in the TeX. This allows the pipeline
        # to keep track of which ones appear first, when trying to recover from errors (i.e., when
        # trying to detect which entity in a batch may have shifted to cause many others to move.)
        entities_ordered = sorted(entities_filtered, key=lambda e: e.start)

        # Construct a queue of entities to detect. 'to_process' is sampled in batches;
        # 'to_process_alone' holds suspected troublemakers, which are sampled one at a time.
        entities_by_id = {e.id_: e for e in entities_ordered}
        to_process: Deque[str] = deque(e.id_ for e in entities_ordered)
        to_process_alone: Deque[str] = deque()

        # Path to output directories. These directories will be redefined once for each batch.
        colorized_tex_dir: Optional[str] = None
        compiled_tex_dir: Optional[str] = None
        raster_output_dir: Optional[str] = None
        diffs_output_dir: Optional[str] = None

        # Iteration state
        batch_index = -1
        iteration_id = None

        def next_batch() -> List[str]:
            """
            Get the next batch of entities to process. First tries to sample a batch from
            'to_process', and then attempts to sample individual entities from 'to_process_alone'.
            """
            if to_process:
                batch_size = min(self.args.batch_size, len(to_process))
                return [to_process.popleft() for _ in range(batch_size)]
            return [to_process_alone.popleft()]

        def _cleanup_from_last_batch() -> None:
            """
            Clean up output directories from the last batch. Does nothing before the first
            batch has started, or when intermediate files were requested to be kept.
            """
            if batch_index > -1 and not self.args.keep_intermediate_files:
                logging.debug(
                    "Deleting intermediate files used to locate entities (i.e., colorized "
                    "sources, compilation results, and rasters) for paper %s iteration %s",
                    item.arxiv_id,
                    iteration_id or "''",
                )
                intermediate_files_dirs = [
                    colorized_tex_dir,
                    compiled_tex_dir,
                    raster_output_dir,
                    diffs_output_dir,
                ]
                for dir_ in intermediate_files_dirs:
                    if dir_ and os.path.exists(dir_):
                        file_utils.clean_directory(dir_)
                        os.rmdir(dir_)

        while to_process or to_process_alone:
            # The directory variables still point at the previous batch's output here.
            _cleanup_from_last_batch()

            batch_index += 1
            logging.debug(
                "Locating bounding boxes for batch %d-%d of entities of type %s for paper %s.",
                item.group,
                batch_index,
                self.get_entity_name(),
                item.arxiv_id,
            )
            iteration_id = directories.tex_iteration(
                item.tex_path, f"{item.group}-{batch_index}")

            # Define output directory locations for this batch.
            colorized_tex_dir = directories.iteration(
                self.output_base_dirs["sources"], item.arxiv_id, iteration_id)
            compiled_tex_dir = directories.iteration(
                self.output_base_dirs["compiled-sources"],
                item.arxiv_id,
                iteration_id,
            )
            raster_output_dir = directories.iteration(
                self.output_base_dirs["paper-images"], item.arxiv_id,
                iteration_id)
            diffs_output_dir = directories.iteration(
                self.output_base_dirs["diffed-images"], item.arxiv_id,
                iteration_id)

            # Fetch the next batch of entities to process.
            batch = next_batch()
            entities: List[SerializableEntity] = [
                entities_by_id[id_] for id_ in batch
            ]

            # Colorize the TeX for all the entities.
            custom_colorize_func = self.get_colorize_func()
            logging.debug(
                "Attempting to colorize entities in TeX for entity batch %d-%d of paper %s.",
                item.group,
                batch_index,
                item.arxiv_id,
            )
            if custom_colorize_func is not None:
                colorized_tex = custom_colorize_func(
                    item.file_contents.contents, entities,
                    self.get_colorize_options())
                if len(colorized_tex.entity_hues) == 0:
                    logging.info(
                        "Custom colorization function colored nothing for entity batch %d-%d of "
                        "paper %s when coloring file %s. The function probably decide there was "
                        "nothing to do for this file, and will hopefullly colorize these "
                        "entities in another file. Skipping this batch for this file.",
                        item.group,
                        batch_index,
                        item.arxiv_id,
                        item.file_contents.path,
                    )
                    continue
            else:
                colorized_tex = colorize_entities(item.file_contents.contents,
                                                  entities,
                                                  self.get_colorize_options())

            # If some entities were skipped during colorization, perhaps because they
            # overlapped with each other, add them back to the work queue.
            skipped = colorized_tex.skipped
            if skipped is not None and len(skipped) > 0:
                logging.info(
                    "Entities %s were skipped during colorization batch %d-%d for paper "
                    "%s. They will be processed in a later batch.",
                    [e.id_ for e in skipped],
                    item.group,
                    batch_index,
                    item.arxiv_id,
                )
                # Queue skipped entities in the order that they initially appeared in the batch.
                reprocess_ids = {e.id_ for e in skipped}
                reprocess_sorted = [
                    id_ for id_ in batch if id_ in reprocess_ids
                ]
                to_process.extendleft(reversed(reprocess_sorted))

                # Remove skipped entities from the current batch. Filtering by the ID set is a
                # single pass, and tolerates a skipped ID that is not in the batch.
                batch = [id_ for id_ in batch if id_ not in reprocess_ids]

            # Save the colorized TeX to the file system.
            save_success = save_colorized_tex(
                item.arxiv_id,
                colorized_tex_dir,
                item.tex_path,
                iteration_id,
                colorized_tex.tex,
                item.file_contents.encoding,
                colorized_tex.entity_hues,
            )
            logging.debug(
                "Finished attempting to colorize entities for entity batch %d-%d of paper %s.",
                item.group,
                batch_index,
                item.arxiv_id,
            )
            if not save_success:
                logging.error(
                    "Failed to save colorized TeX files for arXiv paper %s. "
                    "This paper will be skipped.",
                    item.arxiv_id,
                )
                # Actually skip the paper, as the message promises, rather than going on to
                # copy and compile a directory that may not exist. 'break' (instead of
                # 'return') lets the final cleanup after the loop still run.
                break

            # Compile the TeX with the colors.
            shutil.copytree(colorized_tex_dir, compiled_tex_dir)
            compilation_result = compile_tex(compiled_tex_dir)
            save_compilation_result("compiled-sources", item.arxiv_id,
                                    compiled_tex_dir, compilation_result)
            if not compilation_result.success:

                # If colorizing a specific entity caused the failure, remove the entity that caused
                # the problem from the batch and restart with a new batch, minus this entity.
                last_colorized_entity_id = get_last_colorized_entity(
                    item.arxiv_id, compiled_tex_dir)
                # Only trust the reported ID if it is still part of this batch; otherwise
                # fall through to the generic recovery below instead of raising ValueError.
                if (last_colorized_entity_id is not None
                        and last_colorized_entity_id in batch):
                    culprit_index = batch.index(last_colorized_entity_id)
                    # The suspects are the last colorized entity and, if there is one, the
                    # entity right after it in the batch.
                    problem_ids = batch[culprit_index:culprit_index + 2]

                    if len(batch) == 1:
                        logging.warning(
                            "Failed to compile paper %s with colorized entity %s, even when it was "
                            "colorized in isolation. The location of this entity will not be detected.",
                            item.arxiv_id,
                            batch[0],
                        )
                        continue

                    logging.warning(
                        "Failed to compile paper %s with colorized entities. The culprit may be "
                        "the colorization command for entity %s. The problematic entities will be "
                        "colorized on their own, and the rest of the entities will be colorized "
                        "together in the next batch.",
                        item.arxiv_id,
                        " or ".join(problem_ids),
                    )

                    to_process_alone.extend(problem_ids)
                    del batch[culprit_index:culprit_index + 2]

                    to_process.extendleft(reversed(batch))
                    continue

                if not batch:
                    # Every entity was skipped during colorization, so there is no entity to
                    # blame or drop. Move on to the next batch.
                    logging.error(
                        "Failed to compile paper %s for batch %d-%d, and no colorized entities "
                        "remain in the batch. Skipping this batch.",
                        item.arxiv_id,
                        item.group,
                        batch_index,
                    )
                    continue

                # If there was some other reason for the error, remove just the first entity from the batch.
                logging.error(
                    "Failed to compile paper %s with colorized entities %s. The cause "
                    "is assumed to be in the first colorized entity. The location for the "
                    "first entity %s will not be detected. The remainder of the entities in "
                    "this batch will be processed in another batch.",
                    item.arxiv_id,
                    batch,
                    batch[0],
                )
                del batch[0]
                to_process.extendleft(reversed(batch))
                continue

            # Raster the pages to images. If any output file cannot be rastered, the whole
            # batch is abandoned, as the error message states.
            output_files = compilation_result.output_files
            raster_failed = False
            for output_file in output_files:
                raster_success = raster_pages(
                    compiled_tex_dir,
                    os.path.join(raster_output_dir,
                                 directories.escape_slashes(output_file.path)),
                    output_file.path,
                    output_file.output_type,
                )
                if not raster_success:
                    logging.error(
                        "Failed to rasterize pages for %s iteration %s. The locations for entities "
                        "with IDs %s with not be detected.",
                        item.arxiv_id,
                        iteration_id,
                        batch,
                    )
                    raster_failed = True
                    break
            if raster_failed:
                continue

            # Compute diffs from the original images. This is done once, after all output
            # files have been rastered, because the diffing call is given all output files.
            logging.debug(
                "Attempting to diff rastered pages for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )
            diff_success = diff_images_in_raster_dirs(
                output_files,
                raster_output_dir,
                diffs_output_dir,
                item.arxiv_id,
            )
            logging.debug(
                "Finished diffing attempt for paper %s iteration %s. Success? %s.",
                item.arxiv_id,
                iteration_id,
                diff_success,
            )
            if not diff_success:
                logging.error(
                    "Failed to difference images of original and colorized versions of "
                    "papers %s in batch processing iteration %s. The locations for entities with IDs "
                    "%s will not be detected.",
                    item.arxiv_id,
                    iteration_id,
                    batch,
                )
                continue

            # Locate the entities in the diffed images.
            logging.debug(
                "Attempting to locate entities using image differences for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )
            entity_hues = colorized_tex.entity_hues
            location_result = locate_entities(item.arxiv_id, raster_output_dir,
                                              diffs_output_dir, entity_hues)
            if location_result is None:
                logging.warning(
                    "Error occurred when locating entities by hue in diffed images "
                    "for paper %s. None of the entities in batch %s will be detected.",
                    item.arxiv_id,
                    batch,
                )
                continue

            if self.should_sanity_check_images(
            ) and location_result.black_pixels_found:
                logging.warning(
                    "Ignoring bounding boxes found for paper %s in batch %s due to "
                    "black pixels found in the images. This might indicate that the colorization "
                    "commands introduced subtle shifts of the text.",
                    item.arxiv_id,
                    batch,
                )
                continue

            # If colorizing entities seemed to cause drift in the document...
            shifted_entities = location_result.shifted_entities
            if len(shifted_entities) > 0:

                logging.warning(
                    "Some entities shifted position in the colorized TeX for paper %s batch %s: "
                    "%s. Attempting to remove the first shifted entity from the batch.",
                    item.arxiv_id,
                    batch,
                    shifted_entities,
                )

                # The first entity in batch order that is reported as shifted, if any.
                first_shifted_entity_id = next(
                    (id_ for id_ in batch if id_ in shifted_entities), None)

                if first_shifted_entity_id is not None:
                    if len(batch) > 1:
                        logging.info(
                            "Entity %s has been marked as being the potential cause of shifting in "
                            "the colorized document for paper %s batch %d-%d. It will be processed "
                            "later on its own. The other shifted entities in %s will be queued to "
                            "process as a group in an upcoming batch.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                            item.group,
                            batch_index,
                            shifted_entities,
                        )

                        # Get the index of the first entity for which the location has shifted
                        # during colorization.
                        moved_entity_index = batch.index(
                            first_shifted_entity_id)

                        # Mark all other entities that have shifted after the first one to be processed
                        # in a later batch (instead of on their own). It could be that they won't shift
                        # once the first shifted entity is removed. Walking backwards keeps the
                        # indexes valid while deleting, and 'appendleft' restores batch order.
                        for i in range(len(batch) - 1, moved_entity_index, -1):
                            if batch[i] in shifted_entities:
                                to_process.appendleft(batch[i])
                                del batch[i]

                        # Mark the first entity that shifted to be reprocessed alone, where its position
                        # might be discoverable, without affecting the positions of other element.
                        del batch[moved_entity_index]
                        to_process_alone.append(first_shifted_entity_id)

                    elif self.should_sanity_check_images():
                        logging.info(
                            "Skipping entity %s for paper %s as it caused " +
                            "colorization errors even when colorized in isolation.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                        )
                        continue
                    else:
                        logging.info(
                            "Entity %s has been marked as the cause of shifting in "
                            "the colorized document for paper %s. Its location will "
                            "still be saved (if one was found), though this location should be "
                            "considered potentially inaccurate.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                        )

                elif len(batch) > 1:
                    logging.warning(
                        "Could not find a single entity that was likely responsible for shifting in "
                        "the colorized version of paper %s batch %d-%d. All entities in batch %s will "
                        "be processed on their own.",
                        item.arxiv_id,
                        item.group,
                        batch_index,
                        batch,
                    )
                    # NOTE(review): these entities stay in 'batch', so locations from this batch
                    # are still yielded below in addition to those from the solo runs. Confirm
                    # that consumers expect results from both iterations.
                    to_process_alone.extend(batch)

                else:
                    # A batch of at most one entity cannot be split further. Queuing it to be
                    # processed alone again would repeat this same iteration indefinitely.
                    logging.warning(
                        "Could not find a single entity that was likely responsible for shifting in "
                        "the colorized version of paper %s batch %d-%d. Batch %s cannot be split "
                        "further, so it will not be reprocessed.",
                        item.arxiv_id,
                        item.group,
                        batch_index,
                        batch,
                    )

            logging.debug(
                "Finished attempt at locating entities with image diffs for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )

            # The code above is responsible for filtering 'batch' to ensure that it doesn't include
            # any entity IDs that shouldn't be saved to file, for example if the client has asked that
            # entity IDs that cause colorization errors be omitted from the results.
            for entity_id in batch:
                for box in location_result.locations[entity_id]:
                    yield HueLocationInfo(
                        tex_path=item.tex_path,
                        iteration=iteration_id,
                        hue=entity_hues[entity_id],
                        entity_id=entity_id,
                        page=box.page,
                        left=box.left,
                        top=box.top,
                        width=box.width,
                        height=box.height,
                    )

        # Clean up after the final batch (or after bailing out of the loop early).
        _cleanup_from_last_batch()
Beispiel #2
0
 def process(self, item: CompilationTask) -> Iterator[CompilationResult]:
     """
     Compile the TeX at the task's 'compilation_path' and yield the single result.
     """
     yield compile_tex(item.compilation_path)