Beispiel #1
0
        def load_hues(self, arxiv_id: ArxivId,
                      iteration: str) -> List[HueSearchRegion]:
            """
            Load the hues that were assigned to entities in one colorization iteration.

            Reads 'entity_hues.csv' from the iteration directory of the colorized sources
            (the directory key is built from 'entity_name', which comes from the enclosing
            scope) and wraps each record in a 'HueSearchRegion' whose 'relative_file_path'
            and 'masks' are both None. If the CSV file does not exist, a warning is logged
            and an empty list is returned.
            """
            iteration_dir = directories.iteration(
                f"sources-with-colorized-{entity_name}",
                arxiv_id,
                iteration,
            )
            hues_path = os.path.join(iteration_dir, "entity_hues.csv")

            # Without a hues file there is nothing to search for.
            if not os.path.exists(hues_path):
                logging.warning("Could not find any hues at %s", hues_path)
                return []

            return [
                HueSearchRegion(
                    hue=hue_record.hue,
                    record=hue_record,
                    relative_file_path=None,
                    masks=None,
                ) for hue_record in file_utils.load_from_csv(
                    hues_path, ColorizationRecord)
            ]
Beispiel #2
0
 def save(self, item: PageRasterPair, result: np.ndarray) -> None:
     """
     Write a diffed page image to the output directory for its paper and iteration.

     The image is stored under the iteration's output directory at
     '<item.relative_path>/<item.image_name>'. Missing parent directories are created.
     If the image cannot be written, an error is logged instead of a success message.
     """
     output_dir = directories.iteration(self.get_output_base_dirkey(),
                                        item.arxiv_id, item.iteration)
     image_path = os.path.join(output_dir, item.relative_path,
                               item.image_name)

     # 'exist_ok' avoids the race between checking for the directory and creating it
     # (e.g., if another worker creates the same directory in between).
     os.makedirs(os.path.dirname(image_path), exist_ok=True)

     # 'cv2.imwrite' reports failure through its return value rather than by raising,
     # so the result must be checked before claiming that the image was stored.
     if not cv2.imwrite(image_path, result):
         logging.error("Failed to write diffed image to %s", image_path)
         return
     logging.debug("Diffed images and stored result at %s", image_path)
Beispiel #3
0
    def load(self) -> Iterator[PageRasterPair]:
        """
        Yield pairs of original and modified page images for each paper and iteration.

        For every arXiv ID, the output directory is cleaned first. Papers with no compiled
        output files are skipped. For each iteration found under the raster base directory,
        every image in the original 'paper-images' directory of each output file is paired
        with the image of the same name in the iteration's directory. If an expected modified
        image is missing, a warning is logged and the remaining images for that output file
        (in that iteration) are not yielded.
        """
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir(self.get_output_base_dirkey(),
                                         arxiv_id))

            # Output file names come from the results of compiling the uncolorized TeX sources.
            compiled_dir = directories.arxiv_subdir("compiled-sources",
                                                    arxiv_id)
            output_files = get_output_files(compiled_dir)
            if len(output_files) == 0:
                continue

            for iteration in directories.iteration_names(
                    self.get_raster_base_dirkey(), arxiv_id):

                originals_root = directories.arxiv_subdir(
                    "paper-images", arxiv_id)
                modified_root = directories.iteration(
                    self.get_raster_base_dirkey(), arxiv_id, iteration)

                for output_file in output_files:
                    relative_file_path = output_file.path
                    originals_dir = os.path.join(originals_root,
                                                 relative_file_path)
                    modified_dir = os.path.join(modified_root,
                                                relative_file_path)

                    for img_name in os.listdir(originals_dir):
                        modified_img_path = os.path.join(
                            modified_dir, img_name)

                        # Stop pairing images for this output file as soon as one is missing.
                        if not os.path.exists(modified_img_path):
                            logging.warning(
                                "Could not find expected image %s. Skipping diff for this paper.",
                                modified_img_path,
                            )
                            break

                        original_img = cv2.imread(
                            os.path.join(originals_dir, img_name))
                        modified_img = cv2.imread(modified_img_path)
                        yield PageRasterPair(
                            arxiv_id,
                            iteration,
                            relative_file_path,
                            img_name,
                            original_img,
                            modified_img,
                        )
Beispiel #4
0
    def load(self) -> Iterator[LocationTask]:
        """
        Yield one 'LocationTask' per citation key, with all boxes found for that key.

        For each paper, the 'citation-cluster-locations' output directory is cleaned, and the
        boxes detected per (hue, iteration) are loaded. Papers for which no such boxes could be
        loaded are skipped. The 'entity_hues.csv' file of each colorization iteration is then
        used to map each (hue, iteration) back to a citation key; iterations without that file
        are skipped with a warning.
        """
        colorized_dirkey = "sources-with-colorized-citations"

        for arxiv_id in self.arxiv_ids:

            file_utils.clean_directory(
                directories.arxiv_subdir("citation-cluster-locations",
                                         arxiv_id))

            boxes_by_hue_iteration = file_utils.load_citation_hue_locations(
                arxiv_id)
            if boxes_by_hue_iteration is None:
                continue

            boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
            for iteration in directories.iteration_names(
                    colorized_dirkey, arxiv_id):
                iteration_dir = directories.iteration(colorized_dirkey,
                                                      arxiv_id, iteration)
                citation_hues_path = os.path.join(iteration_dir,
                                                  "entity_hues.csv")
                if not os.path.exists(citation_hues_path):
                    logging.warning(
                        "Could not find citation hue colors for %s iteration %s. Skipping",
                        arxiv_id,
                        iteration,
                    )
                    continue

                for record in file_utils.load_from_csv(citation_hues_path,
                                                       ColorizationRecord):
                    # A key is registered even when no boxes were found for its hue, so
                    # that a task is still emitted for it (with whatever boxes it has).
                    located_boxes = boxes_by_hue_iteration.get(
                        HueIteration(record.hue, iteration), [])
                    boxes_by_citation_key.setdefault(
                        record.entity_id, []).extend(located_boxes)

            for citation_key, boxes in boxes_by_citation_key.items():
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    citation_key=citation_key,
                    boxes=boxes,
                )
Beispiel #5
0
    def save(self, item: TexAndTokens, result: ColorizationResult) -> None:
        """
        Save colorized TeX files and a log of the hues assigned to equation tokens.

        The paper's sources are unpacked into a new directory for this colorization iteration
        (named 'all-files-<iteration>'). Each colorized file then overwrites its counterpart in
        that directory, and one row per colorized token is appended to 'entity_hues.csv'. If the
        sources cannot be unpacked, a warning is logged and nothing is written.
        """
        iteration = result.iteration
        output_sources_path = directories.iteration(
            "sources-with-colorized-equation-tokens",
            item.arxiv_id,
            f"all-files-{iteration}",
        )
        logging.debug("Outputting to %s", output_sources_path)

        # Every colorization iteration gets its own freshly unpacked sources directory.
        if unpack(item.arxiv_id, output_sources_path) is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)
            return

        colorization = result.result

        # Overwrite each TeX file with its colorized version, in its own encoding.
        for relative_tex_path, colorized_file in colorization.colorized_files.items(
        ):
            destination = os.path.join(output_sources_path, relative_tex_path)
            with open(destination, "w",
                      encoding=colorized_file.encoding) as tex_file:
                tex_file.write(colorized_file.contents)

        # Log which hue was assigned to which token of which equation.
        hues_path = os.path.join(output_sources_path, "entity_hues.csv")
        for token in colorization.colorized_tokens:
            token_record = EquationTokenColorizationRecord(
                entity_id="-".join(
                    (str(token.equation_index), str(token.token_index))),
                hue=token.hue,
                tex_path=token.tex_path,
                iteration=str(iteration),
                equation_index=token.equation_index,
                token_index=token.token_index,
                start=token.start,
                end=token.end,
                text=token.text,
            )
            file_utils.append_to_csv(hues_path, token_record)
    def save(self, item: ColorizationTask, result: ColorizationResult) -> None:
        """
        Save a TeX file with colorized citations and a log of the hues assigned to them.

        The paper's sources are unpacked into a new directory for this TeX file and
        colorization iteration. The colorized TeX then overwrites the original TeX file in that
        directory, and one row per colorized citation is appended to 'entity_hues.csv'. If the
        sources cannot be unpacked, a warning is logged and nothing is written.
        """
        iteration_id = directories.tex_iteration(item.tex_path,
                                                 str(result.iteration))
        output_sources_path = directories.iteration(
            "sources-with-colorized-citations",
            item.arxiv_id,
            iteration_id,
        )
        logging.debug("Outputting to %s", output_sources_path)

        # A new directory is created for each colorization iteration of each TeX file.
        if unpack(item.arxiv_id, output_sources_path) is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)
            return

        # Replace the TeX file with its colorized version, keeping the original encoding.
        destination = os.path.join(output_sources_path, item.tex_path)
        with open(destination, "w",
                  encoding=item.file_contents.encoding) as tex_file:
            tex_file.write(result.tex)

        hues_path = os.path.join(output_sources_path, "entity_hues.csv")

        # TODO(andrewhead): It might be better to save this CSV data with the same
        # encoding as the file the TeX was read from, for the citations, for the
        # equations, and for the symbols. There might be some gotchas for character
        # positions not lining up between the ones we save using Unicode here and the
        # positions in the intended encoding in the original files.
        for citation in result.colorized_citations:
            file_utils.append_to_csv(
                hues_path,
                ColorizationRecord(
                    hue=citation.hue,
                    entity_id=citation.key,
                    tex_path=item.tex_path,
                    iteration=iteration_id,
                ),
            )
Beispiel #7
0
    def save(self, item: ColorizationTask, result: ColorizationResult) -> None:
        """
        Save a colorized TeX file and a log of the hues assigned to its entities.

        The paper's sources are unpacked into a new directory (under this command's output
        base directory) for this TeX file and colorization batch. The colorized TeX then
        overwrites the original TeX file in that directory, and one row per (hue, entity) pair
        is appended to 'entity_hues.csv'. If the sources cannot be unpacked, a warning is
        logged and nothing is written.
        """
        iteration = result.iteration
        iteration_id = directories.tex_iteration(item.tex_path, str(iteration))
        output_sources_path = directories.iteration(
            self.get_output_base_dirkey(),
            item.arxiv_id,
            iteration_id,
        )
        logging.debug("Outputting to %s", output_sources_path)

        # Each colorization batch gets a new sources directory.
        if unpack(item.arxiv_id, output_sources_path) is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)
            return

        # Rewrite the TeX with the colorized TeX, keeping the original encoding.
        destination = os.path.join(output_sources_path, item.tex_path)
        with open(destination, "w",
                  encoding=item.file_contents.encoding) as tex_file:
            tex_file.write(result.tex)

        # Save a log of which hues were assigned to which entities.
        hues_path = os.path.join(output_sources_path, "entity_hues.csv")
        for (hue, entity) in result.entity_hues:
            hue_record = ColorizationRecord(
                tex_path=item.tex_path,
                iteration=str(iteration),
                hue=hue,
                entity_id=entity.id_,
            )
            file_utils.append_to_csv(hues_path, hue_record)
Beispiel #8
0
    def process(self, item: LocationTask) -> Iterator[HueLocationInfo]:
        """
        Locate the entities of one TeX file by colorizing them batch by batch.

        For each batch, the entities are colorized in the TeX, the colorized TeX is saved and
        compiled, the output pages are rastered and diffed against the original images, and the
        entities are located by hue in the diffed images. Entities that break compilation or that
        appear to shift the layout are re-queued, either for a later batch or to be processed on
        their own. Intermediate directories of a batch are deleted before the next batch starts
        (and after the last one), unless 'self.args.keep_intermediate_files' is set.

        Yields one 'HueLocationInfo' per bounding box found for each entity left in a batch.
        Processing of the item stops early if the colorized TeX cannot be saved.
        """

        # Filter out entities that are empty (i.e., have nothing to color)
        # A '-1' in the 'start' or 'end' field indicates that the entity does not occur in a
        # specific place in the TeX, but rather a custom coloring technique based on other
        # entity properties will be used. So entities that have a '-1' for their start and
        # end should still be processed even though they appear to be zero-length.
        entities_filtered = [
            e for e in item.entities
            if e.start == -1 or e.end == -1 or e.start != e.end
        ]

        # Sort entities by the order in which they appear in the TeX. This allows the pipeline
        # to keep track of which ones appear first, when trying to recover from errors (i.e., when
        # trying to detect which entity in a batch may have shifted to cause many others to move.)
        entities_ordered = sorted(entities_filtered, key=lambda e: e.start)

        # Construct a queue of entities to detect.
        entities_by_id = {e.id_: e for e in entities_ordered}
        to_process = deque([e.id_ for e in entities_ordered])
        to_process_alone: Deque[str] = deque()

        # Path to output directories. These directories will be redefined once for each batch.
        colorized_tex_dir: Optional[str] = None
        compiled_tex_dir: Optional[str] = None
        raster_output_dir: Optional[str] = None
        diffs_output_dir: Optional[str] = None

        # Iteration state
        batch_index = -1
        iteration_id = None

        def next_batch() -> List[str]:
            """
            Get the next batch of entities to process. First tries to sample a batch from
            'to_process', and then attempts to sample individual entities from 'to_process_alone'.
            """
            if len(to_process) > 0:
                return [
                    to_process.popleft()
                    for _ in range(min(self.args.batch_size, len(to_process)))
                ]
            return [to_process_alone.popleft()]

        def _cleanup_from_last_batch() -> None:
            " Clean up output directories from the last batch. "
            if batch_index > -1 and not self.args.keep_intermediate_files:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Deleting intermediate files used to locate entities (i.e., colorized "
                    +
                    "sources, compilation results, and rasters) for paper %s iteration %s",
                    item.arxiv_id,
                    iteration_id or "''",
                )
                intermediate_files_dirs = [
                    colorized_tex_dir,
                    compiled_tex_dir,
                    raster_output_dir,
                    diffs_output_dir,
                ]
                for dir_ in intermediate_files_dirs:
                    if dir_ and os.path.exists(dir_):
                        file_utils.clean_directory(dir_)
                        os.rmdir(dir_)

        while len(to_process) > 0 or len(to_process_alone) > 0:
            if batch_index > -1:
                _cleanup_from_last_batch()

            batch_index += 1
            logging.debug(
                "Locating bounding boxes for batch %d-%d of entities of type %s for paper %s.",
                item.group,
                batch_index,
                self.get_entity_name(),
                item.arxiv_id,
            )
            iteration_id = directories.tex_iteration(
                item.tex_path, f"{item.group}-{batch_index}")

            # Define output directory locations for this batch.
            colorized_tex_dir = directories.iteration(
                self.output_base_dirs["sources"], item.arxiv_id, iteration_id)
            compiled_tex_dir = directories.iteration(
                self.output_base_dirs["compiled-sources"],
                item.arxiv_id,
                iteration_id,
            )
            raster_output_dir = directories.iteration(
                self.output_base_dirs["paper-images"], item.arxiv_id,
                iteration_id)
            diffs_output_dir = directories.iteration(
                self.output_base_dirs["diffed-images"], item.arxiv_id,
                iteration_id)

            # Fetch the next batch of entities to process.
            batch = next_batch()
            entities: List[SerializableEntity] = [
                entities_by_id[id_] for id_ in batch
            ]

            # Colorize the TeX for all the entities.
            custom_colorize_func = self.get_colorize_func()
            logging.debug(
                "Attempting to colorize entities in TeX for entity batch %d-%d of paper %s.",
                item.group,
                batch_index,
                item.arxiv_id,
            )
            if custom_colorize_func is not None:
                colorized_tex = custom_colorize_func(
                    item.file_contents.contents, entities,
                    self.get_colorize_options())
                if len(colorized_tex.entity_hues) == 0:
                    logging.info(  # pylint: disable=logging-not-lazy
                        "Custom colorization function colored nothing for entity batch %d-%d of "
                        +
                        "paper %s when coloring file %s. The function probably decide there was "
                        +
                        "nothing to do for this file, and will hopefullly colorize these "
                        +
                        "entities in another file. Skipping this batch for this file.",
                        item.group,
                        batch_index,
                        item.arxiv_id,
                        item.file_contents.path,
                    )
                    continue
            else:
                colorized_tex = colorize_entities(item.file_contents.contents,
                                                  entities,
                                                  self.get_colorize_options())

            # If some entities were skipped during colorization, perhaps because they
            # overlapped with each other, add them back to the work queue.
            if colorized_tex.skipped is not None and len(
                    colorized_tex.skipped) > 0:
                logging.info(  # pylint: disable=logging-not-lazy
                    "Entities %s were skipped during colorization batch %d-%d for paper "
                    + "%s. They will be processed in a later batch.",
                    [e.id_ for e in colorized_tex.skipped],
                    item.group,
                    batch_index,
                    item.arxiv_id,
                )
                # Queue skipped entities in the order that they initially appeared in the batch.
                reprocess_ids = {e.id_ for e in colorized_tex.skipped}
                reprocess_sorted = [
                    id_ for id_ in batch if id_ in reprocess_ids
                ]
                to_process.extendleft(reversed(reprocess_sorted))

                # Remove skipped entities from the current batch.
                for skip in colorized_tex.skipped:
                    del batch[batch.index(skip.id_)]

            # Save the colorized TeX to the file system.
            save_success = save_colorized_tex(
                item.arxiv_id,
                colorized_tex_dir,
                item.tex_path,
                iteration_id,
                colorized_tex.tex,
                item.file_contents.encoding,
                colorized_tex.entity_hues,
            )
            logging.debug(
                "Finished attempting to colorize entities for entity batch %d-%d of paper %s.",
                item.group,
                batch_index,
                item.arxiv_id,
            )
            if not save_success:
                logging.error(  # pylint: disable=logging-not-lazy
                    "Failed to save colorized TeX files for arXiv paper %s. "
                    "This paper will be skipped.",
                    item.arxiv_id,
                )
                # Actually skip the paper, as the message promises: without saved sources there
                # is nothing to copy or compile below. Breaking out of the loop (rather than
                # returning) lets the final cleanup after the loop still run.
                break

            # Compile the TeX with the colors.
            shutil.copytree(colorized_tex_dir, compiled_tex_dir)
            compilation_result = compile_tex(compiled_tex_dir)
            save_compilation_result("compiled-sources", item.arxiv_id,
                                    compiled_tex_dir, compilation_result)
            if not compilation_result.success:

                # If colorizing a specific entity caused the failure, remove the entity that caused
                # the problem from the batch and restart with a new batch, minus this entity.
                # This is only possible if the entity is still in the batch; it may already have
                # been removed above because it was skipped during colorization, in which case
                # looking up its index would raise a ValueError.
                last_colorized_entity_id = get_last_colorized_entity(
                    item.arxiv_id, compiled_tex_dir)
                if (last_colorized_entity_id is not None
                        and last_colorized_entity_id in batch):
                    culprit_index = batch.index(last_colorized_entity_id)
                    problem_ids = [last_colorized_entity_id]
                    # The entity that follows the last colorized one is also a suspect.
                    if culprit_index < len(batch) - 1:
                        problem_ids += [batch[culprit_index + 1]]

                    if len(batch) == 1:
                        logging.warning(  # pylint: disable=logging-not-lazy
                            "Failed to compile paper %s with colorized entity %s, even when it was "
                            +
                            "colorized in isolation. The location of this entity will not be detected.",
                            item.arxiv_id,
                            batch[0],
                        )
                        continue

                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Failed to compile paper %s with colorized entities. The culprit may be "
                        +
                        "the colorization command for entity %s. The problematic entities will be "
                        +
                        "colorized on their own, and the rest of the entities will be colorized "
                        + "together in the next batch.",
                        item.arxiv_id,
                        " or ".join(problem_ids),
                    )

                    for id_ in problem_ids:
                        to_process_alone.append(id_)
                        del batch[batch.index(id_)]

                    to_process.extendleft(reversed(batch))
                    continue

                # Nothing is left in the batch to blame or to re-queue (e.g., if every entity was
                # skipped during colorization), so just move on to the next batch.
                if not batch:
                    logging.error(
                        "Failed to compile paper %s, and no entities remain in the batch.",
                        item.arxiv_id,
                    )
                    continue

                # If there was some other reason for the error, remove just the first entity from the batch.
                logging.error(  # pylint: disable=logging-not-lazy
                    "Failed to compile paper %s with colorized entities %s. The cause "
                    +
                    "is assumed to be in the first colorized entity. The location for the "
                    +
                    "first entity %s will not be detected. The remainder of the entities in "
                    + "this batch will be processed in another batch.",
                    item.arxiv_id,
                    batch,
                    batch[0],
                )
                del batch[0]
                to_process.extendleft(reversed(batch))
                continue

            # Raster the pages to images, and compute diffs from the original images.
            # A failure for any output file means the entities in this batch cannot be located,
            # so the whole batch is skipped. A flag is needed because a 'continue' inside the
            # 'for' loop would only advance to the next output file, not to the next batch.
            output_files = compilation_result.output_files
            images_diffed = True
            for output_file in output_files:
                raster_success = raster_pages(
                    compiled_tex_dir,
                    os.path.join(raster_output_dir,
                                 directories.escape_slashes(output_file.path)),
                    output_file.path,
                    output_file.output_type,
                )
                if not raster_success:
                    logging.error(  # pylint: disable=logging-not-lazy
                        "Failed to rasterize pages for %s iteration %s. The locations for entities "
                        + "with IDs %s with not be detected.",
                        item.arxiv_id,
                        iteration_id,
                        batch,
                    )
                    images_diffed = False
                    break

                logging.debug(
                    "Attempting to diff rastered pages for paper %s iteration %s.",
                    item.arxiv_id,
                    iteration_id,
                )
                diff_success = diff_images_in_raster_dirs(
                    output_files,
                    raster_output_dir,
                    diffs_output_dir,
                    item.arxiv_id,
                )
                logging.debug(
                    "Finished diffing attempt for paper %s iteration %s. Success? %s.",
                    item.arxiv_id,
                    iteration_id,
                    diff_success,
                )
                if not diff_success:
                    logging.error(  # pylint: disable=logging-not-lazy
                        "Failed to difference images of original and colorized versions of "
                        +
                        "papers %s in batch processing iteration %s. The locations for entities with IDs "
                        + "%s will not be detected.",
                        item.arxiv_id,
                        iteration_id,
                        batch,
                    )
                    images_diffed = False
                    break

            if not images_diffed:
                continue

            # Locate the entities in the diffed images.
            logging.debug(
                "Attempting to locate entities using image differences for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )
            entity_hues = colorized_tex.entity_hues
            location_result = locate_entities(item.arxiv_id, raster_output_dir,
                                              diffs_output_dir, entity_hues)
            if location_result is None:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Error occurred when locating entities by hue in diffed images "
                    +
                    "for paper %s. None of the entities in batch %s will be detected.",
                    item.arxiv_id,
                    batch,
                )
                continue

            if self.should_sanity_check_images(
            ) and location_result.black_pixels_found:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Ignoring bounding boxes found for paper %s in batch %s due to "
                    +
                    "black pixels found in the images. This might indicate that the colorization "
                    + "commands introduced subtle shifts of the text.",
                    item.arxiv_id,
                    batch,
                )
                continue

            # If colorizing entities seemed to cause drift in the document...
            if len(location_result.shifted_entities) > 0:

                logging.warning(  # pylint: disable=logging-not-lazy
                    "Some entities shifted position in the colorized TeX for paper %s batch %s: "
                    +
                    "%s. Attempting to remove the first shifted entity from the batch.",
                    item.arxiv_id,
                    batch,
                    location_result.shifted_entities,
                )

                first_shifted_entity_id = None
                for entity_id in batch:
                    if entity_id in location_result.shifted_entities:
                        first_shifted_entity_id = entity_id
                        break

                if first_shifted_entity_id is not None:
                    if len(batch) > 1:
                        logging.info(  # pylint: disable=logging-not-lazy
                            "Entity %s has been marked as being the potential cause of shifting in "
                            +
                            "the colorized document for paper %s batch %d-%d. It will be processed "
                            +
                            "later on its own. The other shifted entities in %s will be queued to "
                            + "process as a group in an upcoming batch.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                            item.group,
                            batch_index,
                            location_result.shifted_entities,
                        )

                        # Get the index of the first entity for which the location has shifted
                        # during colorization.
                        moved_entity_index = batch.index(
                            first_shifted_entity_id)

                        # Mark all other entities that have shifted after the first one one to be processed
                        # in a later batch (instead of on their own). It could be that they won't shift
                        # once the first shifted entity is removed. Iterating from the end keeps the
                        # remaining indexes valid while items are deleted, and preserves the entities'
                        # relative order when they are pushed onto the front of the queue.
                        for i in range(len(batch) - 1, moved_entity_index, -1):
                            if batch[i] in location_result.shifted_entities:
                                to_process.appendleft(batch[i])
                                del batch[i]

                        # Mark the first entity that shifted to be reprocessed alone, where its position
                        # might be discoverable, without affecting the positions of other element.
                        del batch[moved_entity_index]
                        to_process_alone.append(first_shifted_entity_id)

                    elif len(batch) == 1 and self.should_sanity_check_images():
                        logging.info(  # pylint: disable=logging-not-lazy
                            "Skipping entity %s for paper %s as it caused " +
                            "colorization errors even when colorized in isolation.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                        )
                        continue
                    elif len(batch) == 1:
                        logging.info(  # pylint: disable=logging-not-lazy
                            "Entity %s has been marked as the cause of shifting in "
                            +
                            "the colorized document for paper %s. Its location will "
                            +
                            "still be saved (if one was found), though this location should be "
                            + "considered potentially inaccurate.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                        )

                else:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Could not find a single entity that was likely responsible for shifting in "
                        +
                        "the colorized version of paper %s batch %d-%d. All entities in batch %s will "
                        + "be processed on their own.",
                        item.arxiv_id,
                        item.group,
                        batch_index,
                        batch,
                    )
                    # NOTE(review): 'batch' is not emptied here, so locations for these entities
                    # are still yielded below, and may be yielded again once each entity has been
                    # processed on its own. Confirm whether that duplication is intended.
                    to_process_alone.extend(batch)

            logging.debug(
                "Finished attempt at locating entities with image diffs for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )

            # The code above is responsible for filtering 'batch' to ensure that it doesn't include
            # any entity IDs that shouldn't be saved to file, for example if the client has asked that
            # entity IDs that cause colorization errors be omitted from the results.
            for entity_id in batch:
                for box in location_result.locations[entity_id]:
                    yield HueLocationInfo(
                        tex_path=item.tex_path,
                        iteration=iteration_id,
                        hue=entity_hues[entity_id],
                        entity_id=entity_id,
                        page=box.page,
                        left=box.left,
                        top=box.top,
                        width=box.width,
                        height=box.height,
                    )

        _cleanup_from_last_batch()
Beispiel #9
0
    def load(self) -> Iterator[SearchTask]:
        """
        Yield one 'SearchTask' per hue search region and file to search.

        For each arXiv ID in 'self.arxiv_ids', the output directory for this command is
        cleaned, and then every iteration that has diff images is visited. The hue searches
        returned by 'self.load_hues' are grouped by the file they should be searched in: a
        search that names a 'relative_file_path' is searched only in that file, and any
        other search is searched in every output file of the compiled (uncolorized) paper.

        Unless 'self.args.skip_visual_validation' is set, each diff image is checked for
        black pixels. If any are found in the images for a file, a warning is logged and
        the remaining files of that iteration are skipped. Tasks that were already yielded
        for earlier files of the same iteration are not retracted.
        """
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir)

            # Get output file names from results of compiling the uncolorized TeX sources.
            output_files = get_output_files(
                directories.arxiv_subdir("compiled-sources", arxiv_id))
            # The list of candidate paths depends only on the paper, so build it once here
            # instead of rebuilding it for every hue search in every iteration.
            output_paths = [f.path for f in output_files]

            for iteration in directories.iteration_names(
                    self.get_diff_images_base_dirkey(), arxiv_id):

                diff_images_dir = directories.iteration(
                    self.get_diff_images_base_dirkey(), arxiv_id, iteration)

                # Group the searches by the file in which they should be performed.
                hue_searches = self.load_hues(arxiv_id, iteration)
                hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
                for search in hue_searches:
                    files_to_search = ([search.relative_file_path]
                                       if search.relative_file_path is not None
                                       else output_paths)
                    for path in files_to_search:
                        hue_searches_by_file.setdefault(path,
                                                        []).append(search)

                for relative_file_path, search_regions in hue_searches_by_file.items(
                ):

                    diff_images_file_path = os.path.join(
                        diff_images_dir, relative_file_path)
                    page_images = {}

                    colorization_error_detected = False
                    for img_name in os.listdir(diff_images_file_path):
                        img_path = os.path.join(diff_images_file_path,
                                                img_name)
                        # NOTE(review): cv2.imread returns None rather than raising when
                        # a file cannot be read; that case is not guarded against here.
                        page_image = cv2.imread(img_path)

                        if not self.args.skip_visual_validation:
                            if contains_black_pixels(page_image):
                                logging.warning(
                                    "Black pixels found in image diff %s",
                                    img_path)
                                colorization_error_detected = True

                        # The number left in the image name after removing "page-" and
                        # the extension is decremented to get the key for the page.
                        page_number = (int(
                            os.path.splitext(img_name)[0].replace("page-", ""))
                                       - 1)
                        page_images[page_number] = page_image

                    if colorization_error_detected:
                        logging.warning(  # pylint: disable=logging-not-lazy
                            "Colorization error detected. Skipping hue location for "
                            + "iteration %s for arXiv paper %s",
                            iteration,
                            arxiv_id,
                        )
                        # Leave the loop over files: nothing more is yielded for this
                        # iteration, and processing moves on to the next iteration.
                        break

                    for search_region in search_regions:
                        yield SearchTask(
                            arxiv_id,
                            iteration,
                            page_images,
                            relative_file_path,
                            search_region,
                        )
Beispiel #10
0
    def load_hues(self, arxiv_id: ArxivId,
                  iteration: str) -> List[HueSearchRegion]:
        """
        Build the hue searches for equation tokens, masked by equation bounding boxes.

        Bounding boxes are read from 'hue_locations.csv' in the paper's
        'hue-locations-for-equations' directory, and token colorization records are read
        from 'entity_hues.csv' in the 'sources-with-colorized-equation-tokens' directory
        for the given iteration. For each equation and each file that equation has boxes
        in, one 'HueSearchRegion' is returned per token record of that equation. Its masks
        are the equation's boxes in that file, grouped by page. Equations without any
        token records produce no searches.
        """
        # Step 1: collect the bounding boxes of every equation, grouped by file.
        boxes_csv_path = os.path.join(
            directories.arxiv_subdir("hue-locations-for-equations", arxiv_id),
            "hue_locations.csv",
        )
        boxes_by_equation: Dict[EquationId, BoundingBoxesByFile] = {}
        for row in file_utils.load_from_csv(boxes_csv_path, HueLocationInfo):
            key = EquationId(
                tex_path=row.tex_path,
                equation_index=int(row.entity_id),
            )
            boxes_for_equation = boxes_by_equation.setdefault(key, {})
            boxes_for_equation.setdefault(row.relative_file_path, []).append(
                BoundingBox(
                    page=row.page,
                    left=row.left,
                    top=row.top,
                    width=row.width,
                    height=row.height,
                ))

        # Step 2: index the token colorization records by equation and token index.
        hues_csv_path = os.path.join(
            directories.iteration(
                "sources-with-colorized-equation-tokens",
                arxiv_id,
                iteration,
            ),
            "entity_hues.csv",
        )
        tokens_by_equation: Dict[EquationId, Dict[
            int, EquationTokenColorizationRecord]] = {}
        for token_record in file_utils.load_from_csv(
                hues_csv_path, EquationTokenColorizationRecord):
            key = EquationId(tex_path=token_record.tex_path,
                             equation_index=token_record.equation_index)
            index = int(token_record.token_index)
            tokens_by_equation.setdefault(key, {})[index] = token_record

        # Step 3: pair every token of an equation with the masks for that equation.
        regions = []
        for key, boxes_for_equation in boxes_by_equation.items():
            for path, boxes in boxes_for_equation.items():
                page_masks: MasksForPages = {}
                for box in boxes:
                    page_masks.setdefault(box.page, []).append(
                        Rectangle(box.left, box.top, box.width, box.height))

                if key not in tokens_by_equation:
                    continue

                # All searches for this equation and file share the same masks object.
                regions.extend([
                    HueSearchRegion(
                        hue=token_record.hue,
                        record=token_record,
                        relative_file_path=path,
                        masks=page_masks,
                    ) for token_record in tokens_by_equation[key].values()
                ])

        return regions