Ejemplo n.º 1
0
def locate_entities(
    arxiv_id: ArxivId,
    modified_images_dir: RelativePath,
    diffed_images_dir: RelativePath,
    entity_hues: Dict[str, float],
) -> Optional[LocationResult]:
    """
    Locate the bounding boxes of hue-tagged entities in a paper's diff images.

    For every output file produced by compiling the uncolorized TeX sources of
    'arxiv_id', the page images found under 'diffed_images_dir' for that file are
    read, and bounding boxes are extracted for each hue in 'entity_hues' on each
    page. Entities reported by 'find_shifted_entities' for the file are collected
    as well.

    Args:
        arxiv_id: the paper whose compiled output files determine what to search.
        modified_images_dir: directory handed to 'find_shifted_entities'.
        diffed_images_dir: directory holding one subdirectory of diff images per
            output file.
        entity_hues: maps each entity ID to the hue to search for.

    Returns:
        A 'LocationResult' holding the boxes found for each entity ID, the IDs of
        shifted entities, and whether 'contains_black_pixels' was true for any diff
        image. Returns None as soon as the diff directory for any output file is
        missing.
    """
    # The files to search are the outputs of compiling the uncolorized TeX sources.
    compiled_sources_dir = directories.arxiv_subdir("compiled-sources", arxiv_id)
    relative_paths = [
        output_file.path for output_file in get_output_files(compiled_sources_dir)
    ]

    found_black_pixels = False
    shifted_ids: Set[str] = set()
    boxes_by_entity: Dict[str, List[BoundingBox]] = defaultdict(list)

    for relative_file_path in relative_paths:
        diff_dir = os.path.join(diffed_images_dir, relative_file_path)

        # A missing diff directory aborts the whole search, not just this file.
        if not os.path.exists(diff_dir):
            logging.warning(
                "Expected but could not find a directory %s from the image diffs. "
                "This suggests that the colorized paper failed to compile. Hues "
                "will not be searched for in this diff directory.",
                diff_dir,
            )
            return None

        # Read every page of the diff, keyed by the number in its file name minus one.
        pages = {}
        for file_name in os.listdir(diff_dir):
            file_path = os.path.join(diff_dir, file_name)
            loaded_page = cv2.imread(file_path)

            if contains_black_pixels(loaded_page):
                logging.warning("Black pixels found in image diff %s", file_path)
                found_black_pixels = True

            stem, _ = os.path.splitext(file_name)
            pages[int(stem.replace("page-", "")) - 1] = loaded_page

        # Search each page for each entity's hue. Boxes are appended one at a time so
        # that an entity with no boxes gets no entry in the result.
        for entity_id, hue in entity_hues.items():
            for page_number, page in pages.items():
                for box in extract_bounding_boxes(page, page_number, hue):
                    boxes_by_entity[entity_id].append(box)

        newly_shifted = find_shifted_entities(
            arxiv_id, modified_images_dir, relative_file_path, entity_hues
        )
        shifted_ids.update(newly_shifted)

    return LocationResult(
        locations=boxes_by_entity,
        shifted_entities=list(shifted_ids),
        black_pixels_found=found_black_pixels,
    )
Ejemplo n.º 2
0
    def load(self) -> Iterator[RasterTask]:
        """
        Yield one 'RasterTask' per output file of each paper's normalized sources.

        Before any task is yielded for an arXiv ID, that paper's "paper-images"
        directory is cleaned. The output files are those reported by
        'get_output_files' for the paper's "compiled-normalized-sources" directory.
        """
        for arxiv_id in self.arxiv_ids:
            # Remove whatever a previous run left behind for this arXiv ID.
            file_utils.clean_directory(
                directories.arxiv_subdir("paper-images", arxiv_id)
            )

            compiled_dir = directories.arxiv_subdir(
                "compiled-normalized-sources", arxiv_id
            )
            for compiled_file in get_output_files(compiled_dir):
                yield RasterTask(
                    arxiv_id, compiled_file.output_type, compiled_file.path
                )
Ejemplo n.º 3
0
    def load(self) -> Iterator[PageRasterPair]:
        """
        Yield pairs of original and modified page images to be diffed.

        For each arXiv ID, the command's output directory is cleaned first. Papers
        for which 'get_output_files' reports no output files are skipped. Otherwise,
        for every iteration of modified rasters and every output file, each image in
        the paper's "paper-images" directory for that file is paired with the image
        of the same name in the iteration's directory, and both are read with
        'cv2.imread'.

        If a modified image is missing, a warning is logged and the remaining images
        of that output file are skipped for that iteration.
        """
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id))

            # The files to diff are the outputs of compiling the uncolorized TeX sources.
            compiled_dir = directories.arxiv_subdir("compiled-sources", arxiv_id)
            output_files = get_output_files(compiled_dir)
            if len(output_files) == 0:
                continue

            iterations = directories.iteration_names(
                self.get_raster_base_dirkey(), arxiv_id)
            for iteration in iterations:
                original_root = directories.arxiv_subdir("paper-images", arxiv_id)
                modified_root = directories.iteration(
                    self.get_raster_base_dirkey(), arxiv_id, iteration)

                for output_file in output_files:
                    relative_file_path = output_file.path
                    original_pages_dir = os.path.join(
                        original_root, relative_file_path)
                    modified_pages_dir = os.path.join(
                        modified_root, relative_file_path)

                    for img_name in os.listdir(original_pages_dir):
                        modified_page_path = os.path.join(
                            modified_pages_dir, img_name)
                        if not os.path.exists(modified_page_path):
                            logging.warning(
                                "Could not find expected image %s. Skipping diff for this paper.",
                                modified_page_path,
                            )
                            # NOTE(review): this 'break' leaves only the loop over this
                            # file's images; later output files and iterations of the
                            # paper are still processed, although the log message
                            # refers to the whole paper. Confirm which is intended.
                            break

                        original_page_path = os.path.join(
                            original_pages_dir, img_name)
                        original_page = cv2.imread(original_page_path)
                        modified_page = cv2.imread(modified_page_path)
                        yield PageRasterPair(
                            arxiv_id,
                            iteration,
                            relative_file_path,
                            img_name,
                            original_page,
                            modified_page,
                        )
Ejemplo n.º 4
0
    def load(self) -> Iterator[RasterTask]:
        """
        Yield one 'RasterTask' per output file in each paper directory of each paper.

        For each arXiv ID, the command's output directory is cleaned first. Each
        directory returned by 'self.get_paper_dirs' is resolved against the base
        directory named by 'self.get_papers_base_dirkey()', and a task is yielded for
        every output file 'get_output_files' reports there. Each task carries the
        paper directory, the output type, the file's relative path, and that path
        joined onto the paper directory's resolved path.
        """
        for arxiv_id in self.arxiv_ids:
            # Remove whatever a previous run left behind for this arXiv ID.
            file_utils.clean_directory(
                directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id))

            for paper_dir in self.get_paper_dirs(arxiv_id):
                papers_root = directories.dirpath(self.get_papers_base_dirkey())
                paper_abs_path = os.path.join(papers_root, paper_dir)

                for compiled_file in get_output_files(paper_abs_path):
                    yield RasterTask(
                        paper_dir,
                        compiled_file.output_type,
                        compiled_file.path,
                        os.path.join(paper_abs_path, compiled_file.path),
                    )
Ejemplo n.º 5
0
    def load(self) -> Iterator[SearchTask]:
        """
        Yield one 'SearchTask' per hue search region, output file, and iteration.

        For each arXiv ID, the command's output directory is cleaned first. Then, for
        every iteration of diff images, the hue searches returned by 'self.load_hues'
        are grouped by the output file they apply to: a search with a
        'relative_file_path' applies to that file only, and a search without one
        applies to every output file of the compiled, uncolorized sources. The diff
        images of each file are read into a dictionary keyed by the number in the
        image's file name minus one; that dictionary is shared by all tasks yielded
        for the file.

        Unless 'self.args.skip_visual_validation' is set, every diff image is checked
        with 'contains_black_pixels'. If any image of a file has black pixels, a
        warning is logged and no tasks are yielded for that file or for the files of
        the iteration that have not been processed yet. Tasks already yielded for
        earlier files of the iteration are not retracted.
        """
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir)

            # Get output file names from results of compiling the uncolorized TeX sources.
            output_files = get_output_files(
                directories.arxiv_subdir("compiled-sources", arxiv_id))
            # Built once per paper: the list depends on neither the iteration nor the
            # individual hue search, so there is no need to rebuild it in those loops.
            output_paths = [f.path for f in output_files]

            for iteration in directories.iteration_names(
                    self.get_diff_images_base_dirkey(), arxiv_id):

                diff_images_dir = directories.iteration(
                    self.get_diff_images_base_dirkey(), arxiv_id, iteration)

                # Group the searches by the file whose diff images they apply to.
                hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
                for search in self.load_hues(arxiv_id, iteration):
                    files_to_search = ([search.relative_file_path]
                                       if search.relative_file_path is not None
                                       else output_paths)
                    for path in files_to_search:
                        hue_searches_by_file.setdefault(path, []).append(search)

                for relative_file_path, search_regions in hue_searches_by_file.items():
                    diff_images_file_path = os.path.join(
                        diff_images_dir, relative_file_path)
                    page_images = {}

                    colorization_error_detected = False
                    for img_name in os.listdir(diff_images_file_path):
                        img_path = os.path.join(diff_images_file_path, img_name)
                        page_image = cv2.imread(img_path)

                        # Every image is checked (no early exit) so that each
                        # offending image gets its own warning.
                        if (not self.args.skip_visual_validation
                                and contains_black_pixels(page_image)):
                            logging.warning(
                                "Black pixels found in image diff %s", img_path)
                            colorization_error_detected = True

                        page_name = os.path.splitext(img_name)[0]
                        page_number = int(page_name.replace("page-", "")) - 1
                        page_images[page_number] = page_image

                    if colorization_error_detected:
                        logging.warning(
                            "Colorization error detected. Skipping hue location for "
                            "iteration %s for arXiv paper %s",
                            iteration,
                            arxiv_id,
                        )
                        # Abandon the rest of this iteration, not just this file.
                        break

                    for search_region in search_regions:
                        yield SearchTask(
                            arxiv_id,
                            iteration,
                            page_images,
                            relative_file_path,
                            search_region,
                        )