Ejemplo n.º 1
0
 def get_source_dirs(self, arxiv_id: ArxivId) -> Iterator[RelativePath]:
     """
     Yield every directory that should be compiled for an arXiv ID, one per
     iteration name listed under the sources base dir key. Paths should be
     relative to the sources base dir. Subclasses may override this method.
     """
     sources_dirkey = self.get_sources_base_dirkey()
     yield from (
         directories.relpath_arxiv_id_iteration(arxiv_id, name)
         for name in directories.iteration_names(sources_dirkey, arxiv_id)
     )
Ejemplo n.º 2
0
    def load(self) -> Iterator[PageRasterPair]:
        """
        Yield a PageRasterPair for every page image of every paper, pairing the
        image found under "paper-images" with the same-named image found under
        an iteration of the raster base dir.

        For each arXiv ID the output directory is cleaned first. Papers for
        which no output files are found under "compiled-sources" are skipped.
        Both images of a pair are read with 'cv2.imread'.
        """
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
            )

            # Output file names come from the results of compiling the
            # uncolorized TeX sources.
            compiled_dir = directories.arxiv_subdir("compiled-sources", arxiv_id)
            output_files = get_output_files(compiled_dir)
            if len(output_files) == 0:
                continue

            for iteration in directories.iteration_names(
                self.get_raster_base_dirkey(), arxiv_id
            ):
                originals_root = directories.arxiv_subdir("paper-images", arxiv_id)
                modified_root = directories.iteration(
                    self.get_raster_base_dirkey(), arxiv_id, iteration
                )

                for output_file in output_files:
                    relative_file_path = output_file.path
                    originals_dir = os.path.join(originals_root, relative_file_path)
                    modified_dir = os.path.join(modified_root, relative_file_path)

                    # The listing of the original images drives the pairing; a
                    # modified image with the same name is expected to exist.
                    for img_name in os.listdir(originals_dir):
                        original_img_path = os.path.join(originals_dir, img_name)
                        modified_img_path = os.path.join(modified_dir, img_name)

                        if not os.path.exists(modified_img_path):
                            logging.warning(
                                "Could not find expected image %s. Skipping diff for this paper.",
                                modified_img_path,
                            )
                            # NOTE: this only abandons the remaining images of
                            # the current output file; pairs already yielded
                            # stay yielded and the next output file is still
                            # processed.
                            break

                        yield PageRasterPair(
                            arxiv_id,
                            iteration,
                            relative_file_path,
                            img_name,
                            cv2.imread(original_img_path),
                            cv2.imread(modified_img_path),
                        )
Ejemplo n.º 3
0
    def load(self) -> Iterator[LocationTask]:
        """
        Yield one LocationTask per citation key of each paper, carrying every
        bounding box collected for that key across all iterations of
        "sources-with-colorized-citations".

        For each arXiv ID the "citation-cluster-locations" directory is cleaned
        first. Papers for which 'load_citation_hue_locations' returns None are
        skipped, as are iterations that lack an "entity_hues.csv" file. A key
        whose hues match no boxes still produces a task with an empty box list.
        """
        colorized_dirkey = "sources-with-colorized-citations"

        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir("citation-cluster-locations", arxiv_id)
            )

            boxes_by_hue_iteration = file_utils.load_citation_hue_locations(arxiv_id)
            if boxes_by_hue_iteration is None:
                continue

            boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
            for iteration in directories.iteration_names(colorized_dirkey, arxiv_id):
                iteration_dir = directories.iteration(
                    colorized_dirkey, arxiv_id, iteration
                )
                citation_hues_path = os.path.join(iteration_dir, "entity_hues.csv")
                if not os.path.exists(citation_hues_path):
                    logging.warning(
                        "Could not find citation hue colors for %s iteration %s. Skipping",
                        arxiv_id,
                        iteration,
                    )
                    continue

                records = file_utils.load_from_csv(
                    citation_hues_path, ColorizationRecord
                )
                for record in records:
                    # Register the key even when no boxes match its hue.
                    key_boxes = boxes_by_citation_key.setdefault(record.entity_id, [])
                    key_boxes.extend(
                        boxes_by_hue_iteration.get(
                            HueIteration(record.hue, iteration), []
                        )
                    )

            for citation_key, key_boxes in boxes_by_citation_key.items():
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    citation_key=citation_key,
                    boxes=key_boxes,
                )
Ejemplo n.º 4
0
    def load(self) -> Iterator[SearchTask]:
        """
        Yield one SearchTask per (file, hue search region) for every iteration
        of the diff-images base dir of every paper.

        For each arXiv ID the output directory is cleaned first. Hue searches
        returned by 'self.load_hues' are grouped by the file they apply to; a
        search whose 'relative_file_path' is None is applied to every output
        file found under "compiled-sources". All page images of a file are
        read with 'cv2.imread' and keyed by zero-based page number, parsed from
        image names of the form "page-<N>.<ext>".

        Unless 'self.args.skip_visual_validation' is set, a file whose diff
        images contain black pixels ends processing of the current iteration.
        """
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir)

            # Get output file names from results of compiling the uncolorized TeX sources.
            output_files = get_output_files(
                directories.arxiv_subdir("compiled-sources", arxiv_id))
            # Depends only on the paper, so build it once here instead of once
            # per hue search per iteration. It is only read, never mutated.
            output_paths = [f.path for f in output_files]

            for iteration in directories.iteration_names(
                    self.get_diff_images_base_dirkey(), arxiv_id):

                diff_images_dir = directories.iteration(
                    self.get_diff_images_base_dirkey(), arxiv_id, iteration)

                hue_searches = self.load_hues(arxiv_id, iteration)
                hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
                for search in hue_searches:
                    # A search not tied to a file applies to all output files.
                    files_to_search = ([search.relative_file_path]
                                       if search.relative_file_path is not None
                                       else output_paths)
                    for path in files_to_search:
                        hue_searches_by_file.setdefault(path, []).append(search)

                for relative_file_path, search_regions in hue_searches_by_file.items():

                    diff_images_file_path = os.path.join(
                        diff_images_dir, relative_file_path)
                    page_images = {}

                    colorization_error_detected = False
                    for img_name in os.listdir(diff_images_file_path):
                        img_path = os.path.join(diff_images_file_path,
                                                img_name)
                        page_image = cv2.imread(img_path)

                        if not self.args.skip_visual_validation:
                            if contains_black_pixels(page_image):
                                logging.warning(
                                    "Black pixels found in image diff %s",
                                    img_path)
                                # Keep scanning so every offending image is logged.
                                colorization_error_detected = True

                        # "page-3.png" -> 2: strip the extension and the
                        # "page-" prefix, then shift to a zero-based index.
                        page_number = (int(
                            os.path.splitext(img_name)[0].replace("page-", ""))
                                       - 1)
                        page_images[page_number] = page_image

                    if colorization_error_detected:
                        logging.warning(
                            "Colorization error detected. Skipping hue location for "
                            "iteration %s for arXiv paper %s",
                            iteration,
                            arxiv_id,
                        )
                        # Abandons the remaining files of this iteration. Tasks
                        # already yielded for earlier files are not retracted.
                        break

                    for search_region in search_regions:
                        yield SearchTask(
                            arxiv_id,
                            iteration,
                            page_images,
                            relative_file_path,
                            search_region,
                        )
Ejemplo n.º 5
0
 def get_paper_dirs(self, arxiv_id: ArxivId) -> Iterator[RelativePath]:
     """
     Yield one path per iteration name listed for 'arxiv_id' under the papers
     base dir key, each built by 'directories.relpath_arxiv_id_iteration'.
     """
     papers_dirkey = self.get_papers_base_dirkey()
     yield from (
         directories.relpath_arxiv_id_iteration(arxiv_id, name)
         for name in directories.iteration_names(papers_dirkey, arxiv_id)
     )