def get_source_dirs(self, arxiv_id: ArxivId) -> Iterator[RelativePath]:
    """
    Yield every directory that should be compiled for an arXiv ID.

    One path is produced for each iteration name that
    'directories.iteration_names' reports for the sources base dir key.
    Paths are relative to the sources base dir.
    Subclasses may override this method.
    """
    sources_dirkey = self.get_sources_base_dirkey()
    yield from (
        directories.relpath_arxiv_id_iteration(arxiv_id, iteration_name)
        for iteration_name in directories.iteration_names(sources_dirkey, arxiv_id)
    )
def load(self) -> Iterator[PageRasterPair]:
    """
    Yield pairs of original and modified page rasters for each paper.

    For every arXiv ID in 'self.arxiv_ids' the output directory is cleaned.
    Then, for each iteration of the raster directory and each compiled output
    file, every image listed in the paper's "paper-images" directory is paired
    with the image of the same name in the iteration's directory. Both images
    are read with 'cv2.imread'. Papers without compiled output files yield
    nothing.
    """
    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
        )

        # Output file names come from the results of compiling the
        # uncolorized TeX sources.
        compiled_sources_dir = directories.arxiv_subdir("compiled-sources", arxiv_id)
        output_files = get_output_files(compiled_sources_dir)
        if len(output_files) == 0:
            continue

        for iteration in directories.iteration_names(
            self.get_raster_base_dirkey(), arxiv_id
        ):
            originals_root = directories.arxiv_subdir("paper-images", arxiv_id)
            modified_root = directories.iteration(
                self.get_raster_base_dirkey(), arxiv_id, iteration
            )

            for output_file in output_files:
                relative_file_path = output_file.path
                originals_for_file = os.path.join(originals_root, relative_file_path)

                for img_name in os.listdir(originals_for_file):
                    original_path = os.path.join(originals_for_file, img_name)
                    modified_path = os.path.join(
                        modified_root, relative_file_path, img_name
                    )

                    if not os.path.exists(modified_path):
                        logging.warning(
                            "Could not find expected image %s. Skipping diff for this paper.",
                            modified_path,
                        )
                        # NOTE(review): this only leaves the image loop for the
                        # current output file; later output files and iterations
                        # of the same paper are still processed.
                        break

                    yield PageRasterPair(
                        arxiv_id,
                        iteration,
                        relative_file_path,
                        img_name,
                        cv2.imread(original_path),
                        cv2.imread(modified_path),
                    )
def load(self) -> Iterator[LocationTask]:
    """
    Yield one location task per citation key for each paper.

    For every arXiv ID in 'self.arxiv_ids' the "citation-cluster-locations"
    directory is cleaned and the citation hue locations are loaded; papers
    for which that load returns None are skipped. The "entity_hues.csv" file
    of each colorization iteration is then read, and the boxes found for each
    record's (hue, iteration) are accumulated under the record's entity ID.
    A key whose hues matched no boxes still produces a task with an empty
    box list.
    """
    colorized_dirkey = "sources-with-colorized-citations"

    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir("citation-cluster-locations", arxiv_id)
        )

        hue_locations = file_utils.load_citation_hue_locations(arxiv_id)
        if hue_locations is None:
            continue

        boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}

        for iteration in directories.iteration_names(colorized_dirkey, arxiv_id):
            iteration_dir = directories.iteration(colorized_dirkey, arxiv_id, iteration)
            hues_csv_path = os.path.join(iteration_dir, "entity_hues.csv")

            if not os.path.exists(hues_csv_path):
                logging.warning(
                    "Could not find citation hue colors for %s iteration %s. Skipping",
                    arxiv_id,
                    iteration,
                )
                continue

            for record in file_utils.load_from_csv(hues_csv_path, ColorizationRecord):
                # Register the key even when no boxes are found for its hue.
                citation_boxes = boxes_by_citation_key.setdefault(record.entity_id, [])
                hue_iteration = HueIteration(record.hue, iteration)
                citation_boxes.extend(hue_locations.get(hue_iteration, []))

        # Emit tasks only after all iterations of the paper have been merged.
        for citation_key, boxes in boxes_by_citation_key.items():
            yield LocationTask(
                arxiv_id=arxiv_id,
                citation_key=citation_key,
                boxes=boxes,
            )
def load(self) -> Iterator[SearchTask]:
    """
    Yield one search task per hue search region and output file.

    For every arXiv ID in 'self.arxiv_ids' the output directory is cleaned.
    For each iteration of the diff-images directory, the hue searches returned
    by 'self.load_hues' are grouped by the file they apply to: a search with a
    'relative_file_path' applies to that file only, any other search applies
    to every compiled output file. The diff images of each file are read with
    'cv2.imread' and keyed by zero-based page number, parsed from image names
    of the form "page-<N>.<ext>".

    Unless 'self.args.skip_visual_validation' is set, each image is checked
    with 'contains_black_pixels'. If any image of a file fails that check, no
    tasks are yielded for that file or for the remaining files of the
    iteration.
    """
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
        file_utils.clean_directory(output_dir)

        # Get output file names from results of compiling the uncolorized TeX sources.
        output_files = get_output_files(
            directories.arxiv_subdir("compiled-sources", arxiv_id)
        )
        # The list of output paths does not depend on the iteration or on the
        # individual search, so build it once per paper.
        output_paths = [f.path for f in output_files]

        for iteration in directories.iteration_names(
            self.get_diff_images_base_dirkey(), arxiv_id
        ):
            diff_images_dir = directories.iteration(
                self.get_diff_images_base_dirkey(), arxiv_id, iteration
            )

            # Group the searches by the file in which they should be run.
            hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
            for search in self.load_hues(arxiv_id, iteration):
                if search.relative_file_path is not None:
                    files_to_search = [search.relative_file_path]
                else:
                    files_to_search = output_paths
                for path in files_to_search:
                    hue_searches_by_file.setdefault(path, []).append(search)

            for relative_file_path, search_regions in hue_searches_by_file.items():
                diff_images_file_path = os.path.join(diff_images_dir, relative_file_path)

                # Maps zero-based page number to the loaded diff image.
                page_images = {}
                colorization_error_detected = False

                for img_name in os.listdir(diff_images_file_path):
                    img_path = os.path.join(diff_images_file_path, img_name)
                    page_image = cv2.imread(img_path)

                    if not self.args.skip_visual_validation:
                        if contains_black_pixels(page_image):
                            logging.warning(
                                "Black pixels found in image diff %s", img_path
                            )
                            colorization_error_detected = True

                    # Image names are 1-based ("page-1"); page numbers are 0-based.
                    page_number = (
                        int(os.path.splitext(img_name)[0].replace("page-", "")) - 1
                    )
                    page_images[page_number] = page_image

                if colorization_error_detected:
                    logging.warning(
                        "Colorization error detected. Skipping hue location for "
                        "iteration %s for arXiv paper %s",
                        iteration,
                        arxiv_id,
                    )
                    # Abandons the remaining files of this iteration. Tasks
                    # already yielded for earlier files are not retracted.
                    break

                for search_region in search_regions:
                    yield SearchTask(
                        arxiv_id,
                        iteration,
                        page_images,
                        relative_file_path,
                        search_region,
                    )
def get_paper_dirs(self, arxiv_id: ArxivId) -> Iterator[RelativePath]:
    """
    Yield one relative directory path per iteration for an arXiv ID.

    Iteration names are looked up with 'directories.iteration_names' under
    the papers base dir key, and each one is turned into a path with
    'directories.relpath_arxiv_id_iteration'.
    """
    papers_dirkey = self.get_papers_base_dirkey()
    for iteration_name in directories.iteration_names(papers_dirkey, arxiv_id):
        paper_dir = directories.relpath_arxiv_id_iteration(arxiv_id, iteration_name)
        yield paper_dir