def load_hues(self, arxiv_id: ArxivId, iteration: str) -> List[HueSearchRegion]:
    hues_path = os.path.join(
        directories.iteration(
            f"sources-with-colorized-{entity_name}",
            arxiv_id,
            iteration,
        ),
        "entity_hues.csv",
    )
    if not os.path.exists(hues_path):
        logging.warning("Could not find any hues at %s", hues_path)
        return []

    searches = []
    for record in file_utils.load_from_csv(hues_path, ColorizationRecord):
        searches.append(
            HueSearchRegion(
                hue=record.hue,
                record=record,
                relative_file_path=None,
                masks=None,
            )
        )
    return searches

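# For reference, a minimal sketch of what HueSearchRegion might look like, inferred
# from the keyword arguments above and from the equation-token loader at the end of
# this section; the actual definition may differ. 'record' holds the colorization
# record the hue came from, and 'masks' maps page numbers to rectangles that
# restrict where the hue is searched for.
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

@dataclass(frozen=True)
class HueSearchRegion:
    hue: float
    record: Any
    relative_file_path: Optional[str]
    masks: Optional[Dict[int, List["Rectangle"]]]
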
def save(self, item: PageRasterPair, result: np.ndarray) -> None:
    output_dir = directories.iteration(
        self.get_output_base_dirkey(), item.arxiv_id, item.iteration
    )
    image_path = os.path.join(output_dir, item.relative_path, item.image_name)
    image_dir = os.path.dirname(image_path)
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)

    cv2.imwrite(image_path, result)
    logging.debug("Diffed images and stored result at %s", image_path)

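# How the diff passed in as `result` is computed is not shown in this section. A
# minimal sketch of one approach consistent with the checks elsewhere in this
# pipeline (changed pixels keep their color from the modified raster, everything
# else is white); `diff_page_images` is a hypothetical helper, and it assumes both
# rasters have the same dimensions.
def diff_page_images(original: np.ndarray, modified: np.ndarray) -> np.ndarray:
    changed = np.any(original != modified, axis=2)  # HxW mask of changed pixels
    diff = np.full_like(modified, 255)  # start from an all-white page
    diff[changed] = modified[changed]
    return diff
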
def load(self) -> Iterator[PageRasterPair]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
        file_utils.clean_directory(output_dir)

        # Get output file names from results of compiling the uncolorized TeX sources.
        output_files = get_output_files(
            directories.arxiv_subdir("compiled-sources", arxiv_id)
        )
        if len(output_files) == 0:
            continue

        for iteration in directories.iteration_names(
            self.get_raster_base_dirkey(), arxiv_id
        ):
            original_images_dir = directories.arxiv_subdir("paper-images", arxiv_id)
            modified_images_dir = directories.iteration(
                self.get_raster_base_dirkey(), arxiv_id, iteration
            )

            for output_file in output_files:
                relative_file_path = output_file.path
                original_images_path = os.path.join(
                    original_images_dir, relative_file_path
                )
                for img_name in os.listdir(original_images_path):
                    original_img_path = os.path.join(original_images_path, img_name)
                    modified_img_path = os.path.join(
                        modified_images_dir, relative_file_path, img_name
                    )
                    if not os.path.exists(modified_img_path):
                        logging.warning(
                            "Could not find expected image %s. Skipping diff for this paper.",
                            modified_img_path,
                        )
                        break

                    original_img = cv2.imread(original_img_path)
                    modified_img = cv2.imread(modified_img_path)
                    yield PageRasterPair(
                        arxiv_id,
                        iteration,
                        relative_file_path,
                        img_name,
                        original_img,
                        modified_img,
                    )

def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("citation-cluster-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        boxes_by_hue_iteration = file_utils.load_citation_hue_locations(arxiv_id)
        if boxes_by_hue_iteration is None:
            continue

        boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
        for iteration in directories.iteration_names(
            "sources-with-colorized-citations", arxiv_id
        ):
            citation_hues_path = os.path.join(
                directories.iteration(
                    "sources-with-colorized-citations",
                    arxiv_id,
                    iteration,
                ),
                "entity_hues.csv",
            )
            if not os.path.exists(citation_hues_path):
                logging.warning(
                    "Could not find citation hue colors for %s iteration %s. Skipping",
                    arxiv_id,
                    iteration,
                )
                continue

            for record in file_utils.load_from_csv(
                citation_hues_path, ColorizationRecord
            ):
                key = record.entity_id
                if key not in boxes_by_citation_key:
                    boxes_by_citation_key[key] = []
                hue_iteration = HueIteration(record.hue, iteration)
                boxes_by_citation_key[key].extend(
                    boxes_by_hue_iteration.get(hue_iteration, [])
                )

        for key, boxes in boxes_by_citation_key.items():
            yield LocationTask(
                arxiv_id=arxiv_id,
                citation_key=key,
                boxes=boxes,
            )

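# `HueIteration` is used above as a dictionary key, so it must be hashable. A
# minimal sketch of a definition consistent with that usage (field names are
# assumptions inferred from the positional construction above):
from typing import NamedTuple

class HueIteration(NamedTuple):
    hue: float
    iteration: str
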
def save(self, item: TexAndTokens, result: ColorizationResult) -> None:
    iteration = result.iteration
    iteration_id = f"all-files-{iteration}"
    output_sources_path = directories.iteration(
        "sources-with-colorized-equation-tokens",
        item.arxiv_id,
        iteration_id,
    )
    logging.debug("Outputting to %s", output_sources_path)

    # Create new directory for each colorization iteration.
    unpack_path = unpack(item.arxiv_id, output_sources_path)
    sources_unpacked = unpack_path is not None
    if unpack_path is None:
        logging.warning("Could not unpack sources into %s", output_sources_path)

    if sources_unpacked:
        for tex_path, colorized_tex in result.result.colorized_files.items():
            full_tex_path = os.path.join(output_sources_path, tex_path)
            with open(
                full_tex_path, "w", encoding=colorized_tex.encoding
            ) as tex_file:
                tex_file.write(colorized_tex.contents)

        hues_path = os.path.join(output_sources_path, "entity_hues.csv")
        for colorized_token in result.result.colorized_tokens:
            file_utils.append_to_csv(
                hues_path,
                EquationTokenColorizationRecord(
                    entity_id=(
                        str(colorized_token.equation_index)
                        + "-"
                        + str(colorized_token.token_index)
                    ),
                    hue=colorized_token.hue,
                    tex_path=colorized_token.tex_path,
                    iteration=str(iteration),
                    equation_index=colorized_token.equation_index,
                    token_index=colorized_token.token_index,
                    start=colorized_token.start,
                    end=colorized_token.end,
                    text=colorized_token.text,
                ),
            )

def save(self, item: ColorizationTask, result: ColorizationResult) -> None:
    iteration = result.iteration
    colorized_tex = result.tex
    colorized_citations = result.colorized_citations

    iteration_id = directories.tex_iteration(item.tex_path, str(iteration))
    output_sources_path = directories.iteration(
        "sources-with-colorized-citations",
        item.arxiv_id,
        iteration_id,
    )
    logging.debug("Outputting to %s", output_sources_path)

    # Create new directory for each colorization iteration for each TeX file.
    unpack_path = unpack(item.arxiv_id, output_sources_path)
    sources_unpacked = unpack_path is not None
    if unpack_path is None:
        logging.warning("Could not unpack sources into %s", output_sources_path)

    if sources_unpacked:
        tex_path = os.path.join(output_sources_path, item.tex_path)
        with open(tex_path, "w", encoding=item.file_contents.encoding) as tex_file:
            tex_file.write(colorized_tex)

        hues_path = os.path.join(output_sources_path, "entity_hues.csv")
        # TODO(andrewhead): It might be better to save this CSV data with the same
        # encoding as the file the TeX was read from, for the citations, for the
        # equations, and for the symbols. There might be some gotchas for character
        # positions not lining up between the ones we save using Unicode here and the
        # positions in the intended encoding in the original files.
        for c in colorized_citations:
            record = ColorizationRecord(
                hue=c.hue,
                entity_id=c.key,
                tex_path=item.tex_path,
                iteration=iteration_id,
            )
            file_utils.append_to_csv(hues_path, record)

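# `file_utils.append_to_csv` is not defined in this section. A minimal sketch of the
# behavior the callers above rely on (append one record per row, writing a header
# when the file is new), assuming records are flat dataclasses; the real helper may
# handle encodings and errors differently.
import csv
import dataclasses
from typing import Any

def append_to_csv(csv_path: str, record: Any) -> None:
    fields = [f.name for f in dataclasses.fields(record)]
    write_header = not os.path.exists(csv_path)
    with open(csv_path, "a", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fields)
        if write_header:
            writer.writeheader()
        writer.writerow(dataclasses.asdict(record))
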
def save(self, item: ColorizationTask, result: ColorizationResult) -> None:
    iteration = result.iteration
    colorized_tex = result.tex
    entity_hues = result.entity_hues

    iteration_id = directories.tex_iteration(item.tex_path, str(iteration))
    output_sources_path = directories.iteration(
        self.get_output_base_dirkey(),
        item.arxiv_id,
        iteration_id,
    )
    logging.debug("Outputting to %s", output_sources_path)

    # Each colorization batch gets a new sources directory.
    unpack_path = unpack(item.arxiv_id, output_sources_path)
    sources_unpacked = unpack_path is not None
    if unpack_path is None:
        logging.warning("Could not unpack sources into %s", output_sources_path)

    if sources_unpacked:
        # Rewrite the TeX with the colorized TeX.
        tex_path = os.path.join(output_sources_path, item.tex_path)
        with open(tex_path, "w", encoding=item.file_contents.encoding) as tex_file:
            tex_file.write(colorized_tex)

        # Save a log of which hues were assigned to which entities.
        hues_path = os.path.join(output_sources_path, "entity_hues.csv")
        for hue, entity in entity_hues:
            file_utils.append_to_csv(
                hues_path,
                ColorizationRecord(
                    tex_path=item.tex_path,
                    iteration=str(iteration),
                    hue=hue,
                    entity_id=entity.id_,
                ),
            )

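# For reference, a minimal sketch of what ColorizationRecord might look like,
# inferred from the four fields written above and read by the loaders earlier in
# this section; the actual field order and types may differ.
from dataclasses import dataclass

@dataclass(frozen=True)
class ColorizationRecord:
    tex_path: str
    iteration: str
    hue: float
    entity_id: str
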
def process(self, item: LocationTask) -> Iterator[HueLocationInfo]:
    # Filter out entities that are empty (i.e., have nothing to color).
    # A '-1' in the 'start' or 'end' field indicates that the entity does not occur in a
    # specific place in the TeX, but rather a custom coloring technique based on other
    # entity properties will be used. So entities that have a '-1' for their start and
    # end should still be processed even though they appear to be zero-length.
    entities_filtered = [
        e for e in item.entities
        if e.start == -1 or e.end == -1 or e.start != e.end
    ]

    # Sort entities by the order in which they appear in the TeX. This allows the pipeline
    # to keep track of which ones appear first, when trying to recover from errors (i.e., when
    # trying to detect which entity in a batch may have shifted to cause many others to move.)
    entities_ordered = sorted(entities_filtered, key=lambda e: e.start)

    # Construct a queue of entities to detect.
    entities_by_id = {e.id_: e for e in entities_ordered}
    to_process = deque([e.id_ for e in entities_ordered])
    to_process_alone: Deque[str] = deque()

    # Paths to output directories. These directories will be redefined once for each batch.
    colorized_tex_dir: Optional[str] = None
    compiled_tex_dir: Optional[str] = None
    raster_output_dir: Optional[str] = None
    diffs_output_dir: Optional[str] = None

    # Iteration state.
    batch_index = -1
    iteration_id = None

    def next_batch() -> List[str]:
        """
        Get the next batch of entities to process. First tries to sample a batch from
        'to_process', and then attempts to sample individual entities from
        'to_process_alone'.
        """
        if len(to_process) > 0:
            return [
                to_process.popleft()
                for _ in range(min(self.args.batch_size, len(to_process)))
            ]
        return [to_process_alone.popleft()]

    def _cleanup_from_last_batch() -> None:
        """Clean up output directories from the last batch."""
        if batch_index > -1 and not self.args.keep_intermediate_files:
            logging.debug(  # pylint: disable=logging-not-lazy
                "Deleting intermediate files used to locate entities (i.e., colorized "
                + "sources, compilation results, and rasters) for paper %s iteration %s",
                item.arxiv_id,
                iteration_id or "''",
            )
            intermediate_files_dirs = [
                colorized_tex_dir,
                compiled_tex_dir,
                raster_output_dir,
                diffs_output_dir,
            ]
            for dir_ in intermediate_files_dirs:
                if dir_ and os.path.exists(dir_):
                    file_utils.clean_directory(dir_)
                    os.rmdir(dir_)

    while len(to_process) > 0 or len(to_process_alone) > 0:
        if batch_index > -1:
            _cleanup_from_last_batch()

        batch_index += 1
        logging.debug(
            "Locating bounding boxes for batch %d-%d of entities of type %s for paper %s.",
            item.group,
            batch_index,
            self.get_entity_name(),
            item.arxiv_id,
        )
        iteration_id = directories.tex_iteration(
            item.tex_path, f"{item.group}-{batch_index}"
        )

        # Define output directory locations for this batch.
        colorized_tex_dir = directories.iteration(
            self.output_base_dirs["sources"], item.arxiv_id, iteration_id
        )
        compiled_tex_dir = directories.iteration(
            self.output_base_dirs["compiled-sources"],
            item.arxiv_id,
            iteration_id,
        )
        raster_output_dir = directories.iteration(
            self.output_base_dirs["paper-images"], item.arxiv_id, iteration_id
        )
        diffs_output_dir = directories.iteration(
            self.output_base_dirs["diffed-images"], item.arxiv_id, iteration_id
        )

        # Fetch the next batch of entities to process.
        batch = next_batch()
        entities: List[SerializableEntity] = [entities_by_id[id_] for id_ in batch]

        # Colorize the TeX for all the entities.
        custom_colorize_func = self.get_colorize_func()
        logging.debug(
            "Attempting to colorize entities in TeX for entity batch %d-%d of paper %s.",
            item.group,
            batch_index,
            item.arxiv_id,
        )
        if custom_colorize_func is not None:
            colorized_tex = custom_colorize_func(
                item.file_contents.contents, entities, self.get_colorize_options()
            )
            if len(colorized_tex.entity_hues) == 0:
                logging.info(  # pylint: disable=logging-not-lazy
                    "Custom colorization function colored nothing for entity batch %d-%d of "
                    + "paper %s when coloring file %s. The function probably decided there was "
                    + "nothing to do for this file, and will hopefully colorize these "
                    + "entities in another file. Skipping this batch for this file.",
                    item.group,
                    batch_index,
                    item.arxiv_id,
                    item.file_contents.path,
                )
                continue
        else:
            colorized_tex = colorize_entities(
                item.file_contents.contents, entities, self.get_colorize_options()
            )

        # If some entities were skipped during colorization, perhaps because they
        # overlapped with each other, add them back to the work queue.
        if colorized_tex.skipped is not None and len(colorized_tex.skipped) > 0:
            logging.info(  # pylint: disable=logging-not-lazy
                "Entities %s were skipped during colorization batch %d-%d for paper "
                + "%s. They will be processed in a later batch.",
                [e.id_ for e in colorized_tex.skipped],
                item.group,
                batch_index,
                item.arxiv_id,
            )

            # Queue skipped entities in the order that they initially appeared in the batch.
            reprocess_ids = {e.id_ for e in colorized_tex.skipped}
            reprocess_sorted = [id_ for id_ in batch if id_ in reprocess_ids]
            to_process.extendleft(reversed(reprocess_sorted))

            # Remove skipped entities from the current batch.
            for skip in colorized_tex.skipped:
                del batch[batch.index(skip.id_)]

        # Save the colorized TeX to the file system.
        save_success = save_colorized_tex(
            item.arxiv_id,
            colorized_tex_dir,
            item.tex_path,
            iteration_id,
            colorized_tex.tex,
            item.file_contents.encoding,
            colorized_tex.entity_hues,
        )
        logging.debug(
            "Finished attempting to colorize entities for entity batch %d-%d of paper %s.",
            item.group,
            batch_index,
            item.arxiv_id,
        )
        if not save_success:
            logging.error(  # pylint: disable=logging-not-lazy
                "Failed to save colorized TeX files for arXiv paper %s. "
                "This paper will be skipped.",
                item.arxiv_id,
            )
            # Abort processing of the rest of this paper.
            return

        # Compile the TeX with the colors.
        shutil.copytree(colorized_tex_dir, compiled_tex_dir)
        compilation_result = compile_tex(compiled_tex_dir)
        save_compilation_result(
            "compiled-sources", item.arxiv_id, compiled_tex_dir, compilation_result
        )
        if not compilation_result.success:

            # If colorizing a specific entity caused the failure, remove the entity that caused
            # the problem from the batch and restart with a new batch, minus this entity.
            last_colorized_entity_id = get_last_colorized_entity(
                item.arxiv_id, compiled_tex_dir
            )
            if last_colorized_entity_id is not None:
                problem_ids = [last_colorized_entity_id]
                if batch.index(last_colorized_entity_id) < len(batch) - 1:
                    problem_ids += [batch[batch.index(last_colorized_entity_id) + 1]]

                if len(batch) == 1:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Failed to compile paper %s with colorized entity %s, even when it was "
                        + "colorized in isolation. The location of this entity will not be detected.",
                        item.arxiv_id,
                        batch[0],
                    )
                    continue

                logging.warning(  # pylint: disable=logging-not-lazy
                    "Failed to compile paper %s with colorized entities. The culprit may be "
                    + "the colorization command for entity %s. The problematic entities will be "
                    + "colorized on their own, and the rest of the entities will be colorized "
                    + "together in the next batch.",
                    item.arxiv_id,
                    " or ".join(problem_ids),
                )
                for id_ in problem_ids:
                    to_process_alone.append(id_)
                    del batch[batch.index(id_)]
                to_process.extendleft(reversed(batch))
                continue

            # If there was some other reason for the error, remove just the first entity from the batch.
            logging.error(  # pylint: disable=logging-not-lazy
                "Failed to compile paper %s with colorized entities %s. The cause "
                + "is assumed to be in the first colorized entity. The location for the "
                + "first entity %s will not be detected. The remainder of the entities in "
                + "this batch will be processed in another batch.",
                item.arxiv_id,
                batch,
                batch[0],
            )
            del batch[0]
            to_process.extendleft(reversed(batch))
            continue

        # Raster the pages to images, and compute diffs from the original images.
        output_files = compilation_result.output_files
        raster_success = True
        for output_file in output_files:
            raster_success = raster_pages(
                compiled_tex_dir,
                os.path.join(
                    raster_output_dir, directories.escape_slashes(output_file.path)
                ),
                output_file.path,
                output_file.output_type,
            )
            if not raster_success:
                logging.error(  # pylint: disable=logging-not-lazy
                    "Failed to rasterize pages for %s iteration %s. The locations for entities "
                    + "with IDs %s will not be detected.",
                    item.arxiv_id,
                    iteration_id,
                    batch,
                )
                break
        if not raster_success:
            # Without rasters there is nothing to diff; skip this batch.
            continue

        logging.debug(
            "Attempting to diff rastered pages for paper %s iteration %s.",
            item.arxiv_id,
            iteration_id,
        )
        diff_success = diff_images_in_raster_dirs(
            output_files,
            raster_output_dir,
            diffs_output_dir,
            item.arxiv_id,
        )
        logging.debug(
            "Finished diffing attempt for paper %s iteration %s. Success? %s.",
            item.arxiv_id,
            iteration_id,
            diff_success,
        )
        if not diff_success:
            logging.error(  # pylint: disable=logging-not-lazy
                "Failed to difference images of original and colorized versions of "
                + "paper %s in batch processing iteration %s. The locations for entities with IDs "
                + "%s will not be detected.",
                item.arxiv_id,
                iteration_id,
                batch,
            )
            continue

        # Locate the entities in the diffed images.
        logging.debug(
            "Attempting to locate entities using image differences for paper %s iteration %s.",
            item.arxiv_id,
            iteration_id,
        )
        entity_hues = colorized_tex.entity_hues
        location_result = locate_entities(
            item.arxiv_id, raster_output_dir, diffs_output_dir, entity_hues
        )
        if location_result is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Error occurred when locating entities by hue in diffed images "
                + "for paper %s. None of the entities in batch %s will be detected.",
                item.arxiv_id,
                batch,
            )
            continue

        if self.should_sanity_check_images() and location_result.black_pixels_found:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Ignoring bounding boxes found for paper %s in batch %s due to "
                + "black pixels found in the images. This might indicate that the colorization "
                + "commands introduced subtle shifts of the text.",
                item.arxiv_id,
                batch,
            )
            continue

        # If colorizing entities seemed to cause drift in the document...
        if len(location_result.shifted_entities) > 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Some entities shifted position in the colorized TeX for paper %s batch %s: "
                + "%s. Attempting to remove the first shifted entity from the batch.",
                item.arxiv_id,
                batch,
                location_result.shifted_entities,
            )

            first_shifted_entity_id = None
            for entity_id in batch:
                if entity_id in location_result.shifted_entities:
                    first_shifted_entity_id = entity_id
                    break

            if first_shifted_entity_id is not None:
                if len(batch) > 1:
                    logging.info(  # pylint: disable=logging-not-lazy
                        "Entity %s has been marked as being the potential cause of shifting in "
                        + "the colorized document for paper %s batch %d-%d. It will be processed "
                        + "later on its own. The other shifted entities in %s will be queued to "
                        + "process as a group in an upcoming batch.",
                        first_shifted_entity_id,
                        item.arxiv_id,
                        item.group,
                        batch_index,
                        location_result.shifted_entities,
                    )

                    # Get the index of the first entity for which the location has shifted
                    # during colorization.
                    moved_entity_index = batch.index(first_shifted_entity_id)

                    # Mark all other entities that have shifted after the first one to be processed
                    # in a later batch (instead of on their own). It could be that they won't shift
                    # once the first shifted entity is removed.
                    for i in range(len(batch) - 1, moved_entity_index, -1):
                        if batch[i] in location_result.shifted_entities:
                            to_process.appendleft(batch[i])
                            del batch[i]

                    # Mark the first entity that shifted to be reprocessed alone, where its position
                    # might be discoverable, without affecting the positions of other elements.
                    del batch[moved_entity_index]
                    to_process_alone.append(first_shifted_entity_id)

                elif len(batch) == 1 and self.should_sanity_check_images():
                    logging.info(  # pylint: disable=logging-not-lazy
                        "Skipping entity %s for paper %s as it caused "
                        + "colorization errors even when colorized in isolation.",
                        first_shifted_entity_id,
                        item.arxiv_id,
                    )
                    continue
                elif len(batch) == 1:
                    logging.info(  # pylint: disable=logging-not-lazy
                        "Entity %s has been marked as the cause of shifting in "
                        + "the colorized document for paper %s. Its location will "
                        + "still be saved (if one was found), though this location should be "
                        + "considered potentially inaccurate.",
                        first_shifted_entity_id,
                        item.arxiv_id,
                    )
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Could not find a single entity that was likely responsible for shifting in "
                    + "the colorized version of paper %s batch %d-%d. All entities in batch %s will "
                    + "be processed on their own.",
                    item.arxiv_id,
                    item.group,
                    batch_index,
                    batch,
                )
                to_process_alone.extend(batch)

        logging.debug(
            "Finished attempt at locating entities with image diffs for paper %s iteration %s.",
            item.arxiv_id,
            iteration_id,
        )

        # The code above is responsible for filtering 'batch' to ensure that it doesn't include
        # any entity IDs that shouldn't be saved to file, for example if the client has asked that
        # entity IDs that cause colorization errors be omitted from the results.
        for entity_id in batch:
            for box in location_result.locations[entity_id]:
                yield HueLocationInfo(
                    tex_path=item.tex_path,
                    iteration=iteration_id,
                    hue=entity_hues[entity_id],
                    entity_id=entity_id,
                    page=box.page,
                    left=box.left,
                    top=box.top,
                    width=box.width,
                    height=box.height,
                )

    _cleanup_from_last_batch()

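# Note on the requeueing logic in process() above: deque.extendleft inserts items
# one at a time at the left, reversing their order, so pairing it with reversed()
# keeps a batch in its original order at the front of the queue. A quick
# illustration using only the standard library:
from collections import deque

q = deque(["d", "e"])
q.extendleft(reversed(["a", "b", "c"]))
assert list(q) == ["a", "b", "c", "d", "e"]
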
def load(self) -> Iterator[SearchTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
        file_utils.clean_directory(output_dir)

        # Get output file names from results of compiling the uncolorized TeX sources.
        output_files = get_output_files(
            directories.arxiv_subdir("compiled-sources", arxiv_id)
        )

        for iteration in directories.iteration_names(
            self.get_diff_images_base_dirkey(), arxiv_id
        ):
            diff_images_dir = directories.iteration(
                self.get_diff_images_base_dirkey(), arxiv_id, iteration
            )

            hue_searches = self.load_hues(arxiv_id, iteration)
            hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
            for search in hue_searches:
                output_paths = [f.path for f in output_files]
                files_to_search = (
                    [search.relative_file_path]
                    if search.relative_file_path is not None
                    else output_paths
                )
                for path in files_to_search:
                    if path not in hue_searches_by_file:
                        hue_searches_by_file[path] = []
                    hue_searches_by_file[path].append(search)

            for relative_file_path, search_regions in hue_searches_by_file.items():
                diff_images_file_path = os.path.join(
                    diff_images_dir, relative_file_path
                )
                page_images = {}
                colorization_error_detected = False
                for img_name in os.listdir(diff_images_file_path):
                    img_path = os.path.join(diff_images_file_path, img_name)
                    page_image = cv2.imread(img_path)

                    if not self.args.skip_visual_validation:
                        if contains_black_pixels(page_image):
                            logging.warning(
                                "Black pixels found in image diff %s", img_path
                            )
                            colorization_error_detected = True

                    page_number = (
                        int(os.path.splitext(img_name)[0].replace("page-", "")) - 1
                    )
                    page_images[page_number] = page_image

                if colorization_error_detected:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Colorization error detected. Skipping hue location for "
                        + "iteration %s for arXiv paper %s",
                        iteration,
                        arxiv_id,
                    )
                    break

                for search_region in search_regions:
                    yield SearchTask(
                        arxiv_id,
                        iteration,
                        page_images,
                        relative_file_path,
                        search_region,
                    )

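# `contains_black_pixels` is used as a sanity check above but is not defined in
# this section. A minimal sketch of one way to implement it, assuming diff images
# are BGR uint8 arrays where unchanged content is white and colorized pixels keep
# their hue, so near-black pixels signal unintended text movement. The threshold
# of 40 is an arbitrary choice for this sketch.
import numpy as np

def contains_black_pixels(image: np.ndarray) -> bool:
    # A pixel counts as "black" if all three channels are near zero.
    return bool(np.all(image < 40, axis=2).any())
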
def load_hues(self, arxiv_id: ArxivId, iteration: str) -> List[HueSearchRegion]:
    equation_boxes_path = os.path.join(
        directories.arxiv_subdir("hue-locations-for-equations", arxiv_id),
        "hue_locations.csv",
    )
    bounding_boxes: Dict[EquationId, BoundingBoxesByFile] = {}

    for location_info in file_utils.load_from_csv(
        equation_boxes_path, HueLocationInfo
    ):
        equation_id = EquationId(
            tex_path=location_info.tex_path,
            equation_index=int(location_info.entity_id),
        )
        if equation_id not in bounding_boxes:
            bounding_boxes[equation_id] = {}

        file_path = location_info.relative_file_path
        if file_path not in bounding_boxes[equation_id]:
            bounding_boxes[equation_id][file_path] = []

        box = BoundingBox(
            page=location_info.page,
            left=location_info.left,
            top=location_info.top,
            width=location_info.width,
            height=location_info.height,
        )
        bounding_boxes[equation_id][file_path].append(box)

    token_records_by_equation: Dict[
        EquationId, Dict[int, EquationTokenColorizationRecord]
    ] = {}
    token_hues_path = os.path.join(
        directories.iteration(
            "sources-with-colorized-equation-tokens",
            arxiv_id,
            iteration,
        ),
        "entity_hues.csv",
    )
    for record in file_utils.load_from_csv(
        token_hues_path, EquationTokenColorizationRecord
    ):
        equation_id = EquationId(
            tex_path=record.tex_path, equation_index=record.equation_index
        )
        token_index = int(record.token_index)
        if equation_id not in token_records_by_equation:
            token_records_by_equation[equation_id] = {}
        token_records_by_equation[equation_id][token_index] = record

    hue_searches = []
    for equation_id, boxes_by_file in bounding_boxes.items():
        for file_path, boxes in boxes_by_file.items():
            masks_by_page: MasksForPages = {}
            for box in boxes:
                if box.page not in masks_by_page:
                    masks_by_page[box.page] = []
                masks_by_page[box.page].append(
                    Rectangle(box.left, box.top, box.width, box.height)
                )

            if equation_id in token_records_by_equation:
                for token_index, record in token_records_by_equation[
                    equation_id
                ].items():
                    hue_searches.append(
                        HueSearchRegion(
                            hue=record.hue,
                            record=record,
                            relative_file_path=file_path,
                            masks=masks_by_page,
                        )
                    )

    return hue_searches

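# The masks built above restrict where token hues are searched for on each page. A
# minimal sketch of how a search might apply them; `find_hue_in_masks` is a
# hypothetical helper, and it assumes BGR page images, OpenCV's hue range of
# [0, 180), hues stored normalized to [0, 1), and Rectangle attributes named
# left/top/width/height (matching the positional construction above).
def find_hue_in_masks(
    page_image: np.ndarray,
    hue: float,
    masks: List[Rectangle],
    tolerance: float = 2.0,
) -> np.ndarray:
    hsv = cv2.cvtColor(page_image, cv2.COLOR_BGR2HSV)
    target = hue * 180.0  # OpenCV stores hue in [0, 180)
    matches = np.abs(hsv[:, :, 0].astype(float) - target) <= tolerance
    # Zero out everything outside the equation's bounding boxes.
    allowed = np.zeros(matches.shape, dtype=bool)
    for m in masks:
        allowed[m.top : m.top + m.height, m.left : m.left + m.width] = True
    return matches & allowed
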