def load(self) -> Iterator[DetectDefinitionsTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("detected-definitions", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load cleaned sentences for definition detection.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("sentence-tokens", arxiv_id),
            "sentences.csv",
        )
        try:
            sentences = list(
                file_utils.load_from_csv(detected_sentences_path, EmbellishedSentence)
            )
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detected sentences for this paper.",
                arxiv_id,
            )
            continue

        # Read in all TeX. Once definition detection is finished, all the TeX will be
        # searched for references to the defined terms.
        tex_by_file = file_utils.read_tex(arxiv_id)

        yield DetectDefinitionsTask(arxiv_id, sentences, tex_by_file)
def load(self) -> Iterator[ColorizationTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-colorized-citations", arxiv_id
        )
        file_utils.clean_directory(output_root)

        bibitems_path = os.path.join(
            directories.arxiv_subdir("bibitems", arxiv_id), "bibitems.csv"
        )
        if not os.path.exists(bibitems_path):
            logging.warning("No bibitems were found for paper %s. Skipping", arxiv_id)
            continue

        bibitems = file_utils.load_from_csv(bibitems_path, Bibitem)
        bibitem_keys = [b.key for b in bibitems if b.key is not None]

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            if file_contents is not None:
                yield ColorizationTask(arxiv_id, tex_path, file_contents, bibitem_keys)
def load(self) -> Iterator[SymbolSentencesTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentences-for-symbols", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_sentences_path = os.path.join(
            directories.arxiv_subdir("sentences-for-equation-tokens", arxiv_id),
            "entity_sentences.csv",
        )
        if not os.path.exists(token_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not find links between sentences and equation tokens at "
                + "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                token_sentences_path,
                arxiv_id,
            )
            continue

        token_sentence_pairs = list(
            file_utils.load_from_csv(token_sentences_path, EntitySentencePairIds)
        )

        symbols = file_utils.load_symbols(arxiv_id)
        if not symbols:
            continue

        # Filter to only those symbols for which tokens have been detected.
        symbols = [s for s in symbols if len(s.symbol.characters) > 0]

        yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)
def load(self) -> Iterator[ColorizationTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_root)

        entities_path = os.path.join(
            directories.arxiv_subdir(self.get_detected_entities_dirkey(), arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
        )

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            entities_for_tex_path = [e for e in entities if e.tex_path == tex_path]
            if file_contents is not None:
                yield ColorizationTask(
                    arxiv_id, tex_path, file_contents, entities_for_tex_path
                )
def load(self) -> Iterator[TexAndTokens]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-colorized-equation-tokens", arxiv_id
        )
        file_utils.clean_directory(output_root)

        tokens_path = os.path.join(
            directories.arxiv_subdir("detected-equation-tokens", arxiv_id),
            "entities.csv",
        )
        if not os.path.exists(tokens_path):
            logging.info("No equation token data found for paper %s. Skipping.", arxiv_id)
            continue

        # Load token location information.
        tokens = file_utils.load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = set({token.tex_path for token in tokens})

        # Load original sources for TeX files that need to be colorized.
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(
                directories.arxiv_subdir("sources", arxiv_id), tex_path
            )
            file_contents = file_utils.read_file_tolerant(absolute_tex_path)
            if file_contents is not None:
                contents_by_file[tex_path] = file_contents

        yield TexAndTokens(arxiv_id, contents_by_file, tokens)
def load(self) -> Iterator[ArxivId]:
    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
        )
        file_utils.clean_directory(
            directories.arxiv_subdir("detected-symbols", arxiv_id)
        )
        yield arxiv_id
def load(self) -> Iterator[MatchTask]:
    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir("bibitem-resolutions", arxiv_id)
        )

        bibitems_dir = directories.arxiv_subdir("detected-citations", arxiv_id)
        metadata_dir = directories.arxiv_subdir("s2-metadata", arxiv_id)

        references_path = os.path.join(metadata_dir, "references.csv")
        if not os.path.exists(references_path):
            logging.warning(
                "Could not find %s, skipping reference resolution for paper %s",
                references_path,
                arxiv_id,
            )
            # Skip only this paper; keep processing the remaining papers.
            continue
        references = list(
            file_utils.load_from_csv(references_path, SerializableReference)
        )

        bibitems_path = os.path.join(bibitems_dir, "entities.csv")
        if not os.path.exists(bibitems_path):
            logging.warning(
                "Could not find %s, skipping reference resolution for paper %s",
                bibitems_path,
                arxiv_id,
            )
            continue
        bibitems = list(file_utils.load_from_csv(bibitems_path, Bibitem))

        yield MatchTask(arxiv_id, bibitems, references)
def load(self) -> Iterator[CitationData]:
    for arxiv_id in self.arxiv_ids:
        # Load citation locations.
        citation_locations = load_located_citations(arxiv_id)
        if citation_locations is None:
            continue

        # Load metadata for bibitems.
        key_s2_ids: Dict[CitationKey, S2Id] = {}
        key_resolutions_path = os.path.join(
            directories.arxiv_subdir("bibitem-resolutions", arxiv_id),
            "resolutions.csv",
        )
        if not os.path.exists(key_resolutions_path):
            logging.warning("Could not find citation resolutions for %s. Skipping", arxiv_id)
            continue
        for resolution in file_utils.load_from_csv(key_resolutions_path, BibitemMatch):
            if resolution.key is not None:
                key_s2_ids[resolution.key] = resolution.s2_id

        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        s2_data: Dict[S2Id, SerializableReference] = {}
        s2_metadata_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "references.csv"
        )
        if not os.path.exists(s2_metadata_path):
            logging.warning(
                "Could not find S2 metadata file for citations for %s. Skipping",
                arxiv_id,
            )
            continue
        for metadata in file_utils.load_from_csv(s2_metadata_path, SerializableReference):
            # Convert authors field to comma-delimited list of authors.
            author_string = ",".join(
                [a["name"] for a in ast.literal_eval(metadata.authors)]
            )
            metadata = dataclasses.replace(metadata, authors=author_string)
            s2_data[metadata.s2_id] = metadata

        yield CitationData(
            arxiv_id,
            s2_id,
            citation_locations,
            key_s2_ids,
            s2_data,
        )
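The authors conversion above implies that the serialized 'authors' field is a Python-literal list of dicts with a "name" key; the exact field format is an assumption here, inferred only from how ast.literal_eval and a["name"] are used. A minimal, self-contained sketch of that transformation with a made-up value:

# Illustration only: a hypothetical serialized 'authors' value, showing what the
# ast.literal_eval + join conversion in the loader above produces.
import ast

serialized_authors = "[{'name': 'Ada Lovelace'}, {'name': 'Alan Turing'}]"
author_string = ",".join(a["name"] for a in ast.literal_eval(serialized_authors))
assert author_string == "Ada Lovelace,Alan Turing"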
def load(self) -> Iterator[CompilationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("compiled-sources", arxiv_id)
        if os.path.exists(output_dir):
            logging.warning(
                "Compilation directory already exists in %s. Deleting.",
                output_dir,
            )
            shutil.rmtree(output_dir)
        shutil.copytree(directories.arxiv_subdir("sources", arxiv_id), output_dir)
        yield CompilationTask(arxiv_id, output_dir)
def load(self) -> Iterator[ExtractionTask]:
    for arxiv_id in self.arxiv_ids:
        sources_dir = directories.arxiv_subdir("sources", arxiv_id)
        file_utils.clean_directory(
            directories.arxiv_subdir("detected-citations", arxiv_id)
        )
        for path in file_utils.find_files(sources_dir, [".tex", ".bbl"]):
            file_contents = file_utils.read_file_tolerant(path)
            if file_contents is None:
                continue
            yield ExtractionTask(arxiv_id, file_contents)
def save(self, item: RasterTask, _: None) -> None:
    raster_pages(
        directories.arxiv_subdir("compiled-normalized-sources", item.arxiv_id),
        os.path.join(
            directories.arxiv_subdir("paper-images", item.arxiv_id),
            directories.escape_slashes(item.relative_output_file_path),
        ),
        item.relative_output_file_path,
        item.output_file_type,
    )
def load(self) -> Iterator[PaperProcessingResult]:
    for arxiv_id in self.arxiv_ids:
        # Load the S2 ID for this paper.
        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        # Load in all extracted entities. See note in 'colorize_tex.py' for why entities
        # might be saved in multiple files. If they are, for this upload function to work,
        # each of the entities needs to have a unique pair of 'ID' and 'tex_path'.
        entities_dir = directories.arxiv_subdir(
            self.get_detected_entities_dirkey(), arxiv_id
        )
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(
                    entities_path,
                    self.get_detected_entity_type(os.path.basename(entities_path)),
                )
            )

        # Load in locations of all detected hues.
        hue_locations_path = os.path.join(
            directories.arxiv_subdir(self.get_hue_locations_dirkey(), arxiv_id),
            "entity_locations.csv",
        )
        hue_location_infos = list(
            file_utils.load_from_csv(hue_locations_path, HueLocationInfo)
        )

        # Group each entity with its locations. Pass the entity information, and the
        # detected locations for the entity, to the upload function.
        localized_entities = []
        for entity in entities:
            matching_locations = []
            for h in hue_location_infos:
                if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                    matching_locations.append(h)
            localized_entities.append(EntityAndLocation(entity, matching_locations))

        yield PaperProcessingResult(
            arxiv_id=arxiv_id,
            s2_id=s2_id,
            localized_entities=localized_entities,
        )
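The grouping loop above scans the full list of hue locations once per entity. A minimal sketch, not the pipeline's code, of how that lookup could be pre-indexed by (tex_path, entity_id); the helper name group_locations_by_entity is introduced here for illustration, and only field names already used in the loader above are assumed:

# Sketch only: group locations by (tex_path, entity_id) so each entity's locations
# are found with a single dictionary lookup instead of a scan over all locations.
from collections import defaultdict
from typing import Dict, List, Tuple


def group_locations_by_entity(
    entities: List[SerializableEntity],
    hue_location_infos: List[HueLocationInfo],
) -> List[EntityAndLocation]:
    locations_by_key: Dict[Tuple[str, str], List[HueLocationInfo]] = defaultdict(list)
    for h in hue_location_infos:
        locations_by_key[(h.tex_path, h.entity_id)].append(h)
    return [
        EntityAndLocation(e, locations_by_key[(e.tex_path, e.id_)]) for e in entities
    ]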
def read_tex(arxiv_id: str) -> Dict[str, FileContents]:
    """
    Read the contents of all TeX files for this arXiv paper.
    """
    contents_by_file = {}
    sources_path = directories.arxiv_subdir("sources", arxiv_id)
    for tex_path in find_files(sources_path, [".tex"], relative=True):
        absolute_tex_path = os.path.join(sources_path, tex_path)
        file_contents = read_file_tolerant(absolute_tex_path)
        if file_contents is not None:
            contents_by_file[tex_path] = file_contents
    return contents_by_file
def load(self) -> Iterator[PaperProcessingResult]:
    for arxiv_id in self.arxiv_ids:
        # Load the S2 ID for this paper.
        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        # Load in all extracted entities.
        entities_path = os.path.join(
            directories.arxiv_subdir(self.get_detected_entities_dirkey(), arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
        )

        # Load in locations of all detected hues.
        hue_locations_path = os.path.join(
            directories.arxiv_subdir(self.get_hue_locations_dirkey(), arxiv_id),
            "hue_locations.csv",
        )
        hue_location_infos = list(
            file_utils.load_from_csv(hue_locations_path, HueLocationInfo)
        )

        # Group each entity with its locations. Pass the entity information, and the
        # detected locations for the entity, to the upload function.
        localized_entities = []
        for entity in entities:
            matching_locations = []
            for h in hue_location_infos:
                if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                    matching_locations.append(h)
            localized_entities.append(EntityAndLocation(entity, matching_locations))

        yield PaperProcessingResult(
            arxiv_id=arxiv_id,
            s2_id=s2_id,
            localized_entities=localized_entities,
        )
def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            f"contexts-for-{self.get_entity_name()}", arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Load entities from file. See note in 'colorize_tex.py' for why entities might
        # be saved in multiple files. If they are, each entity needs to have a unique
        # pair of 'ID' and 'tex_path'.
        entities_dir = directories.arxiv_subdir(
            f"detected-{self.get_entity_name()}", arxiv_id
        )
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_entity_type())
            )

        # Load sentences from file.
        sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id), "entities.csv"
        )
        try:
            sentences = list(file_utils.load_from_csv(sentences_path, Sentence))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there was likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        tex_paths = {e.tex_path for e in entities}
        for tex_path in tex_paths:
            entities_for_file = [e for e in entities if e.tex_path == tex_path]
            sentences_for_file = [s for s in sentences if s.tex_path == tex_path]
            yield Task(arxiv_id, tex_path, entities_for_file, sentences_for_file)
def load(self) -> Iterator[DetectionTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_root)

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            if file_contents is not None:
                yield DetectionTask(arxiv_id, tex_path, file_contents)
def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        for output_base_dir in self.output_base_dirs.values():
            file_utils.clean_directory(
                directories.arxiv_subdir(output_base_dir, arxiv_id)
            )

        # A directory of entities may contain files for each of multiple types of
        # entities. One example is that the definition detector detects both terms and
        # definitions. In that case, the colorizer colorizes all entities from all of
        # these files. Earlier entity extractor commands should include enough
        # information in the entity IDs so that the type of entities can be inferred
        # from the entity ID in later commands.
        entities_dir = directories.arxiv_subdir(self.get_input_dirkey(), arxiv_id)
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
            )

        main_tex_files = get_compiled_tex_files(
            directories.arxiv_subdir("compiled-normalized-sources", arxiv_id)
        )
        normalized_sources_path = directories.arxiv_subdir(
            "normalized-sources", arxiv_id
        )
        for tex_file in main_tex_files:
            file_contents = file_utils.read_file_tolerant(
                os.path.join(normalized_sources_path, tex_file.path)
            )
            options = self.get_colorize_options()
            entities_for_tex_path = [
                e for e in entities
                if e.tex_path == tex_file.path or e.tex_path == "N/A"
            ]
            if options.when is not None:
                entities_for_tex_path = list(
                    filter(options.when, entities_for_tex_path)
                )
            if file_contents is not None:
                group_func = options.group or (lambda entities: [entities])
                for group_index, entity_group in enumerate(
                    group_func(entities_for_tex_path)
                ):
                    yield LocationTask(
                        arxiv_id,
                        tex_file.path,
                        file_contents,
                        entity_group,
                        group_index,
                    )
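For illustration only, a hedged sketch of the kind of callbacks the options.when and options.group hooks used above could carry: 'when' filters individual entities and 'group' splits the filtered entities into batches that are colorized separately. The callback bodies and the start/end attributes they reference are assumptions for this sketch, not the pipeline's actual colorization options.

# Hypothetical callbacks matching how 'options.when' and 'options.group' are used
# in the loader above.
from typing import List


def when(entity: SerializableEntity) -> bool:
    # Assumed predicate: keep only entities that span at least one character of TeX.
    return entity.end > entity.start


def group(entities: List[SerializableEntity]) -> List[List[SerializableEntity]]:
    # Batch entities in fixed-size chunks of 100 so each chunk is processed separately.
    return [entities[i : i + 100] for i in range(0, len(entities), 100)]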
def save(self, item: SearchTask, result: HueLocation) -> None:
    logging.debug(
        "Found bounding box for %s, iteration %s, hue %f",
        item.relative_file_path,
        item.iteration,
        result.hue,
    )

    output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(), item.arxiv_id)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_path = os.path.join(output_dir, "hue_locations.csv")
    file_utils.append_to_csv(
        output_path,
        HueLocationInfo(
            tex_path=item.search.record.tex_path,
            iteration=item.iteration,
            hue=result.hue,
            entity_id=item.search.record.entity_id,
            page=result.box.page,
            left=result.box.left,
            top=result.box.top,
            width=result.box.width,
            height=result.box.height,
            relative_file_path=item.relative_file_path,
        ),
    )
def load(self) -> Iterator[RasterTask]:
    for arxiv_id in self.arxiv_ids:
        # Clean all past output for this arXiv ID.
        output_dir_for_arxiv_id = directories.arxiv_subdir("paper-images", arxiv_id)
        file_utils.clean_directory(output_dir_for_arxiv_id)

        paper_abs_path = directories.arxiv_subdir(
            "compiled-normalized-sources", arxiv_id
        )
        output_files = get_output_files(paper_abs_path)
        for output_file in output_files:
            yield RasterTask(
                arxiv_id,
                output_file.output_type,
                output_file.path,
            )
def save(self, item: LocationTask, result: CitationLocation) -> None:
    output_dir = directories.arxiv_subdir("citation-locations", item.arxiv_id)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    locations_path = os.path.join(output_dir, "citation_locations.csv")
    file_utils.append_to_csv(locations_path, result)
def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("composite-symbols-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_locations = file_utils.load_equation_token_locations(arxiv_id)
        if token_locations is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        for symbol_with_id in symbols_with_ids:
            # Symbols with affixes (e.g., arrows, hats) cannot be localized by taking
            # the union of their tokens' bounding boxes, because the bounding boxes of
            # affix tokens cannot be detected on their own.
            if not symbol_with_id.symbol.contains_affix:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    token_locations=token_locations,
                    symbol_with_id=symbol_with_id,
                )
def update_compilation_log(
    output_dir_key: str,
    arxiv_id: ArxivId,
    stdout: bytes,
    source_path: RelativePath,
    success: bool,
) -> None:
    arxiv_id_output_root = directories.arxiv_subdir(output_dir_key, arxiv_id)
    results_path = os.path.join(arxiv_id_output_root, "compilation_results.csv")

    missing_driver = is_driver_unimplemented(stdout)
    errors = list(get_errors(stdout))

    if missing_driver:
        logging.warning(  # pylint: disable=logging-not-lazy
            "Could not compile arXiv ID %s because colorization commands are missing for the "
            + "driver needed to compile that TeX project.",
            arxiv_id,
        )

    # Write the compilation result to the log.
    file_utils.append_to_csv(
        results_path,
        CompilationSummaryEntry(
            outcome="SUCCESS" if success else "FAILURE",
            source_path=source_path,
            missing_driver=missing_driver,
            errors=[e.decode("utf-8", "ignore") for e in errors],
        ),
    )
def locate_entities(
    arxiv_id: ArxivId,
    modified_images_dir: RelativePath,
    diffed_images_dir: RelativePath,
    entity_hues: Dict[str, float],
) -> Optional[LocationResult]:

    # Get output file names from results of compiling the uncolorized TeX sources.
    output_files = get_output_files(
        directories.arxiv_subdir("compiled-sources", arxiv_id)
    )
    output_paths = [f.path for f in output_files]

    black_pixels_found = False
    shifted_entity_ids: Set[str] = set()
    entity_locations: Dict[str, List[BoundingBox]] = defaultdict(list)

    for relative_file_path in output_paths:
        diffed_images_file_path = os.path.join(diffed_images_dir, relative_file_path)

        # Locate bounding boxes for each hue in the diffs.
        diff_images = {}
        if not os.path.exists(diffed_images_file_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Expected but could not find a directory %s from the image diffs. "
                + "This suggests that the colorized paper failed to compile. Hues "
                + "will not be searched for in this diff directory.",
                diffed_images_file_path,
            )
            return None

        for img_name in os.listdir(diffed_images_file_path):
            img_path = os.path.join(diffed_images_file_path, img_name)
            page_image = cv2.imread(img_path)

            if contains_black_pixels(page_image):
                logging.warning("Black pixels found in image diff %s", img_path)
                black_pixels_found = True

            page_number = int(os.path.splitext(img_name)[0].replace("page-", "")) - 1
            diff_images[page_number] = page_image

        for entity_id, hue in entity_hues.items():
            for page_number, image in diff_images.items():
                boxes = extract_bounding_boxes(image, page_number, hue)
                for box in boxes:
                    entity_locations[entity_id].append(box)

        shifted_entity_ids.update(
            find_shifted_entities(
                arxiv_id, modified_images_dir, relative_file_path, entity_hues
            )
        )

    return LocationResult(
        locations=entity_locations,
        shifted_entities=list(shifted_entity_ids),
        black_pixels_found=black_pixels_found,
    )
def load_equation_token_locations(
    arxiv_id: ArxivId,
) -> Optional[Dict[TokenId, List[BoundingBox]]]:

    token_locations: Dict[TokenId, List[BoundingBox]] = {}
    token_locations_path = os.path.join(
        directories.arxiv_subdir("equation-tokens-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(token_locations_path):
        logging.warning(
            "Could not find bounding boxes information for %s. Skipping",
            arxiv_id,
        )
        return None

    for record in load_from_csv(token_locations_path, HueLocationInfo):
        equation_index, token_index = [int(t) for t in record.entity_id.split("-")]
        token_id = TokenId(record.tex_path, equation_index, token_index)
        box = BoundingBox(
            page=int(record.page),
            left=record.left,
            top=record.top,
            width=record.width,
            height=record.height,
        )
        if token_id not in token_locations:
            token_locations[token_id] = []
        token_locations[token_id].append(box)

    return token_locations
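As the split above implies, each equation-token entity ID is assumed to be the equation index and token index joined by a hyphen. A tiny illustration of that parsing with a made-up ID:

# Illustration only: a made-up entity ID in the "<equation_index>-<token_index>"
# form that load_equation_token_locations expects.
entity_id = "3-12"
equation_index, token_index = [int(t) for t in entity_id.split("-")]
assert (equation_index, token_index) == (3, 12)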
def save(self, item: DetectionTask, result: SerializableEntity) -> None:
    results_dir = directories.arxiv_subdir(self.get_output_base_dirkey(), item.arxiv_id)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    entities_path = os.path.join(results_dir, "entities.csv")
    file_utils.append_to_csv(entities_path, result)
def load_locations(
    arxiv_id: ArxivId, entity_name: str
) -> Optional[Dict[EntityId, List[BoundingBox]]]:
    """
    Load bounding boxes for each entity. Entities can have multiple bounding boxes
    (as will be the case if they are split over multiple lines).
    """
    boxes_by_entity_id: Dict[EntityId, List[BoundingBox]] = defaultdict(list)
    bounding_boxes_path = os.path.join(
        directories.arxiv_subdir(f"{entity_name}-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(bounding_boxes_path):
        logging.warning(
            "Could not find bounding boxes information for entity of type %s for paper %s. Skipping.",
            entity_name,
            arxiv_id,
        )
        return None

    for hue_info in load_from_csv(bounding_boxes_path, EntityLocationInfo):
        box = BoundingBox(
            page=hue_info.page,
            left=hue_info.left,
            top=hue_info.top,
            width=hue_info.width,
            height=hue_info.height,
        )
        boxes_by_entity_id[hue_info.entity_id].append(box)

    return boxes_by_entity_id
def save(self, item: TexAndSymbols, result: AnnotationResult) -> None:
    output_sources_path = directories.arxiv_subdir(
        "sources-with-annotated-symbols", item.arxiv_id
    )
    logging.debug("Outputting to %s", output_sources_path)

    # Create new directory for each colorization iteration.
    unpack_path = unpack(item.arxiv_id, output_sources_path)
    sources_unpacked = unpack_path is not None
    if unpack_path is None:
        logging.warning("Could not unpack sources into %s", output_sources_path)

    if sources_unpacked:
        for annotated_file in result:
            full_tex_path = os.path.join(output_sources_path, annotated_file.tex_path)
            with open(
                full_tex_path, "w", encoding=annotated_file.encoding
            ) as tex_file:
                tex_file.write(annotated_file.contents)

        symbols_tex_path = os.path.join(output_sources_path, "symbol_tex.csv")
        with open(symbols_tex_path, "a", encoding="utf-8") as symbols_tex_file:
            writer = csv.writer(symbols_tex_file, quoting=csv.QUOTE_ALL)
            for annotated_file in result:
                for symbol_tex in annotated_file.symbol_tex:
                    try:
                        writer.writerow([annotated_file.tex_path, symbol_tex])
                    except Exception:  # pylint: disable=broad-except
                        logging.warning(
                            "Couldn't write row for annotated line for arXiv %s: "
                            "can't be converted to utf-8",
                            item.arxiv_id,
                        )
def load(self) -> Iterator[TexAndSymbols]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-annotated-symbols", arxiv_id
        )
        file_utils.clean_directory(output_root)

        symbols_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
        tokens_path = os.path.join(symbols_dir, "entities.csv")
        if not os.path.exists(tokens_path):
            logging.info("No equation token data found for paper %s. Skipping.", arxiv_id)
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

        tokens = file_utils.load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = set({t.tex_path for t in tokens})

        characters: Dict[CharacterId, Character] = {}
        for token in tokens:
            character_id = CharacterId(
                token.tex_path, token.equation_index, token.token_index
            )
            characters[character_id] = Character(
                token.text, token.token_index, token.start, token.end
            )

        # Load original sources for TeX files that need to be colorized.
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(
                directories.arxiv_subdir("sources", arxiv_id), tex_path
            )
            file_contents = file_utils.read_file_tolerant(absolute_tex_path)
            if file_contents is not None:
                contents_by_file[tex_path] = file_contents

        yield TexAndSymbols(arxiv_id, contents_by_file, symbols, characters)
def save(self, item: MatchTask, result: BibitemMatch) -> None:
    resolutions_dir = directories.arxiv_subdir("bibitem-resolutions", item.arxiv_id)
    if not os.path.exists(resolutions_dir):
        os.makedirs(resolutions_dir)
    resolutions_path = os.path.join(resolutions_dir, "resolutions.csv")
    file_utils.append_to_csv(resolutions_path, result)
def load(self) -> Iterator[PageRasterPair]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
        file_utils.clean_directory(output_dir)

        # Get output file names from results of compiling the uncolorized TeX sources.
        output_files = get_output_files(
            directories.arxiv_subdir("compiled-sources", arxiv_id)
        )
        if len(output_files) == 0:
            continue

        for iteration in directories.iteration_names(
            self.get_raster_base_dirkey(), arxiv_id
        ):
            original_images_dir = directories.arxiv_subdir("paper-images", arxiv_id)
            modified_images_dir = directories.iteration(
                self.get_raster_base_dirkey(), arxiv_id, iteration
            )

            for output_file in output_files:
                relative_file_path = output_file.path
                original_images_path = os.path.join(
                    original_images_dir, relative_file_path
                )
                for img_name in os.listdir(original_images_path):
                    original_img_path = os.path.join(original_images_path, img_name)
                    modified_img_path = os.path.join(
                        modified_images_dir, relative_file_path, img_name
                    )
                    if not os.path.exists(modified_img_path):
                        logging.warning(
                            "Could not find expected image %s. Skipping diff for this paper.",
                            modified_img_path,
                        )
                        break

                    original_img = cv2.imread(original_img_path)
                    modified_img = cv2.imread(modified_img_path)

                    yield PageRasterPair(
                        arxiv_id,
                        iteration,
                        relative_file_path,
                        img_name,
                        original_img,
                        modified_img,
                    )