def load(self) -> Iterator[TexAndTokens]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-colorized-equation-tokens", arxiv_id
        )
        file_utils.clean_directory(output_root)

        tokens_path = os.path.join(
            directories.arxiv_subdir("detected-equation-tokens", arxiv_id),
            "entities.csv",
        )
        if not os.path.exists(tokens_path):
            logging.info(
                "No equation token data found for paper %s. Skipping.", arxiv_id
            )
            continue

        # Load token location information
        tokens = file_utils.load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = {token.tex_path for token in tokens}

        # Load original sources for TeX files that need to be colorized
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(
                directories.arxiv_subdir("sources", arxiv_id), tex_path
            )
            file_contents = file_utils.read_file_tolerant(absolute_tex_path)
            if file_contents is not None:
                contents_by_file[tex_path] = file_contents

        yield TexAndTokens(arxiv_id, contents_by_file, tokens)
def load(self) -> Iterator[ColorizationTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_root)

        entities_path = os.path.join(
            directories.arxiv_subdir(self.get_detected_entities_dirkey(), arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
        )

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            entities_for_tex_path = [e for e in entities if e.tex_path == tex_path]
            if file_contents is not None:
                yield ColorizationTask(
                    arxiv_id, tex_path, file_contents, entities_for_tex_path
                )
def load(self) -> Iterator[ArxivId]:
    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
        )
        file_utils.clean_directory(
            directories.arxiv_subdir("detected-symbols", arxiv_id)
        )
        yield arxiv_id
def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("composite-symbols-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_locations = file_utils.load_equation_token_locations(arxiv_id)
        if token_locations is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        for symbol_with_id in symbols_with_ids:
            # Symbols with affixes (e.g., arrows, hats) cannot be localized by taking the union
            # of their tokens' bounding boxes, because the bounding boxes of affix tokens
            # cannot be detected on their own.
            if not symbol_with_id.symbol.contains_affix:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    token_locations=token_locations,
                    symbol_with_id=symbol_with_id,
                )
def load(self) -> Iterator[MatchTask]:
    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir("bibitem-resolutions", arxiv_id)
        )
        bibitems_dir = directories.arxiv_subdir("detected-citations", arxiv_id)
        metadata_dir = directories.arxiv_subdir("s2-metadata", arxiv_id)

        references_path = os.path.join(metadata_dir, "references.csv")
        if not os.path.exists(references_path):
            logging.warning(
                "Could not find %s, skipping reference resolution for paper %s",
                references_path,
                arxiv_id,
            )
            continue
        references = list(
            file_utils.load_from_csv(references_path, SerializableReference)
        )

        bibitems_path = os.path.join(bibitems_dir, "entities.csv")
        if not os.path.exists(bibitems_path):
            logging.warning(
                "Could not find %s, skipping reference resolution for paper %s",
                bibitems_path,
                arxiv_id,
            )
            continue
        bibitems = list(file_utils.load_from_csv(bibitems_path, Bibitem))

        yield MatchTask(arxiv_id, bibitems, references)
def load(self) -> Iterator[ColorizationTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-colorized-citations", arxiv_id
        )
        file_utils.clean_directory(output_root)

        bibitems_path = os.path.join(
            directories.arxiv_subdir("bibitems", arxiv_id), "bibitems.csv"
        )
        if not os.path.exists(bibitems_path):
            logging.warning("No bibitems were found for paper %s. Skipping", arxiv_id)
            continue

        bibitems = file_utils.load_from_csv(bibitems_path, Bibitem)
        bibitem_keys = [b.key for b in bibitems if b.key is not None]

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            if file_contents is not None:
                yield ColorizationTask(arxiv_id, tex_path, file_contents, bibitem_keys)
def load(self) -> Iterator[DetectDefinitionsTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("detected-definitions", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load cleaned sentences for definition detection.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("sentence-tokens", arxiv_id),
            "sentences.csv",
        )
        try:
            sentences = list(
                file_utils.load_from_csv(detected_sentences_path, EmbellishedSentence)
            )
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        # Read in all TeX. Once definition detection is finished, all the TeX will be searched
        # for references to the defined terms.
        tex_by_file = file_utils.read_tex(arxiv_id)

        yield DetectDefinitionsTask(arxiv_id, sentences, tex_by_file)
def load(self) -> Iterator[SymbolSentencesTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentences-for-symbols", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_sentences_path = os.path.join(
            directories.arxiv_subdir("sentences-for-equation-tokens", arxiv_id),
            "entity_sentences.csv",
        )
        if not os.path.exists(token_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not find links between sentences and equation tokens at "
                + "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                token_sentences_path,
                arxiv_id,
            )
            continue

        token_sentence_pairs = list(
            file_utils.load_from_csv(token_sentences_path, EntitySentencePairIds)
        )

        symbols = file_utils.load_symbols(arxiv_id)
        if not symbols:
            continue

        # Filter to only those symbols for which tokens have been detected
        symbols = [s for s in symbols if len(s.symbol.characters) > 0]

        yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)
def load(self) -> Iterator[ExtractionTask]:
    for arxiv_id in self.arxiv_ids:
        sources_dir = directories.arxiv_subdir("sources", arxiv_id)
        file_utils.clean_directory(
            directories.arxiv_subdir("detected-citations", arxiv_id)
        )
        for path in file_utils.find_files(sources_dir, [".tex", ".bbl"]):
            file_contents = file_utils.read_file_tolerant(path)
            if file_contents is None:
                continue
            yield ExtractionTask(arxiv_id, file_contents)
def unpack(arxiv_id: str, unpack_path: str) -> Optional[str]:
    archive_path = directories.arxiv_subdir("sources-archives", arxiv_id)
    if not os.path.exists(archive_path):
        logging.warning("No source archive found for %s", arxiv_id)
        return None
    if os.path.exists(unpack_path):
        logging.warning(
            "Directory already found at %s. Deleting contents.", unpack_path
        )
        clean_directory(unpack_path)
    _unpack(archive_path, unpack_path)
    return unpack_path
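# Illustrative sketch only, assuming the source archive is a gzipped tarball: this is
# roughly what a helper like `_unpack` (defined elsewhere in the repo) might do. The
# name `_unpack_sketch` is hypothetical, and the real implementation may also handle
# other formats that arXiv serves (e.g., a single gzipped TeX file).
import tarfile

def _unpack_sketch(archive_path: str, dest_dir: str) -> None:
    # Extract every member of the gzipped tar archive into the destination directory.
    with tarfile.open(archive_path, mode="r:gz") as archive:
        archive.extractall(dest_dir)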
def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            f"contexts-for-{self.get_entity_name()}", arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Load in all extracted entities. See note in 'colorize_tex.py' for why entities
        # might be saved in multiple files. If they are, for this function to work,
        # each of the entities needs to have a unique pair of 'ID' and 'tex_path'.
        entities_dir = directories.arxiv_subdir(
            f"detected-{self.get_entity_name()}", arxiv_id
        )
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_entity_type())
            )

        # Load sentences from file.
        sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id), "entities.csv"
        )
        try:
            sentences = list(file_utils.load_from_csv(sentences_path, Sentence))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there was likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        tex_paths = {e.tex_path for e in entities}
        for tex_path in tex_paths:
            entities_for_file = [e for e in entities if e.tex_path == tex_path]
            sentences_for_file = [s for s in sentences if s.tex_path == tex_path]
            yield Task(arxiv_id, tex_path, entities_for_file, sentences_for_file)
def load(self) -> Iterator[MathMLForPaper]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("symbol-matches", arxiv_id)
        file_utils.clean_directory(output_dir)

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        symbols_mathml = {swi.symbol.mathml for swi in symbols_with_ids}
        yield MathMLForPaper(arxiv_id=arxiv_id, mathml_equations=symbols_mathml)
def load(self) -> Iterator[DetectionTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_root)

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            if file_contents is not None:
                yield DetectionTask(arxiv_id, tex_path, file_contents)
def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        for output_base_dir in self.output_base_dirs.values():
            file_utils.clean_directory(
                directories.arxiv_subdir(output_base_dir, arxiv_id)
            )

        # A directory of entities may contain files for each of multiple types of entities.
        # One example is that the definition detector detects both terms and definitions.
        # In that case, the colorizer colorizes all entities from all of these files.
        # Earlier entity extractor commands should include enough information in the entity IDs
        # so that the type of entities can be inferred from the entity ID in later commands.
        entities_dir = directories.arxiv_subdir(self.get_input_dirkey(), arxiv_id)
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
            )

        main_tex_files = get_compiled_tex_files(
            directories.arxiv_subdir("compiled-normalized-sources", arxiv_id)
        )
        normalized_sources_path = directories.arxiv_subdir("normalized-sources", arxiv_id)
        for tex_file in main_tex_files:
            file_contents = file_utils.read_file_tolerant(
                os.path.join(normalized_sources_path, tex_file.path)
            )
            options = self.get_colorize_options()
            entities_for_tex_path = [
                e for e in entities if e.tex_path == tex_file.path or e.tex_path == "N/A"
            ]
            if options.when is not None:
                entities_for_tex_path = list(
                    filter(options.when, entities_for_tex_path)
                )
            if file_contents is not None:
                group_func = options.group or (lambda entities: [entities])
                for group_index, entity_group in enumerate(
                    group_func(entities_for_tex_path)
                ):
                    yield LocationTask(
                        arxiv_id,
                        tex_file.path,
                        file_contents,
                        entity_group,
                        group_index,
                    )
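# Illustrative sketch, not part of the pipeline: the 'group' colorization option used
# above can be any callable that splits the entities for one TeX file into the batches
# that are colorized together, one batch per group_index. A hypothetical grouping
# function that caps each batch at a fixed size might look like this (the function
# name and batch size are assumptions, not the project's own defaults).
from typing import List, Sequence, TypeVar

T = TypeVar("T")

def group_into_fixed_size_batches(
    entities: Sequence[T], batch_size: int = 30
) -> List[List[T]]:
    # Split the entities into consecutive batches of at most 'batch_size' items each.
    return [
        list(entities[i : i + batch_size]) for i in range(0, len(entities), batch_size)
    ]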
def load(self) -> Iterator[RasterTask]:
    for arxiv_id in self.arxiv_ids:
        # Clean all past output for this arXiv ID.
        output_dir_for_arxiv_id = directories.arxiv_subdir("paper-images", arxiv_id)
        file_utils.clean_directory(output_dir_for_arxiv_id)

        paper_abs_path = directories.arxiv_subdir(
            "compiled-normalized-sources", arxiv_id
        )
        output_files = get_output_files(paper_abs_path)
        for output_file in output_files:
            yield RasterTask(
                arxiv_id,
                output_file.output_type,
                output_file.path,
            )
def load(self) -> Iterator[TexAndSymbols]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-annotated-symbols", arxiv_id
        )
        file_utils.clean_directory(output_root)

        symbols_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
        tokens_path = os.path.join(symbols_dir, "entities.csv")
        if not os.path.exists(tokens_path):
            logging.info(
                "No equation token data found for paper %s. Skipping.", arxiv_id
            )
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

        tokens = file_utils.load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = {t.tex_path for t in tokens}

        characters: Dict[CharacterId, Character] = {}
        for token in tokens:
            character_id = CharacterId(
                token.tex_path, token.equation_index, token.token_index
            )
            characters[character_id] = Character(
                token.text, token.token_index, token.start, token.end
            )

        # Load original sources for TeX files that need to be colorized
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(
                directories.arxiv_subdir("sources", arxiv_id), tex_path
            )
            file_contents = file_utils.read_file_tolerant(absolute_tex_path)
            if file_contents is not None:
                contents_by_file[tex_path] = file_contents

        yield TexAndSymbols(arxiv_id, contents_by_file, symbols, characters)
def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("citation-cluster-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        boxes_by_hue_iteration = file_utils.load_citation_hue_locations(arxiv_id)
        if boxes_by_hue_iteration is None:
            continue

        boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
        for iteration in directories.iteration_names(
            "sources-with-colorized-citations", arxiv_id
        ):
            citation_hues_path = os.path.join(
                directories.iteration(
                    "sources-with-colorized-citations",
                    arxiv_id,
                    iteration,
                ),
                "entity_hues.csv",
            )
            if not os.path.exists(citation_hues_path):
                logging.warning(
                    "Could not find citation hue colors for %s iteration %s. Skipping",
                    arxiv_id,
                    iteration,
                )
                continue

            for record in file_utils.load_from_csv(citation_hues_path, ColorizationRecord):
                key = record.entity_id
                if key not in boxes_by_citation_key:
                    boxes_by_citation_key[key] = []
                hue_iteration = HueIteration(record.hue, iteration)
                boxes_by_citation_key[key].extend(
                    boxes_by_hue_iteration.get(hue_iteration, [])
                )

        for key, boxes in boxes_by_citation_key.items():
            yield LocationTask(
                arxiv_id=arxiv_id,
                citation_key=key,
                boxes=boxes,
            )
def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            f"contexts-for-{self.get_entity_name()}", arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Load entities from file.
        entities_path = os.path.join(
            directories.arxiv_subdir(f"detected-{self.get_entity_name()}", arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_entity_type())
        )

        # Load sentences from file.
        sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id), "entities.csv"
        )
        try:
            sentences = list(file_utils.load_from_csv(sentences_path, Sentence))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there was likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        tex_paths = {e.tex_path for e in entities}
        for tex_path in tex_paths:
            entities_for_file = [e for e in entities if e.tex_path == tex_path]
            sentences_for_file = [s for s in sentences if s.tex_path == tex_path]
            yield Task(arxiv_id, tex_path, entities_for_file, sentences_for_file)
def load(self) -> Iterator[PageRasterPair]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Get output file names from results of compiling the uncolorized TeX sources.
        output_files = get_output_files(
            directories.arxiv_subdir("compiled-sources", arxiv_id)
        )
        if len(output_files) == 0:
            continue

        for iteration in directories.iteration_names(
            self.get_raster_base_dirkey(), arxiv_id
        ):
            original_images_dir = directories.arxiv_subdir("paper-images", arxiv_id)
            modified_images_dir = directories.iteration(
                self.get_raster_base_dirkey(), arxiv_id, iteration
            )

            for output_file in output_files:
                relative_file_path = output_file.path
                original_images_path = os.path.join(
                    original_images_dir, relative_file_path
                )
                for img_name in os.listdir(original_images_path):
                    original_img_path = os.path.join(original_images_path, img_name)
                    modified_img_path = os.path.join(
                        modified_images_dir, relative_file_path, img_name
                    )
                    if not os.path.exists(modified_img_path):
                        logging.warning(
                            "Could not find expected image %s. Skipping diff for this paper.",
                            modified_img_path,
                        )
                        break

                    original_img = cv2.imread(original_img_path)
                    modified_img = cv2.imread(modified_img_path)

                    yield PageRasterPair(
                        arxiv_id,
                        iteration,
                        relative_file_path,
                        img_name,
                        original_img,
                        modified_img,
                    )
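# Illustrative only (the pipeline's own diffing code lives elsewhere): given one of the
# PageRasterPair objects yielded above, a downstream step might check whether the
# colorized raster differs at all from the original with a per-pixel comparison like
# this. The helper name is hypothetical.
import cv2
import numpy as np

def rasters_differ(original_img: np.ndarray, modified_img: np.ndarray) -> bool:
    # cv2.absdiff computes the per-pixel absolute difference of the two page images;
    # any nonzero pixel means colorization changed something on the page.
    return bool(cv2.absdiff(original_img, modified_img).any())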
def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentence-tokens", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load symbols, for use in embellishing equations.
        symbols: Dict[str, List[Symbol]] = defaultdict(list)
        symbol_data = file_utils.load_symbols(arxiv_id)
        if symbol_data is not None:
            for id_, symbol in symbol_data:
                symbols[id_.tex_path].append(symbol)
        else:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No symbol data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load sentences.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id),
            "entities.csv",
        )
        if not os.path.exists(detected_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        sentences = file_utils.load_from_csv(detected_sentences_path, Sentence)

        for sentence in sentences:
            yield Task(arxiv_id, sentence, symbols[sentence.tex_path])
def _cleanup_from_last_batch() -> None:
    """Clean up output directories from the last batch."""
    if batch_index > -1 and not self.args.keep_intermediate_files:
        logging.debug(  # pylint: disable=logging-not-lazy
            "Deleting intermediate files used to locate entities (i.e., colorized "
            + "sources, compilation results, and rasters) for paper %s iteration %s",
            item.arxiv_id,
            iteration_id or "''",
        )
        intermediate_files_dirs = [
            colorized_tex_dir,
            compiled_tex_dir,
            raster_output_dir,
            diffs_output_dir,
        ]
        for dir_ in intermediate_files_dirs:
            if dir_ and os.path.exists(dir_):
                file_utils.clean_directory(dir_)
                os.rmdir(dir_)
def load(self) -> Iterator[RasterTask]:
    for arxiv_id in self.arxiv_ids:
        # Clean all past output for this arXiv ID.
        output_dir_for_arxiv_id = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_dir_for_arxiv_id)

        for paper_dir in self.get_paper_dirs(arxiv_id):
            paper_abs_path = os.path.join(
                directories.dirpath(self.get_papers_base_dirkey()), paper_dir
            )
            output_files = get_output_files(paper_abs_path)
            for output_file in output_files:
                yield RasterTask(
                    paper_dir,
                    output_file.output_type,
                    output_file.path,
                    os.path.join(paper_abs_path, output_file.path),
                )
def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("symbol-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_locations = file_utils.load_equation_token_locations(arxiv_id)
        if token_locations is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        for symbol_with_id in symbols_with_ids:
            yield LocationTask(
                arxiv_id=arxiv_id,
                token_locations=token_locations,
                symbol_with_id=symbol_with_id,
            )
def load(self) -> Iterator[Locations]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("symbols-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        all_locations: List[EntityLocationInfo] = []

        composite_symbols_path = os.path.join(
            directories.arxiv_subdir("composite-symbols-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if os.path.exists(composite_symbols_path):
            all_locations.extend(
                file_utils.load_from_csv(composite_symbols_path, EntityLocationInfo)
            )
        else:
            logging.info(
                "No locations could be found for composite symbols for paper %s.",
                arxiv_id,
            )

        symbols_with_affixes_path = os.path.join(
            directories.arxiv_subdir("symbols-with-affixes-locations", arxiv_id),
            "entity_locations.csv",
        )
        if os.path.exists(symbols_with_affixes_path):
            all_locations.extend(
                file_utils.load_from_csv(symbols_with_affixes_path, EntityLocationInfo)
            )
        else:
            logging.info(
                "No locations could be found for symbols with affixes for paper %s.",
                arxiv_id,
            )

        yield Locations(arxiv_id, all_locations)
def load(self) -> Iterator[PaperTokens]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("annotation-files", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load tokens.
        tokens_path = os.path.join(
            directories.arxiv_subdir("sentence-tokens", arxiv_id),
            "tokens.csv",
        )
        try:
            tokens = list(file_utils.load_from_csv(tokens_path, Token))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No tokens data found for arXiv paper %s. No annotation files will be "
                + "generated for this paper.",
                arxiv_id,
            )
            continue

        yield PaperTokens(arxiv_id, tokens)
def load(self) -> Iterator[CompilationTask]:
    sources_base_dir = directories.dirpath(self.get_sources_base_dirkey())
    output_base_dir = directories.dirpath(self.get_output_base_dirkey())
    for arxiv_id in self.arxiv_ids:
        # Clean all past output for this arXiv ID.
        output_dir_for_arxiv_id = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_dir_for_arxiv_id)

        for source_dir in self.get_source_dirs(arxiv_id):
            qualified_source_dir = os.path.join(sources_base_dir, source_dir)
            output_dir = os.path.join(output_base_dir, source_dir)
            if os.path.exists(output_dir):
                logging.warning(
                    "Compilation directory already exists in %s. Deleting.",
                    output_dir,
                )
                shutil.rmtree(output_dir)
            shutil.copytree(qualified_source_dir, output_dir)
            yield CompilationTask(arxiv_id, output_dir)
def load(self) -> Iterator[DetectionTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_root)

        for main_tex_file in get_compiled_tex_files(
            directories.arxiv_subdir("compiled-normalized-sources", arxiv_id)
        ):
            # While the directory of compiled sources is inspected to find out which TeX
            # files were compiled, entities should be detected in the un-compiled sources,
            # because AutoTeX sometimes modifies the TeX files during compilation, meaning
            # that character offsets found in compiled TeX files won't match the character
            # offsets of the same text in the original TeX files downloaded from arXiv.
            file_contents = file_utils.read_file_tolerant(
                os.path.join(
                    directories.arxiv_subdir("normalized-sources", arxiv_id),
                    main_tex_file.path,
                )
            )
            if file_contents is not None:
                yield DetectionTask(arxiv_id, main_tex_file.path, file_contents)
def load(self) -> Iterator[LocationTask]:
    entity_name = self.get_entity_name()
    for arxiv_id in self.arxiv_ids:
        for output_base_dir in self.output_base_dirs.values():
            file_utils.clean_directory(
                directories.arxiv_subdir(output_base_dir, arxiv_id)
            )

        # A directory of entities may contain files for each of multiple types of entities.
        # One example is that the definition detector detects both terms and definitions.
        # In that case, the colorizer colorizes all entities from all of these files.
        # Earlier entity extractor commands should include enough information in the entity IDs
        # so that the type of entities can be inferred from the entity ID in later commands.
        entities_dir = directories.arxiv_subdir(f"detected-{entity_name}", arxiv_id)
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
            )

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            entities_for_tex_path = [
                e for e in entities if e.tex_path == tex_path or e.tex_path == "N/A"
            ]
            if file_contents is not None:
                yield LocationTask(
                    arxiv_id, tex_path, file_contents, entities_for_tex_path
                )
def load(self) -> Iterator[SearchTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            self.get_output_base_dirkey(), arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Get output file names from results of compiling the uncolorized TeX sources.
        output_files = get_output_files(
            directories.arxiv_subdir("compiled-sources", arxiv_id)
        )

        for iteration in directories.iteration_names(
            self.get_diff_images_base_dirkey(), arxiv_id
        ):
            diff_images_dir = directories.iteration(
                self.get_diff_images_base_dirkey(), arxiv_id, iteration
            )
            hue_searches = self.load_hues(arxiv_id, iteration)

            hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
            for search in hue_searches:
                output_paths = [f.path for f in output_files]
                files_to_search = (
                    [search.relative_file_path]
                    if search.relative_file_path is not None
                    else output_paths
                )
                for path in files_to_search:
                    if path not in hue_searches_by_file:
                        hue_searches_by_file[path] = []
                    hue_searches_by_file[path].append(search)

            for relative_file_path, search_regions in hue_searches_by_file.items():
                diff_images_file_path = os.path.join(diff_images_dir, relative_file_path)
                page_images = {}
                colorization_error_detected = False

                for img_name in os.listdir(diff_images_file_path):
                    img_path = os.path.join(diff_images_file_path, img_name)
                    page_image = cv2.imread(img_path)

                    if not self.args.skip_visual_validation:
                        if contains_black_pixels(page_image):
                            logging.warning(
                                "Black pixels found in image diff %s", img_path
                            )
                            colorization_error_detected = True

                    page_number = (
                        int(os.path.splitext(img_name)[0].replace("page-", "")) - 1
                    )
                    page_images[page_number] = page_image

                if colorization_error_detected:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Colorization error detected. Skipping hue location for "
                        + "iteration %s for arXiv paper %s",
                        iteration,
                        arxiv_id,
                    )
                    break

                for search_region in search_regions:
                    yield SearchTask(
                        arxiv_id,
                        iteration,
                        page_images,
                        relative_file_path,
                        search_region,
                    )
def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("embellished-sentences", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load equation data.
        equations: Equations = {}
        equations_path = os.path.join(
            directories.arxiv_subdir("detected-equations", arxiv_id), "entities.csv"
        )
        try:
            equation_data = file_utils.load_from_csv(equations_path, Equation)
            for equation in equation_data:
                equations[(equation.tex_path, int(equation.id_))] = equation
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No equation data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load symbols, for use in embellishing equations.
        symbols: Symbols = defaultdict(list)
        symbol_data = file_utils.load_symbols(arxiv_id)
        if symbol_data is not None:
            for id_, symbol in symbol_data:
                symbols[(id_.tex_path, id_.equation_index)].append(symbol)
        else:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No symbol data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load sentences.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id),
            "entities.csv",
        )
        try:
            sentences = file_utils.load_from_csv(detected_sentences_path, Sentence)
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        for sentence in sentences:
            yield Task(arxiv_id, sentence, equations, symbols)