def load(self) -> Iterator[MatchTask]:
    for arxiv_id in self.arxiv_ids:
        file_utils.clean_directory(
            directories.arxiv_subdir("bibitem-resolutions", arxiv_id)
        )
        bibitems_dir = directories.arxiv_subdir("detected-citations", arxiv_id)
        metadata_dir = directories.arxiv_subdir("s2-metadata", arxiv_id)

        references_path = os.path.join(metadata_dir, "references.csv")
        if not os.path.exists(references_path):
            logging.warning(
                "Could not find %s, skipping reference resolution for paper %s",
                references_path,
                arxiv_id,
            )
            continue
        references = list(
            file_utils.load_from_csv(references_path, SerializableReference)
        )

        bibitems_path = os.path.join(bibitems_dir, "entities.csv")
        if not os.path.exists(bibitems_path):
            logging.warning(
                "Could not find %s, skipping reference resolution for paper %s",
                bibitems_path,
                arxiv_id,
            )
            continue
        bibitems = list(file_utils.load_from_csv(bibitems_path, Bibitem))

        yield MatchTask(arxiv_id, bibitems, references)

def load(self) -> Iterator[CitationData]:
    for arxiv_id in self.arxiv_ids:
        # Load citation locations.
        citation_locations = load_located_citations(arxiv_id)
        if citation_locations is None:
            continue

        # Load metadata for bibitems.
        key_s2_ids: Dict[CitationKey, S2Id] = {}
        key_resolutions_path = os.path.join(
            directories.arxiv_subdir("bibitem-resolutions", arxiv_id),
            "resolutions.csv",
        )
        if not os.path.exists(key_resolutions_path):
            logging.warning(
                "Could not find citation resolutions for %s. Skipping", arxiv_id
            )
            continue
        for resolution in file_utils.load_from_csv(key_resolutions_path, BibitemMatch):
            if resolution.key is not None:
                key_s2_ids[resolution.key] = resolution.s2_id

        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        s2_data: Dict[S2Id, SerializableReference] = {}
        s2_metadata_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "references.csv"
        )
        if not os.path.exists(s2_metadata_path):
            logging.warning(
                "Could not find S2 metadata file for citations for %s. Skipping",
                arxiv_id,
            )
            continue
        for metadata in file_utils.load_from_csv(s2_metadata_path, SerializableReference):
            # Convert authors field to comma-delimited list of authors.
            author_string = ",".join(
                [a["name"] for a in ast.literal_eval(metadata.authors)]
            )
            metadata = dataclasses.replace(metadata, authors=author_string)
            s2_data[metadata.s2_id] = metadata

        yield CitationData(
            arxiv_id,
            s2_id,
            citation_locations,
            key_s2_ids,
            s2_data,
        )

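# A minimal sketch of the authors conversion in the loader above, assuming the
# 'authors' column of references.csv stores a Python-literal list of dicts with a
# 'name' key (which is what the ast.literal_eval call implies). The value below is a
# hypothetical example of that format, used only to illustrate the conversion.
import ast

serialized_authors = "[{'name': 'Ada Lovelace'}, {'name': 'Alan Turing'}]"
author_string = ",".join(a["name"] for a in ast.literal_eval(serialized_authors))
assert author_string == "Ada Lovelace,Alan Turing"
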
def load(self) -> Iterator[PaperProcessingResult]:
    for arxiv_id in self.arxiv_ids:
        # Load the S2 ID for this paper.
        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        # Load all extracted entities. See the note in 'colorize_tex.py' for why entities
        # might be saved in multiple files. If they are, then for this upload function to
        # work, each entity needs to have a unique pair of 'ID' and 'tex_path'.
        entities_dir = directories.arxiv_subdir(
            self.get_detected_entities_dirkey(), arxiv_id
        )
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(
                    entities_path,
                    self.get_detected_entity_type(os.path.basename(entities_path)),
                )
            )

        # Load the locations of all detected hues.
        hue_locations_path = os.path.join(
            directories.arxiv_subdir(self.get_hue_locations_dirkey(), arxiv_id),
            "entity_locations.csv",
        )
        hue_location_infos = list(
            file_utils.load_from_csv(hue_locations_path, HueLocationInfo)
        )

        # Group each entity with its locations. Pass the entity information, and the
        # detected locations for the entity, to the upload function.
        localized_entities = []
        for entity in entities:
            matching_locations = []
            for h in hue_location_infos:
                if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                    matching_locations.append(h)
            localized_entities.append(EntityAndLocation(entity, matching_locations))

        yield PaperProcessingResult(
            arxiv_id=arxiv_id,
            s2_id=s2_id,
            localized_entities=localized_entities,
        )

def load(self) -> Iterator[PaperProcessingResult]:
    for arxiv_id in self.arxiv_ids:
        # Load the S2 ID for this paper.
        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        # Load all extracted entities.
        entities_path = os.path.join(
            directories.arxiv_subdir(self.get_detected_entities_dirkey(), arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
        )

        # Load the locations of all detected hues.
        hue_locations_path = os.path.join(
            directories.arxiv_subdir(self.get_hue_locations_dirkey(), arxiv_id),
            "hue_locations.csv",
        )
        hue_location_infos = list(
            file_utils.load_from_csv(hue_locations_path, HueLocationInfo)
        )

        # Group each entity with its locations. Pass the entity information, and the
        # detected locations for the entity, to the upload function.
        localized_entities = []
        for entity in entities:
            matching_locations = []
            for h in hue_location_infos:
                if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                    matching_locations.append(h)
            localized_entities.append(EntityAndLocation(entity, matching_locations))

        yield PaperProcessingResult(
            arxiv_id=arxiv_id,
            s2_id=s2_id,
            localized_entities=localized_entities,
        )

def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            f"contexts-for-{self.get_entity_name()}", arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Load all extracted entities from file. See the note in 'colorize_tex.py' for
        # why entities might be saved in multiple files. If they are, each entity needs
        # to have a unique pair of 'ID' and 'tex_path'.
        entities_dir = directories.arxiv_subdir(
            f"detected-{self.get_entity_name()}", arxiv_id
        )
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_entity_type())
            )

        # Load sentences from file.
        sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id), "entities.csv"
        )
        try:
            sentences = list(file_utils.load_from_csv(sentences_path, Sentence))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there was likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        tex_paths = {e.tex_path for e in entities}
        for tex_path in tex_paths:
            entities_for_file = [e for e in entities if e.tex_path == tex_path]
            sentences_for_file = [s for s in sentences if s.tex_path == tex_path]
            yield Task(arxiv_id, tex_path, entities_for_file, sentences_for_file)

def load(self) -> Iterator[SymbolSentencesTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentences-for-symbols", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_sentences_path = os.path.join(
            directories.arxiv_subdir("sentences-for-equation-tokens", arxiv_id),
            "entity_sentences.csv",
        )
        if not os.path.exists(token_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not find links between sentences and equation tokens at "
                + "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                token_sentences_path,
                arxiv_id,
            )
            continue

        token_sentence_pairs = list(
            file_utils.load_from_csv(token_sentences_path, EntitySentencePairIds)
        )

        symbols = file_utils.load_symbols(arxiv_id)
        if not symbols:
            continue

        # Filter to only those symbols for which tokens have been detected.
        symbols = [s for s in symbols if len(s.symbol.characters) > 0]

        yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)

def load(self) -> Iterator[ColorizationTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(self.get_output_base_dirkey(), arxiv_id)
        file_utils.clean_directory(output_root)

        entities_path = os.path.join(
            directories.arxiv_subdir(self.get_detected_entities_dirkey(), arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
        )

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            entities_for_tex_path = [e for e in entities if e.tex_path == tex_path]
            if file_contents is not None:
                yield ColorizationTask(
                    arxiv_id, tex_path, file_contents, entities_for_tex_path
                )

def load(self) -> Iterator[DetectDefinitionsTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("detected-definitions", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load cleaned sentences for definition detection.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("sentence-tokens", arxiv_id),
            "sentences.csv",
        )
        try:
            sentences = list(
                file_utils.load_from_csv(detected_sentences_path, EmbellishedSentence)
            )
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there was likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        # Read in all TeX. Once definition detection is finished, all the TeX will be
        # searched for references to the defined terms.
        tex_by_file = file_utils.read_tex(arxiv_id)

        yield DetectDefinitionsTask(arxiv_id, sentences, tex_by_file)

def load_hues(self, arxiv_id: ArxivId, iteration: str) -> List[HueSearchRegion]:
    hues_path = os.path.join(
        directories.iteration(
            f"sources-with-colorized-{self.get_entity_name()}",
            arxiv_id,
            iteration,
        ),
        "entity_hues.csv",
    )
    if not os.path.exists(hues_path):
        logging.warning("Could not find any hues at %s", hues_path)
        return []

    searches = []
    for record in file_utils.load_from_csv(hues_path, ColorizationRecord):
        searches.append(
            HueSearchRegion(
                hue=record.hue,
                record=record,
                relative_file_path=None,
                masks=None,
            )
        )
    return searches

def load(self) -> Iterator[ColorizationTask]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir(
            "sources-with-colorized-citations", arxiv_id
        )
        file_utils.clean_directory(output_root)

        bibitems_path = os.path.join(
            directories.arxiv_subdir("bibitems", arxiv_id), "bibitems.csv"
        )
        if not os.path.exists(bibitems_path):
            logging.warning("No bibitems were found for paper %s. Skipping", arxiv_id)
            continue

        bibitems = file_utils.load_from_csv(bibitems_path, Bibitem)
        bibitem_keys = [b.key for b in bibitems if b.key is not None]

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            if file_contents is not None:
                yield ColorizationTask(arxiv_id, tex_path, file_contents, bibitem_keys)

def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir(
            f"contexts-for-{self.get_entity_name()}", arxiv_id
        )
        file_utils.clean_directory(output_dir)

        # Load entities from file.
        entities_path = os.path.join(
            directories.arxiv_subdir(f"detected-{self.get_entity_name()}", arxiv_id),
            "entities.csv",
        )
        entities = list(
            file_utils.load_from_csv(entities_path, self.get_entity_type())
        )

        # Load sentences from file.
        sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id), "entities.csv"
        )
        try:
            sentences = list(file_utils.load_from_csv(sentences_path, Sentence))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there was likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        tex_paths = {e.tex_path for e in entities}
        for tex_path in tex_paths:
            entities_for_file = [e for e in entities if e.tex_path == tex_path]
            sentences_for_file = [s for s in sentences if s.tex_path == tex_path]
            yield Task(arxiv_id, tex_path, entities_for_file, sentences_for_file)

def count_entities_extracted(arxiv_id: ArxivId) -> Optional[int]:
    """
    This is not the same as the number of citation commands in the TeX; rather, it is
    the number of bibitems that are colorized to enable detection of citation locations.
    """
    bibitems_path = os.path.join(
        directories.arxiv_subdir("detected-citations", arxiv_id), "entities.csv"
    )
    if not os.path.exists(bibitems_path):
        return None
    return len(list(file_utils.load_from_csv(bibitems_path, Bibitem)))

def load(self) -> Iterator[Locations]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("symbols-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        all_locations: List[EntityLocationInfo] = []

        composite_symbols_path = os.path.join(
            directories.arxiv_subdir("composite-symbols-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if os.path.exists(composite_symbols_path):
            all_locations.extend(
                file_utils.load_from_csv(composite_symbols_path, EntityLocationInfo)
            )
        else:
            logging.info(
                "No locations could be found for composite symbols for paper %s.",
                arxiv_id,
            )

        symbols_with_affixes_path = os.path.join(
            directories.arxiv_subdir("symbols-with-affixes-locations", arxiv_id),
            "entity_locations.csv",
        )
        if os.path.exists(symbols_with_affixes_path):
            all_locations.extend(
                file_utils.load_from_csv(symbols_with_affixes_path, EntityLocationInfo)
            )
        else:
            logging.info(
                "No locations could be found for symbols with affixes for paper %s.",
                arxiv_id,
            )

        yield Locations(arxiv_id, all_locations)

def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        for output_base_dir in self.output_base_dirs.values():
            file_utils.clean_directory(
                directories.arxiv_subdir(output_base_dir, arxiv_id)
            )

        # A directory of entities may contain files for each of multiple types of
        # entities. One example is that the definition detector detects both terms and
        # definitions. In that case, the colorizer colorizes all entities from all of
        # these files. Earlier entity extractor commands should include enough
        # information in the entity IDs so that the type of entities can be inferred
        # from the entity ID in later commands.
        entities_dir = directories.arxiv_subdir(self.get_input_dirkey(), arxiv_id)
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
            )

        main_tex_files = get_compiled_tex_files(
            directories.arxiv_subdir("compiled-normalized-sources", arxiv_id)
        )
        normalized_sources_path = directories.arxiv_subdir(
            "normalized-sources", arxiv_id
        )
        for tex_file in main_tex_files:
            file_contents = file_utils.read_file_tolerant(
                os.path.join(normalized_sources_path, tex_file.path)
            )
            options = self.get_colorize_options()
            entities_for_tex_path = [
                e for e in entities if e.tex_path == tex_file.path or e.tex_path == "N/A"
            ]
            if options.when is not None:
                entities_for_tex_path = list(
                    filter(options.when, entities_for_tex_path)
                )
            if file_contents is not None:
                group_func = options.group or (lambda entities: [entities])
                for group_index, entity_group in enumerate(
                    group_func(entities_for_tex_path)
                ):
                    yield LocationTask(
                        arxiv_id,
                        tex_file.path,
                        file_contents,
                        entity_group,
                        group_index,
                    )

def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("citation-cluster-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        boxes_by_hue_iteration = file_utils.load_citation_hue_locations(arxiv_id)
        if boxes_by_hue_iteration is None:
            continue

        boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
        for iteration in directories.iteration_names(
            "sources-with-colorized-citations", arxiv_id
        ):
            citation_hues_path = os.path.join(
                directories.iteration(
                    "sources-with-colorized-citations",
                    arxiv_id,
                    iteration,
                ),
                "entity_hues.csv",
            )
            if not os.path.exists(citation_hues_path):
                logging.warning(
                    "Could not find citation hue colors for %s iteration %s. Skipping",
                    arxiv_id,
                    iteration,
                )
                continue
            for record in file_utils.load_from_csv(citation_hues_path, ColorizationRecord):
                key = record.entity_id
                if key not in boxes_by_citation_key:
                    boxes_by_citation_key[key] = []
                hue_iteration = HueIteration(record.hue, iteration)
                boxes_by_citation_key[key].extend(
                    boxes_by_hue_iteration.get(hue_iteration, [])
                )

        for key, boxes in boxes_by_citation_key.items():
            yield LocationTask(
                arxiv_id=arxiv_id,
                citation_key=key,
                boxes=boxes,
            )

def get_output_files(compiled_tex_dir: str) -> List[OutputFile]:
    """
    Get a list of output files for a directory of compiled TeX.
    """
    compilation_results_dir = os.path.join(compiled_tex_dir, "compilation_results")
    result_path = os.path.join(compilation_results_dir, "result")
    with open(result_path) as result_file:
        result = result_file.read().strip()
    if result == "True":
        output_files_path = os.path.join(compilation_results_dir, "output_files.csv")
        output_files = list(file_utils.load_from_csv(output_files_path, OutputFile))
        return output_files
    return []

def load_located_citations(arxiv_id: ArxivId) -> Optional[Citations]:
    citation_locations: Citations = {}
    citation_locations_path = os.path.join(
        directories.arxiv_subdir("citation-locations", arxiv_id),
        "citation_locations.csv",
    )
    if not os.path.exists(citation_locations_path):
        logging.warning(
            "Could not find citation locations for %s. Skipping", arxiv_id
        )
        return None

    for location in file_utils.load_from_csv(citation_locations_path, CitationLocation):
        if location.key not in citation_locations:
            citation_locations[location.key] = {}
        if location.cluster_index not in citation_locations[location.key]:
            citation_locations[location.key][location.cluster_index] = set()
        citation_locations[location.key][location.cluster_index].add(location)

    return citation_locations

def get_output_files(compiled_tex_dir: RelativePath) -> List[OutputFile]:
    " Get a list of output files for a directory of compiled TeX. "
    if _did_compilation_succeed(compiled_tex_dir):
        output_files_path = os.path.join(
            _get_compilation_results_dir(compiled_tex_dir), "output_files.csv"
        )
        if not os.path.exists(output_files_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Although compilation succeeded for TeX compilation in directory %s, no "
                + "output files were produced. Something unexpected must have happened during "
                + "compilation of the TeX.",
                compiled_tex_dir,
            )
            return []
        output_files = list(file_utils.load_from_csv(output_files_path, OutputFile))
        return output_files
    return []

def count_detected_entities(
    arxiv_id: ArxivId,
    detected_entities_dirkey: str,
    entities_filename: str = "entities.csv",
) -> Optional[int]:
    num_entities_detected = None
    if directories.registered(detected_entities_dirkey):
        detected_entities_path = os.path.join(
            directories.arxiv_subdir(detected_entities_dirkey, arxiv_id),
            entities_filename,
        )
        if os.path.exists(detected_entities_path):
            num_entities_detected = len(
                list(
                    file_utils.load_from_csv(detected_entities_path, SerializableEntity)
                )
            )
    return num_entities_detected

def count_hues_located(
    arxiv_id: ArxivId,
    hue_locations_dirkey: str,
    hue_locations_filename: str = "hue_locations.csv",
) -> Optional[int]:
    num_hues_located = None
    if directories.registered(hue_locations_dirkey):
        hue_locations_path = os.path.join(
            directories.arxiv_subdir(hue_locations_dirkey, arxiv_id),
            hue_locations_filename,
        )
        if os.path.exists(hue_locations_path):
            num_hues_located = len(
                list(file_utils.load_from_csv(hue_locations_path, HueLocationInfo))
            )
    return num_hues_located

def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentence-tokens", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load symbols, for use in embellishing equations.
        symbols: Dict[str, List[Symbol]] = defaultdict(list)
        symbol_data = file_utils.load_symbols(arxiv_id)
        if symbol_data is not None:
            for id_, symbol in symbol_data:
                symbols[id_.tex_path].append(symbol)
        else:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No symbol data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load sentences.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id),
            "entities.csv",
        )
        if not os.path.exists(detected_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        sentences = file_utils.load_from_csv(detected_sentences_path, Sentence)

        for sentence in sentences:
            yield Task(arxiv_id, sentence, symbols[sentence.tex_path])

def get_compiled_tex_files(compiled_tex_dir: RelativePath) -> List[CompiledTexFile]:
    " Get a list of TeX files that were successfully compiled. "
    if _did_compilation_succeed(compiled_tex_dir):
        compiled_tex_files_path = os.path.join(
            _get_compilation_results_dir(compiled_tex_dir), "compiled_tex_files.csv"
        )
        if not os.path.exists(compiled_tex_files_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Although compilation succeeded for TeX compilation in directory %s, no "
                + "specific TeX files were logged as having been compiled. Something "
                + "unexpected must have happened during compilation of the TeX.",
                compiled_tex_dir,
            )
            return []
        compiled_tex_files = list(
            file_utils.load_from_csv(compiled_tex_files_path, CompiledTexFile)
        )
        return compiled_tex_files
    return []

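# _did_compilation_succeed and _get_compilation_results_dir are used above but are not
# shown in this excerpt. A minimal sketch, assuming they follow the same
# 'compilation_results/result' convention used by the inline variant of
# get_output_files earlier in this section; the actual helpers may differ.
import os


def _get_compilation_results_dir(compiled_tex_dir: str) -> str:
    # Compilation metadata is written alongside the compiled sources.
    return os.path.join(compiled_tex_dir, "compilation_results")


def _did_compilation_succeed(compiled_tex_dir: str) -> bool:
    # The 'result' file contains the string "True" when compilation succeeded.
    result_path = os.path.join(_get_compilation_results_dir(compiled_tex_dir), "result")
    if not os.path.exists(result_path):
        return False
    with open(result_path) as result_file:
        return result_file.read().strip() == "True"
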
def load(self) -> Iterator[PaperTokens]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("annotation-files", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load tokens.
        tokens_path = os.path.join(
            directories.arxiv_subdir("sentence-tokens", arxiv_id),
            "tokens.csv",
        )
        try:
            tokens = list(file_utils.load_from_csv(tokens_path, Token))
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No tokens data found for arXiv paper %s. No annotation files will be "
                + "generated for this paper.",
                arxiv_id,
            )
            continue

        yield PaperTokens(arxiv_id, tokens)

def load_located_citations(arxiv_id: ArxivId) -> Optional[Citations]:
    citation_locations: Citations = {}
    citation_locations_path = os.path.join(
        directories.arxiv_subdir("citations-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(citation_locations_path):
        logging.warning(
            "Could not find citation locations for %s. Skipping", arxiv_id
        )
        return None

    for location in file_utils.load_from_csv(
        citation_locations_path, EntityLocationInfo
    ):
        # Citation entity IDs are expected to have the form
        # "<citation key>-<cluster index>"; split off the trailing cluster index.
        id_tokens = location.entity_id.rsplit("-", maxsplit=1)
        key = id_tokens[0]
        cluster_index = int(id_tokens[1])
        if key not in citation_locations:
            citation_locations[key] = {}
        if cluster_index not in citation_locations[key]:
            citation_locations[key][cluster_index] = set()
        citation_locations[key][cluster_index].add(location)

    return citation_locations

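# For illustration (the ID below is hypothetical): the rsplit in load_located_citations
# splits only on the last hyphen, so citation keys that themselves contain hyphens are
# preserved intact.
entity_id = "smith-jones2020-3"
key, cluster_index = entity_id.rsplit("-", maxsplit=1)
assert (key, int(cluster_index)) == ("smith-jones2020", 3)
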
def load(self) -> Iterator[LocationTask]:
    entity_name = self.get_entity_name()
    for arxiv_id in self.arxiv_ids:
        for output_base_dir in self.output_base_dirs.values():
            file_utils.clean_directory(
                directories.arxiv_subdir(output_base_dir, arxiv_id)
            )

        # A directory of entities may contain files for each of multiple types of
        # entities. One example is that the definition detector detects both terms and
        # definitions. In that case, the colorizer colorizes all entities from all of
        # these files. Earlier entity extractor commands should include enough
        # information in the entity IDs so that the type of entities can be inferred
        # from the entity ID in later commands.
        entities_dir = directories.arxiv_subdir(f"detected-{entity_name}", arxiv_id)
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(entities_path, self.get_detected_entity_type())
            )

        original_sources_path = directories.arxiv_subdir("sources", arxiv_id)
        for tex_path in file_utils.find_files(
            original_sources_path, [".tex"], relative=True
        ):
            file_contents = file_utils.read_file_tolerant(
                os.path.join(original_sources_path, tex_path)
            )
            entities_for_tex_path = [
                e for e in entities if e.tex_path == tex_path or e.tex_path == "N/A"
            ]
            if file_contents is not None:
                yield LocationTask(
                    arxiv_id, tex_path, file_contents, entities_for_tex_path
                )

def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    entity_infos: List[EntityUploadInfo] = []

    # Load MathML matches for partial matching of symbols.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", processing_summary.arxiv_id),
        "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols.
    children: Dict[str, List[str]] = defaultdict(list)
    parents: Dict[str, str] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for record in file_utils.load_from_csv(children_path, SerializableChild):
            pid = f"{record.tex_path}-{record.equation_index}-{record.symbol_index}"
            cid = f"{record.tex_path}-{record.equation_index}-{record.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbols to their children for paper %s.",
            arxiv_id,
        )

    # Load contexts that the symbols appear in. Sort them by the symbol MathML.
    context_data_missing = False
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
        "contexts.csv",
    )
    if not os.path.exists(contexts_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Contexts have not been found for symbols for arXiv paper %s. "
            + "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )
        context_data_missing = True

    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    if not context_data_missing:
        for context in file_utils.load_from_csv(contexts_path, Context):
            tex_path = context.tex_path
            symbol_id = f"{tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)

    # Prepare collections of formulae that each symbol was found in.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    for localized_entity in processing_summary.entities:
        symbol = cast(SerializableSymbol, localized_entity.entity)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get the context and formula for this symbol, and for other symbols with
        # matching MathML.
        symbol_context = symbol_contexts.get(sid(symbol))
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol.
        tags: List[str] = []
        MAX_BOX_HEIGHT = 0.1
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    + "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            "mathml_near_matches": [
                m.matching_mathml for m in matches.get(symbol.mathml, [])
            ],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )
        parent_id = parents.get(sid(symbol))
        child_ids = children.get(sid(symbol), [])
        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation",
                id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [
                EntityReference(type_="symbol", id_=id_) for id_ in child_ids
            ],
            "sentence": EntityReference(type_="sentence", id_=sentence_id)
            if sentence_id is not None
            else EntityReference(type_="sentence", id_=None),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_) for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol.
        entity_information = EntityUploadInfo(
            id_=sid(symbol),
            type_="symbol",
            bounding_boxes=boxes,
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(
        processing_summary.s2_id, arxiv_id, entity_infos, data_version,
    )

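# The sid() helper used throughout upload_symbols is not shown in this excerpt. A
# minimal sketch, assuming it produces the same "tex_path-equation_index-symbol_index"
# format as the parent and child IDs built above; the real helper may differ.
def sid(symbol: "SerializableSymbol") -> str:
    # Build a unique string ID for a symbol from its position in the TeX.
    return f"{symbol.tex_path}-{symbol.equation_index}-{symbol.symbol_index}"
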
def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("embellished-sentences", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load equation data.
        equations: Equations = {}
        equations_path = os.path.join(
            directories.arxiv_subdir("detected-equations", arxiv_id), "entities.csv"
        )
        try:
            equation_data = file_utils.load_from_csv(equations_path, Equation)
            for equation in equation_data:
                equations[(equation.tex_path, int(equation.id_))] = equation
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No equation data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with equation data. This should only "
                + "be a problem if it's expected that there are no equations in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load symbols, for use in embellishing equations.
        symbols: Symbols = defaultdict(list)
        symbol_data = file_utils.load_symbols(arxiv_id)
        if symbol_data is not None:
            for id_, symbol in symbol_data:
                symbols[(id_.tex_path, id_.equation_index)].append(symbol)
        else:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No symbol data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load sentences.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id),
            "entities.csv",
        )
        try:
            sentences = file_utils.load_from_csv(detected_sentences_path, Sentence)
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        for sentence in sentences:
            yield Task(arxiv_id, sentence, equations, symbols)

def load(self) -> Iterator[PaperProcessingResult]:
    for arxiv_id in self.arxiv_ids:
        # Load the S2 ID for this paper.
        s2_id_path = os.path.join(
            directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id"
        )
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        # Load all extracted entities. See the note in 'colorize_tex.py' for why entities
        # might be saved in multiple files. If they are, then for this upload function to
        # work, each entity needs to have a unique pair of 'ID' and 'tex_path'.
        entities_dir = directories.arxiv_subdir(
            f"detected-{self.get_entity_name()}", arxiv_id
        )
        entities: List[SerializableEntity] = []
        for entities_path in glob.glob(os.path.join(entities_dir, "entities*.csv")):
            entities.extend(
                file_utils.load_from_csv(
                    entities_path,
                    self.get_detected_entity_type(os.path.basename(entities_path)),
                )
            )

        # Load locations for entities.
        locations_path = os.path.join(
            directories.arxiv_subdir(f"{self.get_entity_name()}-locations", arxiv_id),
            "entity_locations.csv",
        )
        if not os.path.exists(locations_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "No locations have been saved for entities in command '%s' for paper %s. No entities "
                + "will be uploaded for this paper.",
                str(self.get_name()),
                arxiv_id,
            )
            continue
        entity_location_infos = list(
            file_utils.load_from_csv(locations_path, EntityLocationInfo)
        )

        # Load contexts for all entities.
        contexts_loaded = False
        contexts_by_entity = {}
        if directories.registered(f"contexts-for-{self.get_entity_name()}"):
            contexts_path = os.path.join(
                directories.arxiv_subdir(
                    f"contexts-for-{self.get_entity_name()}", arxiv_id
                ),
                "contexts.csv",
            )
            if os.path.exists(contexts_path):
                contexts = file_utils.load_from_csv(contexts_path, Context)
                contexts_by_entity = {c.entity_id: c for c in contexts}
                contexts_loaded = True
        if not contexts_loaded:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No contexts have been saved for entities in command '%s' for paper %s. No "
                + "contexts will be saved for any of these entities.",
                str(self.get_name()),
                arxiv_id,
            )

        # Group each entity with its location and context. Then pass all entity
        # information to the upload function.
        entity_summaries = []
        for entity in entities:
            matching_locations = []
            for h in entity_location_infos:
                if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                    matching_locations.append(h)
            entity_summaries.append(
                EntityExtractionResult(
                    entity, matching_locations, contexts_by_entity.get(entity.id_)
                )
            )

        yield PaperProcessingResult(
            arxiv_id=arxiv_id,
            s2_id=s2_id,
            entities=entity_summaries,
        )

def load(self) -> Iterator[SymbolData]:
    for arxiv_id in self.arxiv_ids:
        s2_id = get_s2_id(arxiv_id)
        if s2_id is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols_by_id = {s.symbol_id: s.symbol for s in symbols_with_ids}

        boxes: Dict[SymbolId, BoundingBox] = {}
        boxes_path = os.path.join(
            directories.arxiv_subdir("symbol-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if not os.path.exists(boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        for location in file_utils.load_from_csv(boxes_path, SymbolLocation):
            symbol_id = SymbolId(
                tex_path=location.tex_path,
                equation_index=location.equation_index,
                symbol_index=location.symbol_index,
            )
            box = BoundingBox(
                page=int(location.page),
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            boxes[symbol_id] = box

        matches: Matches = {}
        matches_path = os.path.join(
            directories.arxiv_subdir("symbol-matches", arxiv_id), "matches.csv"
        )
        if not os.path.exists(matches_path):
            logging.warning(
                "Could not find symbol matches information for %s. Skipping",
                arxiv_id,
            )
            continue
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)

        context_data_missing = False
        contexts_path = os.path.join(
            directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
            "contexts.csv",
        )
        if not os.path.exists(contexts_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Contexts have not been found for symbols for arXiv paper %s. "
                + "Symbol data will be uploaded without contexts.",
                arxiv_id,
            )
            context_data_missing = True

        symbol_contexts = {}
        mathml_contexts = defaultdict(list)
        if not context_data_missing:
            for context in file_utils.load_from_csv(contexts_path, Context):
                tex_path = context.tex_path
                equation_index, symbol_index = [
                    int(t) for t in context.entity_id.split("-")
                ]
                symbol_id = SymbolId(tex_path, equation_index, symbol_index)
                symbol_contexts[symbol_id] = context
                symbol = symbols_by_id[symbol_id]
                mathml_contexts[symbol.mathml].append(context)

        symbol_formulas = {}
        mathml_formulas = defaultdict(set)
        for id_, symbol in symbols_by_id.items():
            if (
                symbol.is_definition
                and symbol.equation is not None
                and symbol.relative_start is not None
                and symbol.relative_end is not None
            ):
                highlighted = wrap_span(
                    symbol.equation,
                    symbol.relative_start,
                    symbol.relative_end,
                    before=r"\htmlClass{match-highlight}{",
                    after="}",
                    braces=True,
                )
                formula = DefiningFormula(
                    tex=highlighted,
                    tex_path=id_.tex_path,
                    equation_id=id_.equation_index,
                )
                symbol_formulas[id_] = formula
                mathml_formulas[symbol.mathml].add(formula)

        yield SymbolData(
            arxiv_id,
            s2_id,
            symbols_with_ids,
            boxes,
            symbol_contexts,
            symbol_formulas,
            mathml_contexts,
            mathml_formulas,
            matches,
        )

def upload_terms(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    arxiv_id = processing_summary.arxiv_id

    contexts = file_utils.load_from_csv(
        os.path.join(
            directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
            "contexts.csv",
        ),
        Context,
    )
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble the contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        if (term.tex_path, term.id_) in contexts_by_entity:
            contexts_by_term[term.text].append(
                contexts_by_entity[(term.tex_path, term.id_)]
            )

    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]

        # Cluster bounding boxes, in case any of these terms are defined as a macro (in
        # which case all appearances of that term on the same page will have been lumped
        # together).
        clusters = cluster_boxes(boxes, vertical_split=0.005)
        for i, cluster in enumerate(clusters):
            entity_info = EntityInformation(
                id_=f"{term.tex_path}-{term.id_}-{i}",
                type_="term",
                bounding_boxes=list(cluster),
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definitions,
                    "sources": term.sources,
                    "snippets": [
                        c.snippet for c in contexts_by_term.get(term.text, [])
                    ],
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{context.tex_path}-{context.sentence_id}"
                        if context is not None
                        else None,
                    ),
                    "snippet_sentences": [
                        EntityReference(
                            type_="sentence", id_=f"{c.tex_path}-{c.sentence_id}"
                        )
                        for c in contexts_by_term.get(term.text, [])
                    ],
                },
            )
            entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )