def load(self) -> Iterator[TexAndTokens]:
    """Yield, for each paper that has equation token data, the contents of the
    TeX files those tokens refer to, together with the tokens themselves.

    Papers without a tokens.csv, or whose token data fails to load, are
    skipped with a log message. Unreadable TeX files are silently omitted
    from the yielded contents map.
    """
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        # Start each paper with a clean output directory so stale results
        # from a previous run cannot survive.
        results_dir = get_data_subdirectory_for_arxiv_id(
            directories.SOURCES_WITH_COLORIZED_EQUATION_TOKENS_DIR, arxiv_id
        )
        clean_directory(results_dir)

        token_csv = os.path.join(directories.symbols(arxiv_id), "tokens.csv")
        if not os.path.exists(token_csv):
            logging.info(
                "No equation token data found for paper %s. Skipping.", arxiv_id
            )
            continue

        tokens = load_tokens(arxiv_id)
        if tokens is None:
            continue

        # Read the original contents of every TeX file that at least one
        # token points at; files that cannot be read are left out.
        sources_dir = directories.sources(arxiv_id)
        file_contents = {}
        for relative_path in {t.tex_path for t in tokens}:
            text = read_file_tolerant(os.path.join(sources_dir, relative_path))
            if text is not None:
                file_contents[relative_path] = text

        yield TexAndTokens(arxiv_id, file_contents, tokens)
def load(self) -> Iterator[FileContents]:
    """Yield the contents of every .tex and .bbl source file of every paper.

    Clears each paper's bibitems output directory before yielding that
    paper's files, and skips any file whose contents cannot be read.
    """
    # Consistency fix: qualify SOURCES_DIR / sources() through the
    # `directories` module, matching every other loader in this file.
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        sources_dir = directories.sources(arxiv_id)
        # Remove stale bibitem extraction results for this paper.
        clean_directory(directories.bibitems(arxiv_id))
        for path in find_files(sources_dir, [".tex", ".bbl"]):
            contents = read_file_tolerant(path)
            if contents is None:
                continue
            yield FileContents(arxiv_id, path, contents)
def load(self) -> Iterator[FileContents]:
    """Yield the contents of each readable .tex file of every paper.

    Resets the paper's colorized-citations output directory first; paths
    are yielded relative to the paper's sources directory.
    """
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        # Clear out results from any previous colorization run.
        clean_directory(
            get_data_subdirectory_for_arxiv_id(
                directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR, arxiv_id
            )
        )
        sources_root = directories.sources(arxiv_id)
        for relative_path in find_files(sources_root, [".tex"], relative=True):
            text = read_file_tolerant(os.path.join(sources_root, relative_path))
            if text is None:
                continue
            yield FileContents(arxiv_id, relative_path, text)
def load(self) -> Iterator[TexAndSymbols]:
    """Yield, per paper, the TeX file contents alongside that paper's
    symbols and equation characters.

    A paper is skipped when its tokens.csv is missing or when either its
    symbol or token data fails to load. Unreadable TeX files are omitted
    from the yielded contents map.
    """
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        # Begin with a clean output directory for this paper.
        results_dir = get_data_subdirectory_for_arxiv_id(
            directories.SOURCES_WITH_ANNOTATED_SYMBOLS, arxiv_id
        )
        clean_directory(results_dir)

        token_csv = os.path.join(directories.symbols(arxiv_id), "tokens.csv")
        if not os.path.exists(token_csv):
            logging.info(
                "No equation token data found for paper %s. Skipping.", arxiv_id
            )
            continue

        symbols_with_ids = load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

        tokens = load_tokens(arxiv_id)
        if tokens is None:
            continue

        # Index each token's character data by its (file, equation, token)
        # identity so symbols can be tied back to concrete characters.
        characters: Dict[CharacterId, Character] = {
            CharacterId(t.tex_path, t.equation_index, t.token_index): Character(
                t.text, t.token_index, t.start, t.end
            )
            for t in tokens
        }

        # Read the original contents of every TeX file referenced by a
        # token; files that cannot be read are left out.
        sources_dir = directories.sources(arxiv_id)
        file_contents = {}
        for relative_path in {t.tex_path for t in tokens}:
            text = read_file_tolerant(os.path.join(sources_dir, relative_path))
            if text is not None:
                file_contents[relative_path] = text

        yield TexAndSymbols(arxiv_id, file_contents, symbols, characters)
def process(self, item: ArxivId) -> Iterator[None]:
    """Unpack the paper's source archive into its sources directory.

    Yields a single None: this step is run purely for its side effect.
    """
    destination = directories.sources(item)
    unpack(item, destination)
    yield None