def load(self) -> Iterator[TexAndTokens]:
        """Yield the TeX sources and equation tokens for each paper.

        For every arXiv ID returned by ``get_arxiv_ids``, the paper's output
        directory for colorized equation tokens is passed to
        ``clean_directory`` before anything else is checked. Papers without a
        ``tokens.csv`` file in their symbols directory, or for which
        ``load_tokens`` returns ``None``, are skipped.

        Yields:
            One ``TexAndTokens`` per remaining paper, holding the arXiv ID, a
            dict mapping each token-bearing TeX path to its contents (files
            that ``read_file_tolerant`` could not read are left out), and the
            loaded tokens.
        """
        for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):

            output_root = get_data_subdirectory_for_arxiv_id(
                directories.SOURCES_WITH_COLORIZED_EQUATION_TOKENS_DIR,
                arxiv_id)
            clean_directory(output_root)

            tokens_path = os.path.join(directories.symbols(arxiv_id),
                                       "tokens.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            # Load token location information
            tokens = load_tokens(arxiv_id)
            if tokens is None:
                continue
            # A set comprehension already de-duplicates; no extra set() needed.
            tex_paths = {token.tex_path for token in tokens}

            # Load original sources for TeX files that need to be colorized.
            # The sources directory is the same for every file of this paper,
            # so look it up once rather than on each iteration.
            sources_dir = directories.sources(arxiv_id)
            contents_by_file = {}
            for tex_path in tex_paths:
                contents = read_file_tolerant(
                    os.path.join(sources_dir, tex_path))
                if contents is not None:
                    contents_by_file[tex_path] = contents

            yield TexAndTokens(arxiv_id, contents_by_file, tokens)
Exemple #2
0
 def load(self) -> Iterator[FileContents]:
     """Yield the contents of every readable ``.tex`` and ``.bbl`` file.

     For each arXiv ID found under ``SOURCES_DIR``, the paper's bibitems
     directory is passed to ``clean_directory`` and its sources directory is
     then searched for files. Files for which ``read_file_tolerant`` returns
     ``None`` are skipped.
     """
     for paper_id in get_arxiv_ids(SOURCES_DIR):
         paper_sources = sources(paper_id)
         clean_directory(directories.bibitems(paper_id))
         for file_path in find_files(paper_sources, [".tex", ".bbl"]):
             text = read_file_tolerant(file_path)
             if text is not None:
                 yield FileContents(paper_id, file_path, text)
    def load(self) -> Iterator[FileContents]:
        """Yield the contents of each readable ``.tex`` file, keyed by relative path.

        For each arXiv ID, the paper's colorized-citations output directory
        is passed to ``clean_directory`` first. TeX files are then listed
        relative to the paper's sources directory, and the yielded
        ``FileContents`` carries that relative path. Files for which
        ``read_file_tolerant`` returns ``None`` are skipped.
        """
        for paper_id in get_arxiv_ids(directories.SOURCES_DIR):

            clean_directory(
                get_data_subdirectory_for_arxiv_id(
                    directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR,
                    paper_id))

            sources_root = directories.sources(paper_id)
            relative_tex_paths = find_files(sources_root, [".tex"],
                                            relative=True)
            for relative_path in relative_tex_paths:
                absolute_path = os.path.join(sources_root, relative_path)
                text = read_file_tolerant(absolute_path)
                if text is None:
                    continue
                yield FileContents(paper_id, relative_path, text)
    def load(self) -> Iterator[TexAndSymbols]:
        """Yield the TeX sources, symbols, and characters for each paper.

        For every arXiv ID returned by ``get_arxiv_ids``, the paper's
        annotated-symbols output directory is passed to ``clean_directory``
        before anything else is checked. A paper is skipped when its symbols
        directory has no ``tokens.csv`` file, or when ``load_symbols`` or
        ``load_tokens`` returns ``None``.

        Yields:
            One ``TexAndSymbols`` per remaining paper, holding the arXiv ID, a
            dict mapping each token-bearing TeX path to its contents (files
            that ``read_file_tolerant`` could not read are left out), a dict
            of symbols keyed by symbol ID, and a dict of ``Character`` objects
            keyed by ``CharacterId``.
        """
        for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):

            output_root = get_data_subdirectory_for_arxiv_id(
                directories.SOURCES_WITH_ANNOTATED_SYMBOLS, arxiv_id)
            clean_directory(output_root)

            symbols_dir = directories.symbols(arxiv_id)
            tokens_path = os.path.join(symbols_dir, "tokens.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue
            symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

            tokens = load_tokens(arxiv_id)
            if tokens is None:
                continue
            # A set comprehension already de-duplicates; no extra set() needed.
            tex_paths = {t.tex_path for t in tokens}

            # Each token becomes one character, identified by the file,
            # equation, and token position it came from.
            characters: Dict[CharacterId, Character] = {
                CharacterId(token.tex_path, token.equation_index,
                            token.token_index):
                Character(token.text, token.token_index, token.start,
                          token.end)
                for token in tokens
            }

            # Load original sources for TeX files that need to be colorized.
            # The sources directory is the same for every file of this paper,
            # so look it up once rather than on each iteration.
            sources_dir = directories.sources(arxiv_id)
            contents_by_file = {}
            for tex_path in tex_paths:
                contents = read_file_tolerant(
                    os.path.join(sources_dir, tex_path))
                if contents is not None:
                    contents_by_file[tex_path] = contents

            yield TexAndSymbols(arxiv_id, contents_by_file, symbols,
                                characters)
 def process(self, item: ArxivId) -> Iterator[None]:
     """Unpack one paper into its sources directory, then yield ``None`` once.

     The destination is whatever ``directories.sources`` returns for
     ``item``. The single ``None`` carries no data.
     """
     destination = directories.sources(item)
     unpack(item, destination)
     yield