Example no. 1
0
    def load(self) -> Iterator[TexAndTokens]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-colorized-equation-tokens", arxiv_id)
            file_utils.clean_directory(output_root)

            tokens_path = os.path.join(
                directories.arxiv_subdir("detected-equation-tokens", arxiv_id),
                "entities.csv",
            )
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            # Load token location information
            tokens = file_utils.load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = set({token.tex_path for token in tokens})

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(
                    directories.arxiv_subdir("sources", arxiv_id), tex_path)
                file_contents = file_utils.read_file_tolerant(
                    absolute_tex_path)
                if file_contents is not None:
                    contents_by_file[tex_path] = file_contents

            yield TexAndTokens(arxiv_id, contents_by_file, tokens)
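
For reference, a rough sketch of the shape of the container yielded above, inferred only from how it is constructed at this call site; the project's actual TexAndTokens definition may use different field names or types.

from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass(frozen=True)
class TexAndTokensSketch:
    # Hypothetical reconstruction of TexAndTokens, based only on the call site above.
    arxiv_id: str
    tex_contents: Dict[str, str]  # TeX source, keyed by path relative to the sources dir.
    tokens: List[Any]  # Whatever file_utils.load_tokens returns for this paper.
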
Example no. 2
0
    def load(self) -> Iterator[ColorizationTask]:
        for arxiv_id in self.arxiv_ids:
            output_root = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_root)

            entities_path = os.path.join(
                directories.arxiv_subdir(self.get_detected_entities_dirkey(),
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_detected_entity_type()))

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                entities_for_tex_path = [
                    e for e in entities if e.tex_path == tex_path
                ]
                if file_contents is not None:
                    yield ColorizationTask(arxiv_id, tex_path, file_contents,
                                           entities_for_tex_path)
Example no. 3
0
 def load(self) -> Iterator[ArxivId]:
     for arxiv_id in self.arxiv_ids:
         file_utils.clean_directory(
             directories.arxiv_subdir("detected-equation-tokens", arxiv_id))
         file_utils.clean_directory(
             directories.arxiv_subdir("detected-symbols", arxiv_id))
         yield arxiv_id
Example no. 4
0
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                "composite-symbols-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            token_locations = file_utils.load_equation_token_locations(
                arxiv_id)
            if token_locations is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for symbol_with_id in symbols_with_ids:
                # Symbols with affixes (e.g., arrows, hats) cannot be localized by taking the union
                # of their tokens' bounding boxes, because the bounding boxes of affix tokens
                # cannot be detected on their own.
                if not symbol_with_id.symbol.contains_affix:
                    yield LocationTask(
                        arxiv_id=arxiv_id,
                        token_locations=token_locations,
                        symbol_with_id=symbol_with_id,
                    )
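
The comment above refers to localizing a composite symbol by taking the union of its tokens' bounding boxes. As a rough illustration of that union step, here is a sketch with a hypothetical Box type; it is not this project's BoundingBox class.

from dataclasses import dataclass
from typing import Iterable


@dataclass
class Box:
    # Hypothetical stand-in for the pipeline's bounding box type.
    left: float
    top: float
    width: float
    height: float


def union_boxes(boxes: Iterable[Box]) -> Box:
    " Smallest box that covers every input box. "
    boxes = list(boxes)
    left = min(b.left for b in boxes)
    top = min(b.top for b in boxes)
    right = max(b.left + b.width for b in boxes)
    bottom = max(b.top + b.height for b in boxes)
    return Box(left, top, right - left, bottom - top)
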
Example no. 5
0
    def load(self) -> Iterator[MatchTask]:
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir("bibitem-resolutions", arxiv_id))
            bibitems_dir = directories.arxiv_subdir("detected-citations",
                                                    arxiv_id)
            metadata_dir = directories.arxiv_subdir("s2-metadata", arxiv_id)

            references_path = os.path.join(metadata_dir, "references.csv")
            if not os.path.exists(references_path):
                logging.warning(
                    "Could not find %s, skipping reference resolution for paper %s",
                    references_path,
                    arxiv_id,
                )
                continue
            references = list(
                file_utils.load_from_csv(references_path,
                                         SerializableReference))

            bibitems_path = os.path.join(bibitems_dir, "entities.csv")
            if not os.path.exists(bibitems_path):
                logging.warning(
                    "Could not find %s, skipping reference resolution for paper %s",
                    bibitems_path,
                    arxiv_id,
                )
                continue
            bibitems = list(file_utils.load_from_csv(bibitems_path, Bibitem))

            yield MatchTask(arxiv_id, bibitems, references)
Example no. 6
0
    def load(self) -> Iterator[ColorizationTask]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-colorized-citations", arxiv_id)
            file_utils.clean_directory(output_root)

            bibitems_path = os.path.join(
                directories.arxiv_subdir("bibitems", arxiv_id), "bibitems.csv")
            if not os.path.exists(bibitems_path):
                logging.warning(
                    "No bibitems were found for paper %s. Skipping", arxiv_id)
                continue

            bibitems = file_utils.load_from_csv(bibitems_path, Bibitem)
            bibitem_keys = [b.key for b in bibitems if b.key is not None]

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                if file_contents is not None:
                    yield ColorizationTask(arxiv_id, tex_path, file_contents,
                                           bibitem_keys)
Example no. 7
0
    def load(self) -> Iterator[DetectDefinitionsTask]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("detected-definitions",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load cleaned sentences for definition detection.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("sentence-tokens", arxiv_id),
                "sentences.csv",
            )
            try:
                sentences = list(
                    file_utils.load_from_csv(detected_sentences_path,
                                             EmbellishedSentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detected sentences for this paper.",
                    arxiv_id,
                )
                continue

            # Read in all TeX. Once definition detection is finished, all the TeX will be searched
            # for references to the defined terms.
            tex_by_file = file_utils.read_tex(arxiv_id)

            yield DetectDefinitionsTask(arxiv_id, sentences, tex_by_file)
Example no. 8
0
    def load(self) -> Iterator[SymbolSentencesTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentences-for-symbols",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            token_sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-equation-tokens",
                                         arxiv_id),
                "entity_sentences.csv",
            )
            if not os.path.exists(token_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Could not find links between sentences and equation tokens at "
                    +
                    "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                    token_sentences_path,
                    arxiv_id,
                )
                continue

            token_sentence_pairs = list(
                file_utils.load_from_csv(token_sentences_path,
                                         EntitySentencePairIds))

            symbols = file_utils.load_symbols(arxiv_id)
            if not symbols:
                continue

            # Filter to only those symbols for which tokens have been detected
            symbols = [s for s in symbols if len(s.symbol.characters) > 0]

            yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)
Example no. 9
0
 def load(self) -> Iterator[ExtractionTask]:
     for arxiv_id in self.arxiv_ids:
         sources_dir = directories.arxiv_subdir("sources", arxiv_id)
         file_utils.clean_directory(
             directories.arxiv_subdir("detected-citations", arxiv_id))
         for path in file_utils.find_files(sources_dir, [".tex", ".bbl"]):
             file_contents = file_utils.read_file_tolerant(path)
             if file_contents is None:
                 continue
             yield ExtractionTask(arxiv_id, file_contents)
Example no. 10
0
def unpack(arxiv_id: str, unpack_path: str) -> Optional[str]:
    archive_path = directories.arxiv_subdir("sources-archives", arxiv_id)
    if not os.path.exists(archive_path):
        logging.warning("No source archive found for %s", arxiv_id)
        return None
    if os.path.exists(unpack_path):
        logging.warning("Directory already found at %s. Deleting contents.",
                        unpack_path)
        clean_directory(unpack_path)
    _unpack(archive_path, unpack_path)
    return unpack_path
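
_unpack itself is not shown. A minimal sketch of what it might do, assuming the source archive is a gzipped tarball (arXiv also serves single-file gzip sources, which this sketch ignores):

import os
import tarfile


def _unpack_sketch(archive_path: str, dest_dir: str) -> None:
    # Assumed behavior only: extract a .tar.gz archive into dest_dir.
    os.makedirs(dest_dir, exist_ok=True)
    with tarfile.open(archive_path, "r:gz") as archive:
        archive.extractall(dest_dir)
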
Example no. 11
0
    def load(self) -> Iterator[Task]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                f"contexts-for-{self.get_entity_name()}", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load entities from file.
            # Load in all extracted entities. See note in 'colorize_tex.py' for why entities
            # might be saved in multiple files. If they are, for this upload function to work,
            # each of the entities needs to have a unique pair of 'ID' and 'tex_path'.
            entities_dir = directories.arxiv_subdir(
                f"detected-{self.get_entity_name()}", arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_entity_type()))

            # Load sentences from file.
            sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv")
            try:
                sentences = list(
                    file_utils.load_from_csv(sentences_path, Sentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there was likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            tex_paths = {e.tex_path for e in entities}
            for tex_path in tex_paths:
                entities_for_file = [
                    e for e in entities if e.tex_path == tex_path
                ]
                sentences_for_file = [
                    s for s in sentences if s.tex_path == tex_path
                ]
                yield Task(arxiv_id, tex_path, entities_for_file,
                           sentences_for_file)
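
The per-file filtering above rescans the full entity and sentence lists once per TeX file. An equivalent single-pass grouping, shown here only as a sketch (the objects are assumed to expose a 'tex_path' attribute, as in the snippet), would be:

from collections import defaultdict
from typing import Dict, Iterable, List, Tuple


def group_by_tex_path(entities: Iterable, sentences: Iterable) -> Tuple[Dict, Dict]:
    # Bucket entities and sentences by the TeX file they came from, one pass over each list.
    entities_by_file: Dict[str, List] = defaultdict(list)
    for e in entities:
        entities_by_file[e.tex_path].append(e)
    sentences_by_file: Dict[str, List] = defaultdict(list)
    for s in sentences:
        sentences_by_file[s.tex_path].append(s)
    return entities_by_file, sentences_by_file
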
Example no. 12
0
    def load(self) -> Iterator[MathMLForPaper]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("symbol-matches", arxiv_id)
            file_utils.clean_directory(output_dir)

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            symbols_mathml = {swi.symbol.mathml for swi in symbols_with_ids}

            yield MathMLForPaper(arxiv_id=arxiv_id,
                                 mathml_equations=symbols_mathml)
Example no. 13
0
    def load(self) -> Iterator[DetectionTask]:
        for arxiv_id in self.arxiv_ids:
            output_root = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_root)

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                if file_contents is not None:
                    yield DetectionTask(arxiv_id, tex_path, file_contents)
Example no. 14
0
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:
            for output_base_dir in self.output_base_dirs.values():
                file_utils.clean_directory(
                    directories.arxiv_subdir(output_base_dir, arxiv_id))

            # A directory of entities may contain files for each of multiple types of entities.
            # One example is that the definition detector detects both terms and definitions.
            # In that case, the colorizer colorizes all entities from all of these files.
            # Earlier entity extractor commands should include enough information in the entity IDs
            # so that the type of entities can be inferred from the entity ID in later commands.
            entities_dir = directories.arxiv_subdir(self.get_input_dirkey(),
                                                    arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_detected_entity_type()))

            main_tex_files = get_compiled_tex_files(
                directories.arxiv_subdir("compiled-normalized-sources",
                                         arxiv_id))
            normalized_sources_path = directories.arxiv_subdir(
                "normalized-sources", arxiv_id)
            for tex_file in main_tex_files:
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(normalized_sources_path, tex_file.path))
                options = self.get_colorize_options()
                entities_for_tex_path = [
                    e for e in entities
                    if e.tex_path == tex_file.path or e.tex_path == "N/A"
                ]
                if options.when is not None:
                    entities_for_tex_path = list(
                        filter(options.when, entities_for_tex_path))
                if file_contents is not None:
                    group_func = options.group or (lambda entities: [entities])
                    for group_index, entity_group in enumerate(
                            group_func(entities_for_tex_path)):
                        yield LocationTask(
                            arxiv_id,
                            tex_file.path,
                            file_contents,
                            entity_group,
                            group_index,
                        )
Example no. 15
0
    def load(self) -> Iterator[RasterTask]:

        for arxiv_id in self.arxiv_ids:

            # Clean all past output for this arXiv ID.
            output_dir_for_arxiv_id = directories.arxiv_subdir("paper-images", arxiv_id)
            file_utils.clean_directory(output_dir_for_arxiv_id)

            paper_abs_path = directories.arxiv_subdir(
                "compiled-normalized-sources", arxiv_id
            )
            output_files = get_output_files(paper_abs_path)
            for output_file in output_files:
                yield RasterTask(
                    arxiv_id, output_file.output_type, output_file.path,
                )
Example no. 16
0
    def load(self) -> Iterator[TexAndSymbols]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-annotated-symbols", arxiv_id)
            file_utils.clean_directory(output_root)

            symbols_dir = directories.arxiv_subdir("detected-equation-tokens",
                                                   arxiv_id)
            tokens_path = os.path.join(symbols_dir, "entities.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue
            symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

            tokens = file_utils.load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = set({t.tex_path for t in tokens})

            characters: Dict[CharacterId, Character] = {}
            for token in tokens:
                character_id = CharacterId(token.tex_path,
                                           token.equation_index,
                                           token.token_index)
                characters[character_id] = Character(token.text,
                                                     token.token_index,
                                                     token.start, token.end)

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(
                    directories.arxiv_subdir("sources", arxiv_id), tex_path)
                file_contents = file_utils.read_file_tolerant(
                    absolute_tex_path)
                if file_contents is not None:
                    contents_by_file[tex_path] = file_contents

            yield TexAndSymbols(arxiv_id, contents_by_file, symbols,
                                characters)
Example no. 17
0
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("citation-cluster-locations",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            boxes_by_hue_iteration = file_utils.load_citation_hue_locations(
                arxiv_id)
            if boxes_by_hue_iteration is None:
                continue

            boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
            for iteration in directories.iteration_names(
                    "sources-with-colorized-citations", arxiv_id):
                citation_hues_path = os.path.join(
                    directories.iteration(
                        "sources-with-colorized-citations",
                        arxiv_id,
                        iteration,
                    ),
                    "entity_hues.csv",
                )
                if not os.path.exists(citation_hues_path):
                    logging.warning(
                        "Could not find citation hue colors for %s iteration %s. Skipping",
                        arxiv_id,
                        iteration,
                    )
                    continue
                for record in file_utils.load_from_csv(citation_hues_path,
                                                       ColorizationRecord):
                    key = record.entity_id
                    if key not in boxes_by_citation_key:
                        boxes_by_citation_key[key] = []
                    hue_iteration = HueIteration(record.hue, iteration)
                    boxes_by_citation_key[key].extend(
                        boxes_by_hue_iteration.get(hue_iteration, []))

            for key, boxes in boxes_by_citation_key.items():
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    citation_key=key,
                    boxes=boxes,
                )
Example no. 18
0
    def load(self) -> Iterator[Task]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                f"contexts-for-{self.get_entity_name()}", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load entities from file.
            entities_path = os.path.join(
                directories.arxiv_subdir(f"detected-{self.get_entity_name()}",
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_entity_type()))

            # Load sentences from file.
            sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv")
            try:
                sentences = list(
                    file_utils.load_from_csv(sentences_path, Sentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there was likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            tex_paths = {e.tex_path for e in entities}
            for tex_path in tex_paths:
                entities_for_file = [
                    e for e in entities if e.tex_path == tex_path
                ]
                sentences_for_file = [
                    s for s in sentences if s.tex_path == tex_path
                ]
                yield Task(arxiv_id, tex_path, entities_for_file,
                           sentences_for_file)
Example no. 19
0
    def load(self) -> Iterator[PageRasterPair]:
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir)

            # Get output file names from results of compiling the uncolorized TeX sources.
            output_files = get_output_files(
                directories.arxiv_subdir("compiled-sources", arxiv_id))
            if len(output_files) == 0:
                continue

            for iteration in directories.iteration_names(
                    self.get_raster_base_dirkey(), arxiv_id):

                original_images_dir = directories.arxiv_subdir(
                    "paper-images", arxiv_id)
                modified_images_dir = directories.iteration(
                    self.get_raster_base_dirkey(), arxiv_id, iteration)

                for output_file in output_files:
                    relative_file_path = output_file.path
                    original_images_path = os.path.join(
                        original_images_dir, relative_file_path)
                    for img_name in os.listdir(original_images_path):
                        original_img_path = os.path.join(
                            original_images_path, img_name)
                        modified_img_path = os.path.join(
                            modified_images_dir, relative_file_path, img_name)
                        if not os.path.exists(modified_img_path):
                            logging.warning(
                                "Could not find expected image %s. Skipping diff for this paper.",
                                modified_img_path,
                            )
                            break

                        original_img = cv2.imread(original_img_path)
                        modified_img = cv2.imread(modified_img_path)
                        yield PageRasterPair(
                            arxiv_id,
                            iteration,
                            relative_file_path,
                            img_name,
                            original_img,
                            modified_img,
                        )
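
Each PageRasterPair pairs an original page raster with its colorized counterpart. A consumer might compare the two roughly as follows; this is only a sketch, not this pipeline's actual diff step, and it assumes both images are the 3-channel arrays cv2.imread returns by default, with identical dimensions.

import cv2
import numpy as np


def changed_pixel_mask(original_img: np.ndarray, modified_img: np.ndarray) -> np.ndarray:
    # True wherever any channel differs between the original and colorized rasters.
    diff = cv2.absdiff(original_img, modified_img)
    return np.any(diff != 0, axis=2)
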
Example no. 20
0
    def load(self) -> Iterator[Task]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentence-tokens", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load symbols, for use in embellishing equations.
            symbols: Dict[str, List[Symbol]] = defaultdict(list)
            symbol_data = file_utils.load_symbols(arxiv_id)
            if symbol_data is not None:
                for id_, symbol in symbol_data:
                    symbols[id_.tex_path].append(symbol)
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No symbol data found for arXiv ID %s. It will not be " +
                    "possible to expand equations in sentences with symbol data. This should only "
                    +
                    "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load sentences.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv",
            )
            if not os.path.exists(detected_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            sentences = file_utils.load_from_csv(detected_sentences_path,
                                                 Sentence)
            for sentence in sentences:
                yield Task(arxiv_id, sentence, symbols[sentence.tex_path])
Example no. 21
0
 def _cleanup_from_last_batch() -> None:
     " Clean up output directories from the last batch. "
     if batch_index > -1 and not self.args.keep_intermediate_files:
         logging.debug(  # pylint: disable=logging-not-lazy
             "Deleting intermediate files used to locate entities (i.e., colorized "
             +
             "sources, compilation results, and rasters) for paper %s iteration %s",
             item.arxiv_id,
             iteration_id or "''",
         )
         intermediate_files_dirs = [
             colorized_tex_dir,
             compiled_tex_dir,
             raster_output_dir,
             diffs_output_dir,
         ]
         for dir_ in intermediate_files_dirs:
             if dir_ and os.path.exists(dir_):
                 file_utils.clean_directory(dir_)
                 os.rmdir(dir_)
Example no. 22
0
    def load(self) -> Iterator[RasterTask]:

        for arxiv_id in self.arxiv_ids:

            # Clean all past output for this arXiv ID.
            output_dir_for_arxiv_id = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir_for_arxiv_id)

            for paper_dir in self.get_paper_dirs(arxiv_id):
                paper_abs_path = os.path.join(
                    directories.dirpath(self.get_papers_base_dirkey()),
                    paper_dir)
                output_files = get_output_files(paper_abs_path)
                for output_file in output_files:
                    yield RasterTask(
                        paper_dir,
                        output_file.output_type,
                        output_file.path,
                        os.path.join(paper_abs_path, output_file.path),
                    )
Example no. 23
0
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("symbol-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            token_locations = file_utils.load_equation_token_locations(
                arxiv_id)
            if token_locations is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for symbol_with_id in symbols_with_ids:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    token_locations=token_locations,
                    symbol_with_id=symbol_with_id,
                )
Example no. 24
0
    def load(self) -> Iterator[Locations]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("symbols-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            all_locations: List[EntityLocationInfo] = []
            composite_symbols_path = os.path.join(
                directories.arxiv_subdir("composite-symbols-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if os.path.exists(composite_symbols_path):
                all_locations.extend(
                    file_utils.load_from_csv(composite_symbols_path, EntityLocationInfo)
                )
            else:
                logging.info(
                    "No locations could be found for composite symbols for paper %s.",
                    arxiv_id,
                )

            symbols_with_affixes_path = os.path.join(
                directories.arxiv_subdir("symbols-with-affixes-locations", arxiv_id),
                "entity_locations.csv",
            )
            if os.path.exists(symbols_with_affixes_path):
                all_locations.extend(
                    file_utils.load_from_csv(
                        symbols_with_affixes_path, EntityLocationInfo
                    )
                )
            else:
                logging.info(
                    "No locations could be found for symbols with affixes for paper %s.",
                    arxiv_id,
                )

            yield Locations(arxiv_id, all_locations)
Example no. 25
0
    def load(self) -> Iterator[PaperTokens]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("annotation-files", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load tokens.
            tokens_path = os.path.join(
                directories.arxiv_subdir("sentence-tokens", arxiv_id),
                "tokens.csv",
            )
            try:
                tokens = list(file_utils.load_from_csv(tokens_path, Token))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No tokens data found for arXiv paper %s. No annotation files will be "
                    + "generated for this paper.",
                    arxiv_id,
                )
                continue

            yield PaperTokens(arxiv_id, tokens)
Example no. 26
0
    def load(self) -> Iterator[CompilationTask]:

        sources_base_dir = directories.dirpath(self.get_sources_base_dirkey())
        output_base_dir = directories.dirpath(self.get_output_base_dirkey())
        for arxiv_id in self.arxiv_ids:

            # Clean all past output for this arXiv ID.
            output_dir_for_arxiv_id = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id
            )
            file_utils.clean_directory(output_dir_for_arxiv_id)

            for source_dir in self.get_source_dirs(arxiv_id):
                qualified_source_dir = os.path.join(sources_base_dir, source_dir)
                output_dir = os.path.join(output_base_dir, source_dir)
                if os.path.exists(output_dir):
                    logging.warning(
                        "Compilation directory already exists in %s. Deleting.",
                        output_dir,
                    )
                    shutil.rmtree(output_dir)
                shutil.copytree(qualified_source_dir, output_dir)
                yield CompilationTask(arxiv_id, output_dir)
Example no. 27
0
    def load(self) -> Iterator[DetectionTask]:
        for arxiv_id in self.arxiv_ids:
            output_root = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_root)

            for main_tex_file in get_compiled_tex_files(
                    directories.arxiv_subdir("compiled-normalized-sources",
                                             arxiv_id)):
                # While the directory of compiled sources is inspected to find out which TeX
                # files were compiled, entities should be detected in the un-compiled sources,
                # because AutoTeX sometimes modifies the TeX files during compilation, meaning
                # that character offsets found in compiled TeX files won't match the character
                # offsets of the same text in the original TeX files downloaded from arXiv.
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(
                        directories.arxiv_subdir("normalized-sources",
                                                 arxiv_id),
                        main_tex_file.path,
                    ))
                if file_contents is not None:
                    yield DetectionTask(arxiv_id, main_tex_file.path,
                                        file_contents)
Example no. 28
0
    def load(self) -> Iterator[LocationTask]:

        entity_name = self.get_entity_name()
        for arxiv_id in self.arxiv_ids:
            for output_base_dir in self.output_base_dirs.values():
                file_utils.clean_directory(
                    directories.arxiv_subdir(output_base_dir, arxiv_id))

            # A directory of entities may contain files for each of multiple types of entities.
            # One example is that the definition detector detects both terms and definitions.
            # In that case, the colorizer colorizes all entities from all of these files.
            # Earlier entity extractor commands should include enough information in the entity IDs
            # so that the type of entities can be inferred from the entity ID in later commands.
            entities_dir = directories.arxiv_subdir(f"detected-{entity_name}",
                                                    arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_detected_entity_type()))

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                entities_for_tex_path = [
                    e for e in entities
                    if e.tex_path == tex_path or e.tex_path == "N/A"
                ]
                if file_contents is not None:
                    yield LocationTask(arxiv_id, tex_path, file_contents,
                                       entities_for_tex_path)
Example no. 29
0
    def load(self) -> Iterator[SearchTask]:
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir)

            # Get output file names from results of compiling the uncolorized TeX sources.
            output_files = get_output_files(
                directories.arxiv_subdir("compiled-sources", arxiv_id))

            for iteration in directories.iteration_names(
                    self.get_diff_images_base_dirkey(), arxiv_id):

                diff_images_dir = directories.iteration(
                    self.get_diff_images_base_dirkey(), arxiv_id, iteration)

                hue_searches = self.load_hues(arxiv_id, iteration)
                hue_searches_by_file: Dict[Path, List[HueSearchRegion]] = {}
                for search in hue_searches:
                    output_paths = [f.path for f in output_files]
                    files_to_search = ([search.relative_file_path]
                                       if search.relative_file_path is not None
                                       else output_paths)
                    for path in files_to_search:
                        if path not in hue_searches_by_file:
                            hue_searches_by_file[path] = []
                        hue_searches_by_file[path].append(search)

                for relative_file_path, search_regions in hue_searches_by_file.items(
                ):

                    diff_images_file_path = os.path.join(
                        diff_images_dir, relative_file_path)
                    page_images = {}

                    colorization_error_detected = False
                    for img_name in os.listdir(diff_images_file_path):
                        img_path = os.path.join(diff_images_file_path,
                                                img_name)
                        page_image = cv2.imread(img_path)

                        if not self.args.skip_visual_validation:
                            if contains_black_pixels(page_image):
                                logging.warning(
                                    "Black pixels found in image diff %s",
                                    img_path)
                                colorization_error_detected = True

                        page_number = (int(
                            os.path.splitext(img_name)[0].replace("page-", ""))
                                       - 1)
                        page_images[page_number] = page_image

                    if colorization_error_detected:
                        logging.warning(  # pylint: disable=logging-not-lazy
                            "Colorization error detected. Skipping hue location for "
                            + "iteration %s for arXiv paper %s",
                            iteration,
                            arxiv_id,
                        )
                        break

                    for search_region in search_regions:
                        yield SearchTask(
                            arxiv_id,
                            iteration,
                            page_images,
                            relative_file_path,
                            search_region,
                        )
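
The zero-based page index above is parsed from raster file names assumed to look like 'page-N.png'. A small illustration of that conversion:

import os


def page_index(img_name: str) -> int:
    # "page-3.png" -> 2, matching the computation in the loop above.
    return int(os.path.splitext(img_name)[0].replace("page-", "")) - 1


assert page_index("page-3.png") == 2
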
Example no. 30
0
    def load(self) -> Iterator[Task]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("embellished-sentences",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load equation data.
            equations: Equations = {}
            equations_path = os.path.join(
                directories.arxiv_subdir("detected-equations", arxiv_id),
                "entities.csv")
            try:
                equation_data = file_utils.load_from_csv(
                    equations_path, Equation)
                for equation in equation_data:
                    equations[(equation.tex_path,
                               int(equation.id_))] = equation
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No equation data found for arXiv ID %s. It will not be " +
                    "possible to expand equations in sentences with symbol data. This should only "
                    +
                    "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load symbols, for use in embellishing equations.
            symbols: Symbols = defaultdict(list)
            symbol_data = file_utils.load_symbols(arxiv_id)
            if symbol_data is not None:
                for id_, symbol in symbol_data:
                    symbols[(id_.tex_path, id_.equation_index)].append(symbol)
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No symbol data found for arXiv ID %s. It will not be " +
                    "possible to expand equations in sentences with symbol data. This should only "
                    +
                    "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load sentences.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv",
            )
            try:
                sentences = file_utils.load_from_csv(detected_sentences_path,
                                                     Sentence)
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detcting sentences for this paper.",
                    arxiv_id,
                )
                continue

            for sentence in sentences:
                yield Task(arxiv_id, sentence, equations, symbols)