Example #1
    def load(self) -> Iterator[DetectDefinitionsTask]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("detected-definitions",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load cleaned sentences for definition detection.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("sentence-tokens", arxiv_id),
                "sentences.csv",
            )
            try:
                sentences = list(
                    file_utils.load_from_csv(detected_sentences_path,
                                             EmbellishedSentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detected sentences for this paper.",
                    arxiv_id,
                )
                continue

            # Read in all TeX. Once definition detection is finished, all the TeX will be searched
            # for references to the defined terms.
            tex_by_file = file_utils.read_tex(arxiv_id)

            yield DetectDefinitionsTask(arxiv_id, sentences, tex_by_file)
Example #2
    def load(self) -> Iterator[ColorizationTask]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-colorized-citations", arxiv_id)
            file_utils.clean_directory(output_root)

            bibitems_path = os.path.join(
                directories.arxiv_subdir("bibitems", arxiv_id), "bibitems.csv")
            if not os.path.exists(bibitems_path):
                logging.warning(
                    "No bibitems were found for paper %s. Skipping", arxiv_id)
                continue

            bibitems = file_utils.load_from_csv(bibitems_path, Bibitem)
            bibitem_keys = [b.key for b in bibitems if b.key is not None]

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                if file_contents is not None:
                    yield ColorizationTask(arxiv_id, tex_path, file_contents,
                                           bibitem_keys)
Example #3
    def load(self) -> Iterator[SymbolSentencesTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentences-for-symbols",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            token_sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-equation-tokens",
                                         arxiv_id),
                "entity_sentences.csv",
            )
            if not os.path.exists(token_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Could not find links between sentences and equation tokens at "
                    +
                    "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                    token_sentences_path,
                    arxiv_id,
                )
                continue

            token_sentence_pairs = list(
                file_utils.load_from_csv(token_sentences_path,
                                         EntitySentencePairIds))

            symbols = file_utils.load_symbols(arxiv_id)
            if not symbols:
                continue

            # Filter to only those symbols for which tokens have been detected
            symbols = [s for s in symbols if len(s.symbol.characters) > 0]

            yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)
Example #4
    def load(self) -> Iterator[ColorizationTask]:
        for arxiv_id in self.arxiv_ids:
            output_root = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_root)

            entities_path = os.path.join(
                directories.arxiv_subdir(self.get_detected_entities_dirkey(),
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_detected_entity_type()))

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                entities_for_tex_path = [
                    e for e in entities if e.tex_path == tex_path
                ]
                if file_contents is not None:
                    yield ColorizationTask(arxiv_id, tex_path, file_contents,
                                           entities_for_tex_path)
Example #5
    def load(self) -> Iterator[TexAndTokens]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-colorized-equation-tokens", arxiv_id)
            file_utils.clean_directory(output_root)

            tokens_path = os.path.join(
                directories.arxiv_subdir("detected-equation-tokens", arxiv_id),
                "entities.csv",
            )
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            # Load token location information
            tokens = file_utils.load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = {token.tex_path for token in tokens}

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(
                    directories.arxiv_subdir("sources", arxiv_id), tex_path)
                file_contents = file_utils.read_file_tolerant(
                    absolute_tex_path)
                if file_contents is not None:
                    contents_by_file[tex_path] = file_contents

            yield TexAndTokens(arxiv_id, contents_by_file, tokens)
Example #6
    def load(self) -> Iterator[ArxivId]:
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir("detected-equation-tokens", arxiv_id))
            file_utils.clean_directory(
                directories.arxiv_subdir("detected-symbols", arxiv_id))
            yield arxiv_id
Example #7
    def load(self) -> Iterator[MatchTask]:
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir("bibitem-resolutions", arxiv_id))
            bibitems_dir = directories.arxiv_subdir("detected-citations",
                                                    arxiv_id)
            metadata_dir = directories.arxiv_subdir("s2-metadata", arxiv_id)

            references_path = os.path.join(metadata_dir, "references.csv")
            if not os.path.exists(references_path):
                logging.warning(
                    "Could not find %s, skipping reference resolution for paper %s",
                    references_path,
                    arxiv_id,
                )
                continue
            references = list(
                file_utils.load_from_csv(references_path,
                                         SerializableReference))

            bibitems_path = os.path.join(bibitems_dir, "entities.csv")
            if not os.path.exists(bibitems_path):
                logging.warning(
                    "Could not find %s, skipping reference resolution for paper %s",
                    bibitems_path,
                    arxiv_id,
                )
                continue
            bibitems = list(file_utils.load_from_csv(bibitems_path, Bibitem))

            yield MatchTask(arxiv_id, bibitems, references)
Example #8
    def load(self) -> Iterator[CitationData]:
        for arxiv_id in self.arxiv_ids:

            # Load citation locations
            citation_locations = load_located_citations(arxiv_id)
            if citation_locations is None:
                continue

            # Load metadata for bibitems
            key_s2_ids: Dict[CitationKey, S2Id] = {}
            key_resolutions_path = os.path.join(
                directories.arxiv_subdir("bibitem-resolutions", arxiv_id),
                "resolutions.csv",
            )
            if not os.path.exists(key_resolutions_path):
                logging.warning(
                    "Could not find citation resolutions for %s. Skipping",
                    arxiv_id)
                continue
            for resolution in file_utils.load_from_csv(key_resolutions_path,
                                                       BibitemMatch):
                if resolution.key is not None:
                    key_s2_ids[resolution.key] = resolution.s2_id

            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            s2_data: Dict[S2Id, SerializableReference] = {}
            s2_metadata_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id),
                "references.csv")
            if not os.path.exists(s2_metadata_path):
                logging.warning(
                    "Could not find S2 metadata file for citations for %s. Skipping",
                    arxiv_id,
                )
                continue
            for metadata in file_utils.load_from_csv(s2_metadata_path,
                                                     SerializableReference):
                # Convert authors field to comma-delimited list of authors
                author_string = ",".join(
                    [a["name"] for a in ast.literal_eval(metadata.authors)])
                metadata = dataclasses.replace(metadata, authors=author_string)
                s2_data[metadata.s2_id] = metadata

            yield CitationData(
                arxiv_id,
                s2_id,
                citation_locations,
                key_s2_ids,
                s2_data,
            )
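
As a quick illustration of the authors-field conversion above, the following sketch assumes the serialized 'authors' value is a stringified Python list of author dicts with a 'name' key (the format implied by the ast.literal_eval call); the example value is made up.

import ast

serialized_authors = "[{'name': 'Ada Lovelace'}, {'name': 'Alan Turing'}]"  # hypothetical value

# Parse the Python literal, keep only the author names, and join them into a
# comma-delimited string, mirroring the conversion in the example above.
author_string = ",".join(a["name"] for a in ast.literal_eval(serialized_authors))
print(author_string)  # -> Ada Lovelace,Alan Turing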
Example #9
    def load(self) -> Iterator[CompilationTask]:
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir("compiled-sources", arxiv_id)
            if os.path.exists(output_dir):
                logging.warning(
                    "Compilation directory already exists in %s. Deleting.", output_dir,
                )
                shutil.rmtree(output_dir)
            shutil.copytree(directories.arxiv_subdir("sources", arxiv_id), output_dir)
            yield CompilationTask(arxiv_id, output_dir)
Example #10
    def load(self) -> Iterator[ExtractionTask]:
        for arxiv_id in self.arxiv_ids:
            sources_dir = directories.arxiv_subdir("sources", arxiv_id)
            file_utils.clean_directory(
                directories.arxiv_subdir("detected-citations", arxiv_id))
            for path in file_utils.find_files(sources_dir, [".tex", ".bbl"]):
                file_contents = file_utils.read_file_tolerant(path)
                if file_contents is None:
                    continue
                yield ExtractionTask(arxiv_id, file_contents)
Example #11
    def save(self, item: RasterTask, _: None) -> None:
        raster_pages(
            directories.arxiv_subdir("compiled-normalized-sources", item.arxiv_id),
            os.path.join(
                directories.arxiv_subdir("paper-images", item.arxiv_id),
                directories.escape_slashes(item.relative_output_file_path),
            ),
            item.relative_output_file_path,
            item.output_file_type,
        )
Example #12
    def load(self) -> Iterator[PaperProcessingResult]:
        for arxiv_id in self.arxiv_ids:

            # Load the S2 ID for this paper
            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            # Load in all extracted entities. See the note in 'colorize_tex.py' for why
            # entities might be saved in multiple files. If they are, each entity needs a
            # unique pair of 'ID' and 'tex_path' for this upload function to work.
            entities_dir = directories.arxiv_subdir(
                self.get_detected_entities_dirkey(), arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(
                        entities_path,
                        self.get_detected_entity_type(
                            os.path.basename(entities_path)),
                    ))

            # Load in locations of all detected hues.
            hue_locations_path = os.path.join(
                directories.arxiv_subdir(self.get_hue_locations_dirkey(),
                                         arxiv_id),
                "entity_locations.csv",
            )
            hue_location_infos = list(
                file_utils.load_from_csv(hue_locations_path, HueLocationInfo))

            # Group each entity with its location. Pass the entity information, and the detected
            # locations for the entity, to the upload function.
            localized_entities = []
            for entity in entities:
                matching_locations = []
                for h in hue_location_infos:
                    if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                        matching_locations.append(h)

                localized_entities.append(
                    EntityAndLocation(entity, matching_locations))

            yield PaperProcessingResult(
                arxiv_id=arxiv_id,
                s2_id=s2_id,
                localized_entities=localized_entities,
            )
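
The nested loop above scans every hue location for every entity. A minimal sketch of an equivalent grouping, keyed by (tex_path, entity ID) and reusing the names from the example above, might look like this; it is an alternative illustration, not the pipeline's own code.

from collections import defaultdict

# Index the detected locations once, then look up each entity's locations
# by its (tex_path, entity ID) pair.
locations_by_key = defaultdict(list)
for h in hue_location_infos:
    locations_by_key[(h.tex_path, h.entity_id)].append(h)

localized_entities = [
    EntityAndLocation(e, locations_by_key[(e.tex_path, e.id_)])
    for e in entities
]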
Example #13
def read_tex(arxiv_id: str) -> Dict[str, FileContents]:
    """
    Read the contents of all TeX files for this arXiv paper.
    """
    contents_by_file = {}
    sources_path = directories.arxiv_subdir("sources", arxiv_id)
    for tex_path in find_files(sources_path, [".tex"], relative=True):
        absolute_tex_path = os.path.join(
            directories.arxiv_subdir("sources", arxiv_id), tex_path)
        file_contents = read_file_tolerant(absolute_tex_path)
        if file_contents is not None:
            contents_by_file[tex_path] = file_contents

    return contents_by_file
Example #14
    def load(self) -> Iterator[PaperProcessingResult]:
        for arxiv_id in self.arxiv_ids:

            # Load the S2 ID for this paper
            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            # Load in all extracted entities.
            entities_path = os.path.join(
                directories.arxiv_subdir(self.get_detected_entities_dirkey(),
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_detected_entity_type()))

            # Load in locations of all detected hues.
            hue_locations_path = os.path.join(
                directories.arxiv_subdir(self.get_hue_locations_dirkey(),
                                         arxiv_id),
                "hue_locations.csv",
            )
            hue_location_infos = list(
                file_utils.load_from_csv(hue_locations_path, HueLocationInfo))

            # Group each entity with its location. Pass the entity information, and the detected
            # locations for the entity, to the upload function.
            localized_entities = []
            for entity in entities:
                matching_locations = []
                for h in hue_location_infos:
                    if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                        matching_locations.append(h)

                localized_entities.append(
                    EntityAndLocation(entity, matching_locations))

            yield PaperProcessingResult(
                arxiv_id=arxiv_id,
                s2_id=s2_id,
                localized_entities=localized_entities,
            )
Example #15
    def load(self) -> Iterator[Task]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                f"contexts-for-{self.get_entity_name()}", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load in all extracted entities. See the note in 'colorize_tex.py' for why
            # entities might be saved in multiple files. If they are, each entity needs a
            # unique pair of 'ID' and 'tex_path' for this command to work.
            entities_dir = directories.arxiv_subdir(
                f"detected-{self.get_entity_name()}", arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_entity_type()))

            # Load sentences from file.
            sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv")
            try:
                sentences = list(
                    file_utils.load_from_csv(sentences_path, Sentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there was likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            tex_paths = {e.tex_path for e in entities}
            for tex_path in tex_paths:
                entities_for_file = [
                    e for e in entities if e.tex_path == tex_path
                ]
                sentences_for_file = [
                    s for s in sentences if s.tex_path == tex_path
                ]
                yield Task(arxiv_id, tex_path, entities_for_file,
                           sentences_for_file)
Example #16
    def load(self) -> Iterator[DetectionTask]:
        for arxiv_id in self.arxiv_ids:
            output_root = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_root)

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                if file_contents is not None:
                    yield DetectionTask(arxiv_id, tex_path, file_contents)
Example #17
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:
            for output_base_dir in self.output_base_dirs.values():
                file_utils.clean_directory(
                    directories.arxiv_subdir(output_base_dir, arxiv_id))

            # A directory of entities may contain files for each of multiple types of entities.
            # One example is that the definition detector detects both terms and definitions.
            # In that case, the colorizer colorizes all entities from all of these files.
            # Earlier entity extractor commands should include enough information in the entity IDs
            # so that the type of entities can be inferred from the entity ID in later commands.
            entities_dir = directories.arxiv_subdir(self.get_input_dirkey(),
                                                    arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_detected_entity_type()))

            main_tex_files = get_compiled_tex_files(
                directories.arxiv_subdir("compiled-normalized-sources",
                                         arxiv_id))
            normalized_sources_path = directories.arxiv_subdir(
                "normalized-sources", arxiv_id)
            for tex_file in main_tex_files:
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(normalized_sources_path, tex_file.path))
                options = self.get_colorize_options()
                entities_for_tex_path = [
                    e for e in entities
                    if e.tex_path == tex_file.path or e.tex_path == "N/A"
                ]
                if options.when is not None:
                    entities_for_tex_path = list(
                        filter(options.when, entities_for_tex_path))
                if file_contents is not None:
                    group_func = options.group or (lambda entities: [entities])
                    for group_index, entity_group in enumerate(
                            group_func(entities_for_tex_path)):
                        yield LocationTask(
                            arxiv_id,
                            tex_file.path,
                            file_contents,
                            entity_group,
                            group_index,
                        )
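
A minimal sketch of a grouping callable compatible with how 'options.group' is used above: it receives the entities detected in one TeX file and returns a list of groups, each of which is colorized as its own LocationTask. The batch size here is made up.

from typing import List, Sequence

def batch_entities(entities: Sequence, batch_size: int = 5) -> List[List]:
    # Split the entities for a file into fixed-size batches; with no grouping
    # configured, the example above falls back to a single group containing
    # all entities (lambda entities: [entities]).
    return [list(entities[i:i + batch_size]) for i in range(0, len(entities), batch_size)]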
Example #18
    def save(self, item: SearchTask, result: HueLocation) -> None:
        logging.debug(
            "Found bounding box for %s, iteration %s, hue %f",
            item.relative_file_path,
            item.iteration,
            result.hue,
        )

        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(),
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_path = os.path.join(output_dir, "hue_locations.csv")

        file_utils.append_to_csv(
            output_path,
            HueLocationInfo(
                tex_path=item.search.record.tex_path,
                iteration=item.iteration,
                hue=result.hue,
                entity_id=item.search.record.entity_id,
                page=result.box.page,
                left=result.box.left,
                top=result.box.top,
                width=result.box.width,
                height=result.box.height,
                relative_file_path=item.relative_file_path,
            ),
        )
Example #19
    def load(self) -> Iterator[RasterTask]:

        for arxiv_id in self.arxiv_ids:

            # Clean all past output for this arXiv ID.
            output_dir_for_arxiv_id = directories.arxiv_subdir("paper-images", arxiv_id)
            file_utils.clean_directory(output_dir_for_arxiv_id)

            paper_abs_path = directories.arxiv_subdir(
                "compiled-normalized-sources", arxiv_id
            )
            output_files = get_output_files(paper_abs_path)
            for output_file in output_files:
                yield RasterTask(
                    arxiv_id, output_file.output_type, output_file.path,
                )
Example #20
    def save(self, item: LocationTask, result: CitationLocation) -> None:
        output_dir = directories.arxiv_subdir("citation-locations", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        locations_path = os.path.join(output_dir, "citation_locations.csv")
        file_utils.append_to_csv(locations_path, result)
Example #21
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                "composite-symbols-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            token_locations = file_utils.load_equation_token_locations(
                arxiv_id)
            if token_locations is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for symbol_with_id in symbols_with_ids:
                # Symbols with affixes (e.g., arrows, hats) cannot be localized by taking the union
                # of their tokens' bounding boxes, because the bounding boxes of affix tokens
                # cannot be detected on their own.
                if not symbol_with_id.symbol.contains_affix:
                    yield LocationTask(
                        arxiv_id=arxiv_id,
                        token_locations=token_locations,
                        symbol_with_id=symbol_with_id,
                    )
Example #22
def update_compilation_log(
    output_dir_key: str,
    arxiv_id: ArxivId,
    stdout: bytes,
    source_path: RelativePath,
    success: bool,
) -> None:

    arxiv_id_output_root = directories.arxiv_subdir(output_dir_key, arxiv_id)
    results_path = os.path.join(arxiv_id_output_root, "compilation_results.csv")

    missing_driver = is_driver_unimplemented(stdout)
    errors = list(get_errors(stdout))
    if missing_driver:
        logging.warning(  # pylint: disable=logging-not-lazy
            "Could not compile arXiv ID %s because colorization commands are missing for the"
            + "driver needed to compile that TeX project.",
            arxiv_id,
        )

    # Write the compilation result to the log.
    file_utils.append_to_csv(
        results_path,
        CompilationSummaryEntry(
            outcome="SUCCESS" if success else "FAILURE",
            source_path=source_path,
            missing_driver=missing_driver,
            errors=[e.decode("utf-8", "ignore") for e in errors],
        ),
    )
Example #23
def locate_entities(
    arxiv_id: ArxivId,
    modified_images_dir: RelativePath,
    diffed_images_dir: RelativePath,
    entity_hues: Dict[str, float],
) -> Optional[LocationResult]:

    # Get output file names from results of compiling the uncolorized TeX sources.
    output_files = get_output_files(
        directories.arxiv_subdir("compiled-sources", arxiv_id))
    output_paths = [f.path for f in output_files]

    black_pixels_found = False
    shifted_entity_ids: Set[str] = set()
    entity_locations: Dict[str, List[BoundingBox]] = defaultdict(list)

    for relative_file_path in output_paths:
        diffed_images_file_path = os.path.join(diffed_images_dir,
                                               relative_file_path)

        # Locate bounding boxes for each hue in the diffs.
        diff_images = {}
        if not os.path.exists(diffed_images_file_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Expected but could not find a directory %s from the image diffs. "
                +
                "This suggests that the colorized paper failed to compile. Hues "
                + "will not be searched for in this diff directory.",
                diffed_images_file_path,
            )
            return None

        for img_name in os.listdir(diffed_images_file_path):
            img_path = os.path.join(diffed_images_file_path, img_name)
            page_image = cv2.imread(img_path)

            if contains_black_pixels(page_image):
                logging.warning("Black pixels found in image diff %s",
                                img_path)
                black_pixels_found = True

            page_number = int(
                os.path.splitext(img_name)[0].replace("page-", "")) - 1
            diff_images[page_number] = page_image

        for entity_id, hue in entity_hues.items():
            for page_number, image in diff_images.items():
                boxes = extract_bounding_boxes(image, page_number, hue)
                for box in boxes:
                    entity_locations[entity_id].append(box)

        shifted_entity_ids.update(
            find_shifted_entities(arxiv_id, modified_images_dir,
                                  relative_file_path, entity_hues))

    return LocationResult(
        locations=entity_locations,
        shifted_entities=list(shifted_entity_ids),
        black_pixels_found=black_pixels_found,
    )
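
For reference, the page index computed above comes from the raster image file name. A small standalone sketch, assuming diff images are named like 'page-1.png' (the naming scheme is inferred from the code, not documented here):

import os

img_name = "page-3.png"  # hypothetical raster file name
page_number = int(os.path.splitext(img_name)[0].replace("page-", "")) - 1
print(page_number)  # -> 2 (zero-based page index)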
Example #24
def load_equation_token_locations(
    arxiv_id: ArxivId,
) -> Optional[Dict[TokenId, List[BoundingBox]]]:

    token_locations: Dict[TokenId, List[BoundingBox]] = {}
    token_locations_path = os.path.join(
        directories.arxiv_subdir("equation-tokens-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(token_locations_path):
        logging.warning(
            "Could not find bounding boxes information for %s. Skipping", arxiv_id,
        )
        return None

    for record in load_from_csv(token_locations_path, HueLocationInfo):
        equation_index, token_index = [int(t) for t in record.entity_id.split("-")]
        token_id = TokenId(record.tex_path, equation_index, token_index)
        box = BoundingBox(
            page=int(record.page),
            left=record.left,
            top=record.top,
            width=record.width,
            height=record.height,
        )
        if token_id not in token_locations:
            token_locations[token_id] = []
        token_locations[token_id].append(box)

    return token_locations
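
The entity ID parsing above assumes that, for equation tokens, 'entity_id' encodes the equation index and the token index separated by a hyphen. A tiny illustration with a made-up value:

entity_id = "3-12"  # hypothetical value: equation 3, token 12
equation_index, token_index = [int(t) for t in entity_id.split("-")]
print(equation_index, token_index)  # -> 3 12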
Example #25
    def save(self, item: DetectionTask, result: SerializableEntity) -> None:
        results_dir = directories.arxiv_subdir(self.get_output_base_dirkey(),
                                               item.arxiv_id)
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        entities_path = os.path.join(results_dir, "entities.csv")
        file_utils.append_to_csv(entities_path, result)
Example #26
def load_locations(
        arxiv_id: ArxivId,
        entity_name: str) -> Optional[Dict[EntityId, List[BoundingBox]]]:
    """
    Load bounding boxes for each entity. Entities can have multiple bounding boxes (as will
    be the case if they are split over multiple lines).
    """

    boxes_by_entity_id: Dict[EntityId, List[BoundingBox]] = defaultdict(list)
    bounding_boxes_path = os.path.join(
        directories.arxiv_subdir(f"{entity_name}-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(bounding_boxes_path):
        logging.warning(
            "Could not find bounding boxes information for entity of type %s for paper %s. Skipping.",
            entity_name,
            arxiv_id,
        )
        return None

    for hue_info in load_from_csv(bounding_boxes_path, EntityLocationInfo):
        box = BoundingBox(
            page=hue_info.page,
            left=hue_info.left,
            top=hue_info.top,
            width=hue_info.width,
            height=hue_info.height,
        )
        boxes_by_entity_id[hue_info.entity_id].append(box)

    return boxes_by_entity_id
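
A hypothetical caller of load_locations, assuming an entity type whose locations were written by an earlier command; both the arXiv ID and the entity name are made up.

boxes_by_entity_id = load_locations("1601.00978", "sentences")
if boxes_by_entity_id is not None:
    for entity_id, boxes in boxes_by_entity_id.items():
        # Each entity may have several bounding boxes, e.g. when it wraps lines.
        print(entity_id, len(boxes))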
Example #27
    def save(self, item: TexAndSymbols, result: AnnotationResult) -> None:
        output_sources_path = directories.arxiv_subdir(
            "sources-with-annotated-symbols", item.arxiv_id)
        logging.debug("Outputting to %s", output_sources_path)

        # Unpack the original sources into the output directory before annotating them.
        unpack_path = unpack(item.arxiv_id, output_sources_path)
        sources_unpacked = unpack_path is not None
        if unpack_path is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)

        if sources_unpacked:
            for annotated_file in result:
                full_tex_path = os.path.join(output_sources_path,
                                             annotated_file.tex_path)
                with open(full_tex_path, "w",
                          encoding=annotated_file.encoding) as tex_file:
                    tex_file.write(annotated_file.contents)

            symbols_tex_path = os.path.join(output_sources_path,
                                            "symbol_tex.csv")
            with open(symbols_tex_path, "a",
                      encoding="utf-8") as symbols_tex_file:
                writer = csv.writer(symbols_tex_file, quoting=csv.QUOTE_ALL)
                for annotated_file in result:
                    for symbol_tex in annotated_file.symbol_tex:
                        try:
                            writer.writerow(
                                [annotated_file.tex_path, symbol_tex])
                        except Exception:  # pylint: disable=broad-except
                            logging.warning(
                                "Couldn't write row for annotated line for arXiv %s: can't be converted to utf-8",
                                item.arxiv_id,
                            )
Example #28
    def load(self) -> Iterator[TexAndSymbols]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-annotated-symbols", arxiv_id)
            file_utils.clean_directory(output_root)

            symbols_dir = directories.arxiv_subdir("detected-equation-tokens",
                                                   arxiv_id)
            tokens_path = os.path.join(symbols_dir, "entities.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue
            symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

            tokens = file_utils.load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = {t.tex_path for t in tokens}

            characters: Dict[CharacterId, Character] = {}
            for token in tokens:
                character_id = CharacterId(token.tex_path,
                                           token.equation_index,
                                           token.token_index)
                characters[character_id] = Character(token.text,
                                                     token.token_index,
                                                     token.start, token.end)

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(
                    directories.arxiv_subdir("sources", arxiv_id), tex_path)
                file_contents = file_utils.read_file_tolerant(
                    absolute_tex_path)
                if file_contents is not None:
                    contents_by_file[tex_path] = file_contents

            yield TexAndSymbols(arxiv_id, contents_by_file, symbols,
                                characters)
Example #29
    def save(self, item: MatchTask, result: BibitemMatch) -> None:
        resolutions_dir = directories.arxiv_subdir("bibitem-resolutions",
                                                   item.arxiv_id)
        if not os.path.exists(resolutions_dir):
            os.makedirs(resolutions_dir)

        resolutions_path = os.path.join(resolutions_dir, "resolutions.csv")
        file_utils.append_to_csv(resolutions_path, result)
Example #30
    def load(self) -> Iterator[PageRasterPair]:
        for arxiv_id in self.arxiv_ids:
            output_dir = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_dir)

            # Get output file names from results of compiling the uncolorized TeX sources.
            output_files = get_output_files(
                directories.arxiv_subdir("compiled-sources", arxiv_id))
            if len(output_files) == 0:
                continue

            for iteration in directories.iteration_names(
                    self.get_raster_base_dirkey(), arxiv_id):

                original_images_dir = directories.arxiv_subdir(
                    "paper-images", arxiv_id)
                modified_images_dir = directories.iteration(
                    self.get_raster_base_dirkey(), arxiv_id, iteration)

                for output_file in output_files:
                    relative_file_path = output_file.path
                    original_images_path = os.path.join(
                        original_images_dir, relative_file_path)
                    for img_name in os.listdir(original_images_path):
                        original_img_path = os.path.join(
                            original_images_path, img_name)
                        modified_img_path = os.path.join(
                            modified_images_dir, relative_file_path, img_name)
                        if not os.path.exists(modified_img_path):
                            logging.warning(
                                "Could not find expected image %s. Skipping diff for this paper.",
                                modified_img_path,
                            )
                            break

                        original_img = cv2.imread(original_img_path)
                        modified_img = cv2.imread(modified_img_path)
                        yield PageRasterPair(
                            arxiv_id,
                            iteration,
                            relative_file_path,
                            img_name,
                            original_img,
                            modified_img,
                        )