Example #1
    def load(self) -> Iterator[MatchTask]:
        for arxiv_id in self.arxiv_ids:
            file_utils.clean_directory(
                directories.arxiv_subdir("bibitem-resolutions", arxiv_id))
            bibitems_dir = directories.arxiv_subdir("detected-citations",
                                                    arxiv_id)
            metadata_dir = directories.arxiv_subdir("s2-metadata", arxiv_id)

            references_path = os.path.join(metadata_dir, "references.csv")
            if not os.path.exists(references_path):
                logging.warning(
                    "Could not find %s, skipping reference resolution for paper %s",
                    references_path,
                    arxiv_id,
                )
                continue
            references = list(
                file_utils.load_from_csv(references_path,
                                         SerializableReference))

            bibitems_path = os.path.join(bibitems_dir, "entities.csv")
            if not os.path.exists(bibitems_path):
                logging.warning(
                    "Could not find %s, skipping reference resolution for paper %s",
                    bibitems_path,
                    arxiv_id,
                )
                continue
            bibitems = list(file_utils.load_from_csv(bibitems_path, Bibitem))

            yield MatchTask(arxiv_id, bibitems, references)
Example #2
    def load(self) -> Iterator[CitationData]:
        for arxiv_id in self.arxiv_ids:

            # Load citation locations
            citation_locations = load_located_citations(arxiv_id)
            if citation_locations is None:
                continue

            # Load metadata for bibitems
            key_s2_ids: Dict[CitationKey, S2Id] = {}
            key_resolutions_path = os.path.join(
                directories.arxiv_subdir("bibitem-resolutions", arxiv_id),
                "resolutions.csv",
            )
            if not os.path.exists(key_resolutions_path):
                logging.warning(
                    "Could not find citation resolutions for %s. Skipping",
                    arxiv_id)
                continue
            for resolution in file_utils.load_from_csv(key_resolutions_path,
                                                       BibitemMatch):
                if resolution.key is not None:
                    key_s2_ids[resolution.key] = resolution.s2_id

            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            s2_data: Dict[S2Id, SerializableReference] = {}
            s2_metadata_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id),
                "references.csv")
            if not os.path.exists(s2_metadata_path):
                logging.warning(
                    "Could not find S2 metadata file for citations for %s. Skipping",
                    arxiv_id,
                )
                continue
            for metadata in file_utils.load_from_csv(s2_metadata_path,
                                                     SerializableReference):
                # Convert authors field to comma-delimited list of authors
                author_string = ",".join(
                    [a["name"] for a in ast.literal_eval(metadata.authors)])
                metadata = dataclasses.replace(metadata, authors=author_string)
                s2_data[metadata.s2_id] = metadata

            yield CitationData(
                arxiv_id,
                s2_id,
                citation_locations,
                key_s2_ids,
                s2_data,
            )
Example #3
    def load(self) -> Iterator[PaperProcessingResult]:
        for arxiv_id in self.arxiv_ids:

            # Load the S2 ID for this paper
            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            # Load in all extracted entities. See note in 'colorize_tex.py' for why entities
            # might be saved in multiple files. If they are, for this upload function to work,
            # each of the entities needs to have a unique pair of 'ID' and 'tex_path'.
            entities_dir = directories.arxiv_subdir(
                self.get_detected_entities_dirkey(), arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(
                        entities_path,
                        self.get_detected_entity_type(
                            os.path.basename(entities_path)),
                    ))

            # Load in locations of all detected hues.
            hue_locations_path = os.path.join(
                directories.arxiv_subdir(self.get_hue_locations_dirkey(),
                                         arxiv_id),
                "entity_locations.csv",
            )
            hue_location_infos = list(
                file_utils.load_from_csv(hue_locations_path, HueLocationInfo))

            # Group each entity with its location. Pass the entity information, and the detected
            # locations for the entity, to the upload function.
            localized_entities = []
            for entity in entities:
                matching_locations = []
                for h in hue_location_infos:
                    if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                        matching_locations.append(h)

                localized_entities.append(
                    EntityAndLocation(entity, matching_locations))

            yield PaperProcessingResult(
                arxiv_id=arxiv_id,
                s2_id=s2_id,
                localized_entities=localized_entities,
            )
Example #4
    def load(self) -> Iterator[PaperProcessingResult]:
        for arxiv_id in self.arxiv_ids:

            # Load the S2 ID for this paper
            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            # Load in all extracted entities.
            entities_path = os.path.join(
                directories.arxiv_subdir(self.get_detected_entities_dirkey(),
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_detected_entity_type()))

            # Load in locations of all detected hues.
            hue_locations_path = os.path.join(
                directories.arxiv_subdir(self.get_hue_locations_dirkey(),
                                         arxiv_id),
                "hue_locations.csv",
            )
            hue_location_infos = list(
                file_utils.load_from_csv(hue_locations_path, HueLocationInfo))

            # Group each entity with its location. Pass the entity information, and the detected
            # locations for the entity, to the upload function.
            localized_entities = []
            for entity in entities:
                matching_locations = []
                for h in hue_location_infos:
                    if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                        matching_locations.append(h)

                localized_entities.append(
                    EntityAndLocation(entity, matching_locations))

            yield PaperProcessingResult(
                arxiv_id=arxiv_id,
                s2_id=s2_id,
                localized_entities=localized_entities,
            )
Example #5
    def load(self) -> Iterator[Task]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                f"contexts-for-{self.get_entity_name()}", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load in all extracted entities. See note in 'colorize_tex.py' for why entities
            # might be saved in multiple files. If they are, each of the entities needs to
            # have a unique pair of 'ID' and 'tex_path' for this command to work.
            entities_dir = directories.arxiv_subdir(
                f"detected-{self.get_entity_name()}", arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_entity_type()))

            # Load sentences from file.
            sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv")
            try:
                sentences = list(
                    file_utils.load_from_csv(sentences_path, Sentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there was likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            tex_paths = {e.tex_path for e in entities}
            for tex_path in tex_paths:
                entities_for_file = [
                    e for e in entities if e.tex_path == tex_path
                ]
                sentences_for_file = [
                    s for s in sentences if s.tex_path == tex_path
                ]
                yield Task(arxiv_id, tex_path, entities_for_file,
                           sentences_for_file)
Example #6
    def load(self) -> Iterator[SymbolSentencesTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentences-for-symbols",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            token_sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-equation-tokens",
                                         arxiv_id),
                "entity_sentences.csv",
            )
            if not os.path.exists(token_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Could not find links between sentences and equation tokens at "
                    +
                    "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                    token_sentences_path,
                    arxiv_id,
                )
                continue

            token_sentence_pairs = list(
                file_utils.load_from_csv(token_sentences_path,
                                         EntitySentencePairIds))

            symbols = file_utils.load_symbols(arxiv_id)
            if not symbols:
                continue

            # Filter to only those symbols for which tokens have been detected
            symbols = [s for s in symbols if len(s.symbol.characters) > 0]

            yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)
Example #7
    def load(self) -> Iterator[ColorizationTask]:
        for arxiv_id in self.arxiv_ids:
            output_root = directories.arxiv_subdir(
                self.get_output_base_dirkey(), arxiv_id)
            file_utils.clean_directory(output_root)

            entities_path = os.path.join(
                directories.arxiv_subdir(self.get_detected_entities_dirkey(),
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_detected_entity_type()))

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                entities_for_tex_path = [
                    e for e in entities if e.tex_path == tex_path
                ]
                if file_contents is not None:
                    yield ColorizationTask(arxiv_id, tex_path, file_contents,
                                           entities_for_tex_path)
Example #8
    def load(self) -> Iterator[DetectDefinitionsTask]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("detected-definitions",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load cleaned sentences for definition detection.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("sentence-tokens", arxiv_id),
                "sentences.csv",
            )
            try:
                sentences = list(
                    file_utils.load_from_csv(detected_sentences_path,
                                             EmbellishedSentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detected sentences for this paper.",
                    arxiv_id,
                )
                continue

            # Read in all TeX. Once definition detection is finished, all the TeX will be searched
            # for references to the defined terms.
            tex_by_file = file_utils.read_tex(arxiv_id)

            yield DetectDefinitionsTask(arxiv_id, sentences, tex_by_file)
Example #9
        def load_hues(self, arxiv_id: ArxivId,
                      iteration: str) -> List[HueSearchRegion]:
            hues_path = os.path.join(
                directories.iteration(
                    f"sources-with-colorized-{entity_name}",
                    arxiv_id,
                    iteration,
                ),
                "entity_hues.csv",
            )
            if not os.path.exists(hues_path):
                logging.warning("Could not find any hues at %s", hues_path)
                return []

            searches = []
            for record in file_utils.load_from_csv(hues_path,
                                                   ColorizationRecord):
                searches.append(
                    HueSearchRegion(
                        hue=record.hue,
                        record=record,
                        relative_file_path=None,
                        masks=None,
                    ))

            return searches
Example #10
    def load(self) -> Iterator[ColorizationTask]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-colorized-citations", arxiv_id)
            file_utils.clean_directory(output_root)

            bibitems_path = os.path.join(
                directories.arxiv_subdir("bibitems", arxiv_id), "bibitems.csv")
            if not os.path.exists(bibitems_path):
                logging.warning(
                    "No bibitems were found for paper %s. Skipping", arxiv_id)
                continue

            bibitems = file_utils.load_from_csv(bibitems_path, Bibitem)
            bibitem_keys = [b.key for b in bibitems if b.key is not None]

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                if file_contents is not None:
                    yield ColorizationTask(arxiv_id, tex_path, file_contents,
                                           bibitem_keys)
Example #11
    def load(self) -> Iterator[Task]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                f"contexts-for-{self.get_entity_name()}", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load entities from file.
            entities_path = os.path.join(
                directories.arxiv_subdir(f"detected-{self.get_entity_name()}",
                                         arxiv_id),
                "entities.csv",
            )
            entities = list(
                file_utils.load_from_csv(entities_path,
                                         self.get_entity_type()))

            # Load sentences from file.
            sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv")
            try:
                sentences = list(
                    file_utils.load_from_csv(sentences_path, Sentence))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there was likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            tex_paths = {e.tex_path for e in entities}
            for tex_path in tex_paths:
                entities_for_file = [
                    e for e in entities if e.tex_path == tex_path
                ]
                sentences_for_file = [
                    s for s in sentences if s.tex_path == tex_path
                ]
                yield Task(arxiv_id, tex_path, entities_for_file,
                           sentences_for_file)
Example #12
def count_entities_extracted(arxiv_id: ArxivId) -> Optional[int]:
    """
    Count the bibitems detected for a paper. This is not the same as the number of citation
    commands in the TeX; specifically, it's the number of bibitems which are colorized to
    enable detection of citation locations.
    """
    bibitems_path = os.path.join(
        directories.arxiv_subdir("detected-citations", arxiv_id),
        "entities.csv")
    if not os.path.exists(bibitems_path):
        return None
    return len(list(file_utils.load_from_csv(bibitems_path, Bibitem)))
Example #13
    def load(self) -> Iterator[Locations]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("symbols-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            all_locations: List[EntityLocationInfo] = []
            composite_symbols_path = os.path.join(
                directories.arxiv_subdir("composite-symbols-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if os.path.exists(composite_symbols_path):
                all_locations.extend(
                    file_utils.load_from_csv(composite_symbols_path, EntityLocationInfo)
                )
            else:
                logging.info(
                    "No locations could be found for composite symbols for paper %s.",
                    arxiv_id,
                )

            symbols_with_affixes_path = os.path.join(
                directories.arxiv_subdir("symbols-with-affixes-locations", arxiv_id),
                "entity_locations.csv",
            )
            if os.path.exists(symbols_with_affixes_path):
                all_locations.extend(
                    file_utils.load_from_csv(
                        symbols_with_affixes_path, EntityLocationInfo
                    )
                )
            else:
                logging.info(
                    "No locations could be found for symbols with affixes for paper %s.",
                    arxiv_id,
                )

            yield Locations(arxiv_id, all_locations)
Example #14
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:
            for output_base_dir in self.output_base_dirs.values():
                file_utils.clean_directory(
                    directories.arxiv_subdir(output_base_dir, arxiv_id))

            # A directory of entities may contain files for each of multiple types of entities.
            # One example is that the definition detector detects both terms and definitions.
            # In that case, the colorizer colorizes all entities from all of these files.
            # Earlier entity extractor commands should include enough information in the entity IDs
            # so that the type of entities can be inferred from the entity ID in later commands.
            entities_dir = directories.arxiv_subdir(self.get_input_dirkey(),
                                                    arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_detected_entity_type()))

            main_tex_files = get_compiled_tex_files(
                directories.arxiv_subdir("compiled-normalized-sources",
                                         arxiv_id))
            normalized_sources_path = directories.arxiv_subdir(
                "normalized-sources", arxiv_id)
            for tex_file in main_tex_files:
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(normalized_sources_path, tex_file.path))
                options = self.get_colorize_options()
                entities_for_tex_path = [
                    e for e in entities
                    if e.tex_path == tex_file.path or e.tex_path == "N/A"
                ]
                if options.when is not None:
                    entities_for_tex_path = list(
                        filter(options.when, entities_for_tex_path))
                if file_contents is not None:
                    group_func = options.group or (lambda entities: [entities])
                    for group_index, entity_group in enumerate(
                            group_func(entities_for_tex_path)):
                        yield LocationTask(
                            arxiv_id,
                            tex_file.path,
                            file_contents,
                            entity_group,
                            group_index,
                        )
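The loader above may receive a grouping function through get_colorize_options(); that function splits the entities for a file into groups, and each group is yielded as its own LocationTask. A minimal sketch of such a grouping function, assuming a hypothetical name group_by_batches and an arbitrary batch size (neither appears in the source):

from typing import List, Sequence, TypeVar

T = TypeVar("T")

def group_by_batches(entities: Sequence[T], batch_size: int = 50) -> List[List[T]]:
    # Split entities into fixed-size batches; each batch becomes one
    # colorization group, and therefore one LocationTask, in the loader above.
    return [
        list(entities[i:i + batch_size])
        for i in range(0, len(entities), batch_size)
    ]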
Example #15
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("citation-cluster-locations",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            boxes_by_hue_iteration = file_utils.load_citation_hue_locations(
                arxiv_id)
            if boxes_by_hue_iteration is None:
                continue

            boxes_by_citation_key: Dict[str, List[BoundingBox]] = {}
            for iteration in directories.iteration_names(
                    "sources-with-colorized-citations", arxiv_id):
                citation_hues_path = os.path.join(
                    directories.iteration(
                        "sources-with-colorized-citations",
                        arxiv_id,
                        iteration,
                    ),
                    "entity_hues.csv",
                )
                if not os.path.exists(citation_hues_path):
                    logging.warning(
                        "Could not find citation hue colors for %s iteration %s. Skipping",
                        arxiv_id,
                        iteration,
                    )
                    continue
                for record in file_utils.load_from_csv(citation_hues_path,
                                                       ColorizationRecord):
                    key = record.entity_id
                    if key not in boxes_by_citation_key:
                        boxes_by_citation_key[key] = []
                    hue_iteration = HueIteration(record.hue, iteration)
                    boxes_by_citation_key[key].extend(
                        boxes_by_hue_iteration.get(hue_iteration, []))

            for key, boxes in boxes_by_citation_key.items():
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    citation_key=key,
                    boxes=boxes,
                )
Example #16
File: compile.py  Project: silky/scholarphi
def get_output_files(compiled_tex_dir: str) -> List[OutputFile]:
    """
    Get a list of output files for a directory of compiled TeX.
    """
    compilation_results_dir = os.path.join(compiled_tex_dir,
                                           "compilation_results")
    result_path = os.path.join(compilation_results_dir, "result")
    with open(result_path) as result_file:
        result = result_file.read().strip()
        if result == "True":
            output_files_path = os.path.join(compilation_results_dir,
                                             "output_files.csv")
            output_files = list(
                file_utils.load_from_csv(output_files_path, OutputFile))
            return output_files

    return []
Example #17
def load_located_citations(arxiv_id: ArxivId) -> Optional[Citations]:
    citation_locations: Citations = {}
    citation_locations_path = os.path.join(
        directories.arxiv_subdir("citation-locations", arxiv_id),
        "citation_locations.csv",
    )
    if not os.path.exists(citation_locations_path):
        logging.warning("Could not find citation locations for %s. Skipping", arxiv_id)
        return None

    for location in file_utils.load_from_csv(citation_locations_path, CitationLocation):
        if location.key not in citation_locations:
            citation_locations[location.key] = {}
        if location.cluster_index not in citation_locations[location.key]:
            citation_locations[location.key][location.cluster_index] = set()
        citation_locations[location.key][location.cluster_index].add(location)

    return citation_locations
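A brief usage sketch of the nested mapping returned above, which groups citation locations first by citation key and then by cluster index (the helper name below is hypothetical and not part of the source):

def print_citation_clusters(arxiv_id: ArxivId) -> None:
    # Walk the mapping: citation key -> cluster index -> set of CitationLocation.
    locations = load_located_citations(arxiv_id)
    if locations is None:
        return
    for key, clusters in locations.items():
        for cluster_index, cluster_locations in clusters.items():
            print(key, cluster_index, len(cluster_locations))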
Example #18
def get_output_files(compiled_tex_dir: RelativePath) -> List[OutputFile]:
    " Get a list of output files for a directory of compiled TeX. "
    if _did_compilation_succeed(compiled_tex_dir):
        output_files_path = os.path.join(
            _get_compilation_results_dir(compiled_tex_dir), "output_files.csv"
        )
        if not os.path.exists(output_files_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Although compilation succeeded for TeX compilation in directory %s, no "
                + "output files were produced. Something unexpected must have happened during "
                + "compilation of the TeX.",
                compiled_tex_dir,
            )
            return []
        output_files = list(file_utils.load_from_csv(output_files_path, OutputFile))
        return output_files

    return []
Example #19
def count_detected_entities(
    arxiv_id: ArxivId,
    detected_entities_dirkey: str,
    entities_filename: str = "entities.csv",
) -> Optional[int]:

    num_entities_detected = None
    if directories.registered(detected_entities_dirkey):
        detected_entities_path = os.path.join(
            directories.arxiv_subdir(detected_entities_dirkey, arxiv_id),
            entities_filename,
        )
        if os.path.exists(detected_entities_path):
            num_entities_detected = len(
                list(
                    file_utils.load_from_csv(detected_entities_path,
                                             SerializableEntity)))

    return num_entities_detected
Example #20
def count_hues_located(
    arxiv_id: ArxivId,
    hue_locations_dirkey: str,
    hue_locations_filename: str = "hue_locations.csv",
) -> Optional[int]:

    num_hues_located = None
    if directories.registered(hue_locations_dirkey):
        hue_locations_path = os.path.join(
            directories.arxiv_subdir(hue_locations_dirkey, arxiv_id),
            hue_locations_filename,
        )
        if os.path.exists(hue_locations_path):
            num_hues_located = len(
                list(
                    file_utils.load_from_csv(hue_locations_path,
                                             HueLocationInfo)))

    return num_hues_located
Example #21
    def load(self) -> Iterator[Task]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentence-tokens", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load symbols, for use in embellishing equations.
            symbols: Dict[str, List[Symbol]] = defaultdict(list)
            symbol_data = file_utils.load_symbols(arxiv_id)
            if symbol_data is not None:
                for id_, symbol in symbol_data:
                    symbols[id_.tex_path].append(symbol)
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No symbol data found for arXiv ID %s. It will not be " +
                    "possible to expand equations in sentences with symbol data. This should only "
                    +
                    "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load sentences.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv",
            )
            if not os.path.exists(detected_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            sentences = file_utils.load_from_csv(detected_sentences_path,
                                                 Sentence)
            for sentence in sentences:
                yield Task(arxiv_id, sentence, symbols[sentence.tex_path])
Example #22
def get_compiled_tex_files(compiled_tex_dir: RelativePath) -> List[CompiledTexFile]:
    " Get a list of TeX files that were successfully compiled. "
    if _did_compilation_succeed(compiled_tex_dir):
        compiled_tex_files_path = os.path.join(
            _get_compilation_results_dir(compiled_tex_dir), "compiled_tex_files.csv"
        )
        if not os.path.exists(compiled_tex_files_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Although compilation succeeded for TeX compilation in directory %s, no "
                + "specific TeX files were logged as having been compiled. Something "
                + "unexpected must have happened during compilation of the TeX.",
                compiled_tex_dir,
            )
            return []
        compiled_tex_files = list(
            file_utils.load_from_csv(compiled_tex_files_path, CompiledTexFile)
        )
        return compiled_tex_files

    return []
Example #23
    def load(self) -> Iterator[PaperTokens]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("annotation-files", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load tokens.
            tokens_path = os.path.join(
                directories.arxiv_subdir("sentence-tokens", arxiv_id),
                "tokens.csv",
            )
            try:
                tokens = list(file_utils.load_from_csv(tokens_path, Token))
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No tokens data found for arXiv paper %s. No annotation files will be "
                    + "generated for this paper.",
                    arxiv_id,
                )
                continue

            yield PaperTokens(arxiv_id, tokens)
Example #24
File: utils.py  Project: z-314/scholarphi
def load_located_citations(arxiv_id: ArxivId) -> Optional[Citations]:
    citation_locations: Citations = {}
    citation_locations_path = os.path.join(
        directories.arxiv_subdir("citations-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(citation_locations_path):
        logging.warning("Could not find citation locations for %s. Skipping", arxiv_id)
        return None

    for location in file_utils.load_from_csv(
        citation_locations_path, EntityLocationInfo
    ):
        id_tokens = location.entity_id.rsplit("-", maxsplit=1)
        key = id_tokens[0]
        cluster_index = int(id_tokens[1])
        if key not in citation_locations:
            citation_locations[key] = {}
        if cluster_index not in citation_locations[key]:
            citation_locations[key][cluster_index] = set()
        citation_locations[key][cluster_index].add(location)

    return citation_locations
Example #25
    def load(self) -> Iterator[LocationTask]:

        entity_name = self.get_entity_name()
        for arxiv_id in self.arxiv_ids:
            for output_base_dir in self.output_base_dirs.values():
                file_utils.clean_directory(
                    directories.arxiv_subdir(output_base_dir, arxiv_id))

            # A directory of entities may contain files for each of multiple types of entities.
            # One example is that the definition detector detects both terms and definitions.
            # In that case, the colorizer colorizes all entities from all of these files.
            # Earlier entity extractor commands should include enough information in the entity IDs
            # so that the type of entities can be inferred from the entity ID in later commands.
            entities_dir = directories.arxiv_subdir(f"detected-{entity_name}",
                                                    arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(entities_path,
                                             self.get_detected_entity_type()))

            original_sources_path = directories.arxiv_subdir(
                "sources", arxiv_id)
            for tex_path in file_utils.find_files(original_sources_path,
                                                  [".tex"],
                                                  relative=True):
                file_contents = file_utils.read_file_tolerant(
                    os.path.join(original_sources_path, tex_path))
                entities_for_tex_path = [
                    e for e in entities
                    if e.tex_path == tex_path or e.tex_path == "N/A"
                ]
                if file_contents is not None:
                    yield LocationTask(arxiv_id, tex_path, file_contents,
                                       entities_for_tex_path)
Example #26
def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:

    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    entity_infos: List[EntityUploadInfo] = []

    # Load MathML matches for partial matching of symbols.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", processing_summary.arxiv_id),
        "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols.
    children: Dict[SymbolId, List[SymbolId]] = defaultdict(list)
    parents: Dict[SymbolId, SymbolId] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for parent in file_utils.load_from_csv(children_path, SerializableChild):
            pid = f"{parent.tex_path}-{parent.equation_index}-{parent.symbol_index}"
            cid = f"{parent.tex_path}-{parent.equation_index}-{parent.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbol to their children for paper %s.",
            arxiv_id,
        )

    # Load contexts that the symbols appear in. Group them by the symbol's MathML.
    context_data_missing = False
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id), "contexts.csv",
    )
    if not os.path.exists(contexts_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Contexts have not been found for symbols for arXiv paper %s. "
            + "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )
        context_data_missing = True

    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    if not context_data_missing:
        for context in file_utils.load_from_csv(contexts_path, Context):
            tex_path = context.tex_path
            symbol_id = f"{tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)

    # Prepare collections of formulae that each symbol was found in.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    entity_infos = []
    for localized_entity in processing_summary.entities:

        symbol = cast(SerializableSymbol, localized_entity.entity)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get context and formula of the symbol, and other matching ones.
        symbol_context = symbol_contexts.get(sid(symbol))
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol.
        tags: List[str] = []
        MAX_BOX_HEIGHT = 0.1
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    + "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            "mathml_near_matches": [m.matching_mathml for m in matches[symbol.mathml]],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )

        parent_id = parents.get(sid(symbol))
        child_ids = children.get(sid(symbol), [])

        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation", id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [EntityReference(type_="symbol", id_=id_) for id_ in child_ids],
            "sentence": EntityReference(type_="sentence", id_=sentence_id)
            if sentence_id is not None
            else EntityReference(type_="sentence", id_=None),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_) for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol
        entity_information = EntityUploadInfo(
            id_=sid(symbol),
            type_="symbol",
            bounding_boxes=boxes,
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(
        processing_summary.s2_id, arxiv_id, entity_infos, data_version,
    )
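The sid helper used throughout upload_symbols is not shown in this snippet. Judging from how the parent and child ids are built from symbol_children.csv above, it is assumed to return a string of the form '<tex_path>-<equation_index>-<symbol_index>'; a sketch under that assumption:

def sid(symbol: SerializableSymbol) -> str:
    # Assumed key format, chosen to match the pid/cid strings built from
    # symbol_children.csv so that parents.get(sid(symbol)) and
    # children.get(sid(symbol)) line up with those entries.
    return f"{symbol.tex_path}-{symbol.equation_index}-{symbol.symbol_index}"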
Example #27
    def load(self) -> Iterator[Task]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("embellished-sentences",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load equation data.
            equations: Equations = {}
            equations_path = os.path.join(
                directories.arxiv_subdir("detected-equations", arxiv_id),
                "entities.csv")
            try:
                equation_data = file_utils.load_from_csv(
                    equations_path, Equation)
                for equation in equation_data:
                    equations[(equation.tex_path,
                               int(equation.id_))] = equation
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No equation data found for arXiv ID %s. It will not be " +
                    "possible to expand equations in sentences with symbol data. This should only "
                    +
                    "be a problem if it's expected that there are no equations in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load symbols, for use in embellishing equations.
            symbols: Symbols = defaultdict(list)
            symbol_data = file_utils.load_symbols(arxiv_id)
            if symbol_data is not None:
                for id_, symbol in symbol_data:
                    symbols[(id_.tex_path, id_.equation_index)].append(symbol)
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No symbol data found for arXiv ID %s. It will not be " +
                    "possible to expand equations in sentences with symbol data. This should only "
                    +
                    "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load sentences.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv",
            )
            try:
                sentences = file_utils.load_from_csv(detected_sentences_path,
                                                     Sentence)
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                    +
                    "this time enabling the processing of sentences. If that doesn't work, "
                    +
                    "there is likely an error in detcting sentences for this paper.",
                    arxiv_id,
                )
                continue

            for sentence in sentences:
                yield Task(arxiv_id, sentence, equations, symbols)
Example #28
    def load(self) -> Iterator[PaperProcessingResult]:
        for arxiv_id in self.arxiv_ids:

            # Load the S2 ID for this paper
            s2_id_path = os.path.join(
                directories.arxiv_subdir("s2-metadata", arxiv_id), "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

            # Load in all extracted entities. See note in 'colorize_tex.py' for why entities
            # might be saved in multiple files. If they are, for this upload function to work,
            # each of the entities needs to have a unique pair of 'ID' and 'tex_path'.
            entities_dir = directories.arxiv_subdir(
                f"detected-{self.get_entity_name()}", arxiv_id)
            entities: List[SerializableEntity] = []
            for entities_path in glob.glob(
                    os.path.join(entities_dir, "entities*.csv")):
                entities.extend(
                    file_utils.load_from_csv(
                        entities_path,
                        self.get_detected_entity_type(
                            os.path.basename(entities_path)),
                    ))

            # Load locations for entities.
            locations_path = os.path.join(
                directories.arxiv_subdir(f"{self.get_entity_name()}-locations",
                                         arxiv_id),
                "entity_locations.csv",
            )
            if not os.path.exists(locations_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No locations have been saved for entities in command '%s' for paper %s. No entities "
                    + "will be uploaded for this paper.",
                    str(self.get_name()),
                    arxiv_id,
                )
                continue
            entity_location_infos = list(
                file_utils.load_from_csv(locations_path, EntityLocationInfo))

            # Load in contexts for all entities.
            contexts_loaded = False
            contexts_by_entity = {}
            if directories.registered(
                    f"contexts-for-{self.get_entity_name()}"):
                contexts_path = os.path.join(
                    directories.arxiv_subdir(
                        f"contexts-for-{self.get_entity_name()}", arxiv_id),
                    "contexts.csv",
                )
                if os.path.exists(contexts_path):
                    contexts = file_utils.load_from_csv(contexts_path, Context)
                    contexts_by_entity = {c.entity_id: c for c in contexts}
                    contexts_loaded = True

            if not contexts_loaded:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No contexts have been saved for entities in command '%s' for paper %s. No "
                    + "contexts will be saved for any of these entities.",
                    str(self.get_name()),
                    arxiv_id,
                )

            # Group each entity with its location and context. Then pass all entity information to
            # the upload function.
            entity_summaries = []
            for entity in entities:
                matching_locations = []
                for h in entity_location_infos:
                    if h.entity_id == entity.id_ and h.tex_path == entity.tex_path:
                        matching_locations.append(h)

                entity_summaries.append(
                    EntityExtractionResult(entity, matching_locations,
                                           contexts_by_entity.get(entity.id_)))

            yield PaperProcessingResult(
                arxiv_id=arxiv_id,
                s2_id=s2_id,
                entities=entity_summaries,
            )
Example #29
    def load(self) -> Iterator[SymbolData]:
        for arxiv_id in self.arxiv_ids:

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            symbols_by_id = {s.symbol_id: s.symbol for s in symbols_with_ids}

            boxes: Dict[SymbolId, BoundingBox] = {}
            boxes_path = os.path.join(
                directories.arxiv_subdir("symbol-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                symbol_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                box = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )
                boxes[symbol_id] = box

            matches: Matches = {}
            matches_path = os.path.join(
                directories.arxiv_subdir("symbol-matches", arxiv_id),
                "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for match in file_utils.load_from_csv(matches_path, Match):
                if match.queried_mathml not in matches:
                    matches[match.queried_mathml] = []
                matches[match.queried_mathml].append(match)

            context_data_missing = False
            contexts_path = os.path.join(
                directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
                "contexts.csv",
            )
            if not os.path.exists(contexts_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Contexts have not been found for symbols for arXiv paper %s. "
                    + "Symbol data will be uploaded without contexts.",
                    arxiv_id,
                )
                context_data_missing = True

            symbol_contexts = {}
            mathml_contexts = defaultdict(list)
            if not context_data_missing:
                for context in file_utils.load_from_csv(
                        contexts_path, Context):
                    tex_path = context.tex_path
                    equation_index, symbol_index = [
                        int(t) for t in context.entity_id.split("-")
                    ]
                    symbol_id = SymbolId(tex_path, equation_index,
                                         symbol_index)
                    symbol_contexts[symbol_id] = context
                    symbol = symbols_by_id[symbol_id]
                    mathml_contexts[symbol.mathml].append(context)

            symbol_formulas = {}
            mathml_formulas = defaultdict(set)
            for id_, symbol in symbols_by_id.items():
                if (symbol.is_definition and symbol.equation is not None
                        and symbol.relative_start is not None
                        and symbol.relative_end is not None):
                    highlighted = wrap_span(
                        symbol.equation,
                        symbol.relative_start,
                        symbol.relative_end,
                        before=r"\htmlClass{match-highlight}{",
                        after="}",
                        braces=True,
                    )
                    formula = DefiningFormula(
                        tex=highlighted,
                        tex_path=id_.tex_path,
                        equation_id=id_.equation_index,
                    )
                    symbol_formulas[id_] = formula
                    mathml_formulas[symbol.mathml].add(formula)

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_contexts,
                symbol_formulas,
                mathml_contexts,
                mathml_formulas,
                matches,
            )
Example #30
File: upload.py  Project: silky/scholarphi
def upload_terms(processing_summary: PaperProcessingResult,
                 data_version: Optional[int]) -> None:

    arxiv_id = processing_summary.arxiv_id
    contexts = file_utils.load_from_csv(
        os.path.join(
            directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
            "contexts.csv",
        ),
        Context,
    )
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        if (term.tex_path, term.id_) in contexts_by_entity:
            contexts_by_term[term.text].append(
                contexts_by_entity[(term.tex_path, term.id_)])

    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]

        # Cluster bounding boxes, in case any of these terms are defined as a macro (in which
        # case all appearances of that term on the same page will have been lumped together).
        clusters = cluster_boxes(boxes, vertical_split=0.005)
        for i, cluster in enumerate(clusters):
            entity_info = EntityInformation(
                id_=f"{term.tex_path}-{term.id_}-{i}",
                type_="term",
                bounding_boxes=list(cluster),
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definitions,
                    "sources": term.sources,
                    "snippets": [c.snippet for c in contexts_by_term.get(term.text, [])],
                },
                relationships={
                    "sentence":
                    EntityReference(
                        type_="sentence",
                        id_=f"{context.tex_path}-{context.sentence_id}"
                        if context is not None else None,
                    ),
                    "snippet_sentences": [
                        EntityReference(type_="sentence",
                                        id_=f"{c.tex_path}-{c.sentence_id}")
                        for c in contexts_by_term.get(term.text, [])
                    ],
                },
            )
            entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )