Example #1
    def load(self) -> Iterator[SymbolSentencesTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentences-for-symbols",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            token_sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-equation-tokens",
                                         arxiv_id),
                "entity_sentences.csv",
            )
            if not os.path.exists(token_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Could not find links between sentences and equation tokens at "
                    + "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                    token_sentences_path,
                    arxiv_id,
                )
                continue

            token_sentence_pairs = list(
                file_utils.load_from_csv(token_sentences_path,
                                         EntitySentencePairIds))

            symbols = file_utils.load_symbols(arxiv_id)
            if not symbols:
                continue

            # Filter to only those symbols for which tokens have been detected
            symbols = [s for s in symbols if len(s.symbol.characters) > 0]

            yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)
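The `EntitySentencePairIds` record and `file_utils.load_from_csv` are project-internal. As a rough sketch of the pattern, assuming the record is a `NamedTuple` whose field names match the CSV header (the field names below are guesses from how the pairs are used in the later examples):

import csv
from typing import Iterator, NamedTuple, Type, TypeVar

class EntitySentencePairIds(NamedTuple):
    # Hypothetical fields, inferred from the entity_id / sentence_id usage
    # in the later examples.
    tex_path: str
    entity_id: str
    sentence_id: str

R = TypeVar("R", bound=tuple)

def load_from_csv(path: str, record_type: Type[R]) -> Iterator[R]:
    # Map each row's named CSV columns onto the record type's fields.
    with open(path, encoding="utf-8") as csv_file:
        for row in csv.DictReader(csv_file):
            yield record_type(**{f: row[f] for f in record_type._fields})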
Example #2
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir(
                "composite-symbols-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            token_locations = file_utils.load_equation_token_locations(
                arxiv_id)
            if token_locations is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for symbol_with_id in symbols_with_ids:
                # Symbols with affixes (e.g., arrows, hats) cannot be localized by taking the union
                # of their tokens' bounding boxes, because the bounding boxes of affix tokens
                # cannot be detected on their own.
                if not symbol_with_id.symbol.contains_affix:
                    yield LocationTask(
                        arxiv_id=arxiv_id,
                        token_locations=token_locations,
                        symbol_with_id=symbol_with_id,
                    )
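As the comment above notes, an affix-free symbol can be located by taking the union of its tokens' bounding boxes. A minimal sketch of that union over a hypothetical Box type (the pipeline's own BoundingBox also carries a page number, so a real implementation would group boxes per page first):

from dataclasses import dataclass
from typing import Iterable

@dataclass(frozen=True)
class Box:
    left: float
    top: float
    width: float
    height: float

def union(boxes: Iterable[Box]) -> Box:
    # Smallest rectangle covering every token box on one page.
    box_list = list(boxes)
    left = min(b.left for b in box_list)
    top = min(b.top for b in box_list)
    right = max(b.left + b.width for b in box_list)
    bottom = max(b.top + b.height for b in box_list)
    return Box(left, top, right - left, bottom - top)

This is also why affix symbols are excluded: an arrow or hat has no detected token box of its own, so the union would systematically miss it.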
Example #3
    def load(self) -> Iterator[MathMLForPaper]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("symbol-matches", arxiv_id)
            file_utils.clean_directory(output_dir)

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            symbols_mathml = {swi.symbol.mathml for swi in symbols_with_ids}

            yield MathMLForPaper(arxiv_id=arxiv_id,
                                 mathml_equations=symbols_mathml)
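The set comprehension collapses repeated symbols to their distinct MathML strings, so downstream matching runs once per symbol form rather than once per occurrence. A miniature with hypothetical MathML values:

# Two x's and one y collapse to two distinct MathML strings.
mathmls = ["<mi>x</mi>", "<mi>y</mi>", "<mi>x</mi>"]
assert {m for m in mathmls} == {"<mi>x</mi>", "<mi>y</mi>"}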
Example #4
    def load(self) -> Iterator[TexAndSymbols]:
        for arxiv_id in self.arxiv_ids:

            output_root = directories.arxiv_subdir(
                "sources-with-annotated-symbols", arxiv_id)
            file_utils.clean_directory(output_root)

            symbols_dir = directories.arxiv_subdir("detected-equation-tokens",
                                                   arxiv_id)
            tokens_path = os.path.join(symbols_dir, "entities.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue
            symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

            tokens = file_utils.load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = {t.tex_path for t in tokens}

            characters: Dict[CharacterId, Character] = {}
            for token in tokens:
                character_id = CharacterId(token.tex_path,
                                           token.equation_index,
                                           token.token_index)
                characters[character_id] = Character(token.text,
                                                     token.token_index,
                                                     token.start, token.end)

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(
                    directories.arxiv_subdir("sources", arxiv_id), tex_path)
                file_contents = file_utils.read_file_tolerant(
                    absolute_tex_path)
                if file_contents is not None:
                    contents_by_file[tex_path] = file_contents

            yield TexAndSymbols(arxiv_id, contents_by_file, symbols,
                                characters)
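`file_utils.read_file_tolerant` is project-internal; one plausible implementation, assuming "tolerant" means trying a few encodings because arXiv sources are not reliably UTF-8 (the fallback list here is an assumption):

from typing import Optional

def read_file_tolerant(path: str) -> Optional[str]:
    # Try common encodings in order; latin-1 accepts any byte sequence,
    # so it serves as a last resort rather than failing outright.
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(path, encoding=encoding) as file_:
                return file_.read()
        except UnicodeDecodeError:
            continue
        except FileNotFoundError:
            return None
    return None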
Example #5
    def load(self) -> Iterator[Task]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("sentence-tokens", arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load symbols, for use in embellishing equations.
            symbols: Dict[str, List[Symbol]] = defaultdict(list)
            symbol_data = file_utils.load_symbols(arxiv_id)
            if symbol_data is not None:
                for id_, symbol in symbol_data:
                    symbols[id_.tex_path].append(symbol)
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No symbol data found for arXiv ID %s. It will not be "
                    + "possible to expand equations in sentences with symbol data. This should only "
                    + "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load sentences.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv",
            )
            if not os.path.exists(detected_sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentence data found for arXiv paper %s. Try re-running the pipeline, "
                    + "this time enabling the processing of sentences. If that doesn't work, "
                    + "there is likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            sentences = file_utils.load_from_csv(detected_sentences_path,
                                                 Sentence)
            for sentence in sentences:
                yield Task(arxiv_id, sentence, symbols[sentence.tex_path])
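Because `symbols` is a `defaultdict(list)`, the final yield is safe even when no symbol data was loaded: indexing an unseen `tex_path` returns an empty list rather than raising `KeyError`. A two-line demonstration:

from collections import defaultdict

symbols = defaultdict(list)
assert symbols["main.tex"] == []  # unseen path: empty list, no KeyError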
Example #6
    def load(self) -> Iterator[LocationTask]:

        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("symbol-locations", arxiv_id)
            file_utils.clean_directory(output_dir)

            token_locations = file_utils.load_equation_token_locations(
                arxiv_id)
            if token_locations is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for symbol_with_id in symbols_with_ids:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    token_locations=token_locations,
                    symbol_with_id=symbol_with_id,
                )
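`LocationTask` is defined elsewhere; judging only from the keyword arguments at the yield, it plausibly has the shape below. The field types are assumptions, with `Any` standing in for the project's own classes:

from typing import Any, NamedTuple

class LocationTask(NamedTuple):
    arxiv_id: str
    token_locations: Any  # assumed: map from token id to its bounding boxes
    symbol_with_id: Any  # assumed: the project's (symbol id, symbol) pair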
Example #7
    def load(self) -> Iterator[Task]:
        for arxiv_id in self.arxiv_ids:

            output_dir = directories.arxiv_subdir("embellished-sentences",
                                                  arxiv_id)
            file_utils.clean_directory(output_dir)

            # Load equation data.
            equations: Equations = {}
            equations_path = os.path.join(
                directories.arxiv_subdir("detected-equations", arxiv_id),
                "entities.csv")
            try:
                equation_data = file_utils.load_from_csv(
                    equations_path, Equation)
                for equation in equation_data:
                    equations[(equation.tex_path,
                               int(equation.id_))] = equation
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No equation data found for arXiv ID %s. It will not be "
                    + "possible to expand equations in sentences with symbol data. This should only "
                    + "be a problem if it's expected that there are no equations in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load symbols, for use in embellishing equations.
            symbols: Symbols = defaultdict(list)
            symbol_data = file_utils.load_symbols(arxiv_id)
            if symbol_data is not None:
                for id_, symbol in symbol_data:
                    symbols[(id_.tex_path, id_.equation_index)].append(symbol)
            else:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No symbol data found for arXiv ID %s. It will not be "
                    + "possible to expand equations in sentences with symbol data. This should only "
                    + "be a problem if it's expected that there are no symbols in paper %s.",
                    arxiv_id,
                    arxiv_id,
                )

            # Load sentences.
            detected_sentences_path = os.path.join(
                directories.arxiv_subdir("detected-sentences", arxiv_id),
                "entities.csv",
            )
            try:
                sentences = file_utils.load_from_csv(detected_sentences_path,
                                                     Sentence)
            except FileNotFoundError:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "No sentence data found for arXiv paper %s. Try re-running the pipeline, "
                    + "this time enabling the processing of sentences. If that doesn't work, "
                    + "there is likely an error in detecting sentences for this paper.",
                    arxiv_id,
                )
                continue

            for sentence in sentences:
                yield Task(arxiv_id, sentence, equations, symbols)
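The `Equations` and `Symbols` aliases are imported from elsewhere; given how the two maps are keyed above, they plausibly expand to the following (the value types are forward references to the project's own classes):

from typing import Dict, List, Tuple

# Keys are (tex_path, equation index), matching the lookups above.
Equations = Dict[Tuple[str, int], "Equation"]
Symbols = Dict[Tuple[str, int], List["Symbol"]]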
Example #8
    def load(self) -> Iterator[SymbolData]:
        for arxiv_id in self.arxiv_ids:

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            symbols_by_id = {s.symbol_id: s.symbol for s in symbols_with_ids}

            boxes: Dict[SymbolId, BoundingBox] = {}
            boxes_path = os.path.join(
                directories.arxiv_subdir("symbol-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                symbol_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                box = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )
                boxes[symbol_id] = box

            matches: Matches = {}
            matches_path = os.path.join(
                directories.arxiv_subdir("symbol-matches", arxiv_id),
                "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for match in file_utils.load_from_csv(matches_path, Match):
                if match.queried_mathml not in matches:
                    matches[match.queried_mathml] = []
                matches[match.queried_mathml].append(match)

            context_data_missing = False
            contexts_path = os.path.join(
                directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
                "contexts.csv",
            )
            if not os.path.exists(contexts_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Contexts have not been found for symbols for arXiv paper %s. "
                    + "Symbol data will be uploaded without contexts.",
                    arxiv_id,
                )
                context_data_missing = True

            symbol_contexts = {}
            mathml_contexts = defaultdict(list)
            if not context_data_missing:
                for context in file_utils.load_from_csv(
                        contexts_path, Context):
                    tex_path = context.tex_path
                    equation_index, symbol_index = [
                        int(t) for t in context.entity_id.split("-")
                    ]
                    symbol_id = SymbolId(tex_path, equation_index,
                                         symbol_index)
                    symbol_contexts[symbol_id] = context
                    symbol = symbols_by_id[symbol_id]
                    mathml_contexts[symbol.mathml].append(context)

            symbol_formulas = {}
            mathml_formulas = defaultdict(set)
            for id_, symbol in symbols_by_id.items():
                if (symbol.is_definition and symbol.equation is not None
                        and symbol.relative_start is not None
                        and symbol.relative_end is not None):
                    highlighted = wrap_span(
                        symbol.equation,
                        symbol.relative_start,
                        symbol.relative_end,
                        before=r"\htmlClass{match-highlight}{",
                        after="}",
                        braces=True,
                    )
                    formula = DefiningFormula(
                        tex=highlighted,
                        tex_path=id_.tex_path,
                        equation_id=id_.equation_index,
                    )
                    symbol_formulas[id_] = formula
                    mathml_formulas[symbol.mathml].add(formula)

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_contexts,
                symbol_formulas,
                mathml_contexts,
                mathml_formulas,
                matches,
            )
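`wrap_span` is project-internal. One plausible reading of its call site is that it splices `before` and `after` around the character span `[start, end)` of the TeX, with `braces=True` additionally wrapping the span in braces so it stays a single TeX group; the body below is an assumption, not the project's implementation:

def wrap_span(
    tex: str, start: int, end: int, before: str, after: str, braces: bool = False
) -> str:
    # Splice the markers around tex[start:end], optionally bracing the span.
    span = tex[start:end]
    if braces:
        span = "{" + span + "}"
    return tex[:start] + before + span + after + tex[end:]

# E.g. wrap_span("x = y", 0, 1, r"\htmlClass{match-highlight}{", "}", braces=True)
# yields r"\htmlClass{match-highlight}{{x}} = y".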
Example #9
    def load(self) -> Iterator[SymbolData]:
        for arxiv_id in self.arxiv_ids:

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            boxes: Dict[SymbolId, BoundingBox] = {}
            boxes_path = os.path.join(
                directories.arxiv_subdir("symbol-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                symbol_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                box = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )
                boxes[symbol_id] = box

            matches: Matches = {}
            matches_path = os.path.join(
                directories.arxiv_subdir("symbol-matches", arxiv_id),
                "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for match in file_utils.load_from_csv(matches_path, Match):
                if match.queried_mathml not in matches:
                    matches[match.queried_mathml] = []
                matches[match.queried_mathml].append(match)

            sentence_data_missing = False
            sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-symbols", arxiv_id),
                "entity_sentences.csv",
            )
            if not os.path.exists(sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Symbols for arXiv paper %s have not been aligned to sentences. "
                    +
                    "Symbol data will be uploaded without links to sentences",
                    arxiv_id,
                )
                sentence_data_missing = True

            # Initialize up front so the yield below is safe even when
            # sentence data is missing.
            symbol_sentences: Dict[SymbolId, SentenceKey] = {}
            if not sentence_data_missing:
                for pair in file_utils.load_from_csv(sentences_path,
                                                     EntitySentencePairIds):
                    tex_path = pair.tex_path
                    equation_index, symbol_index = [
                        int(t) for t in pair.entity_id.split("-")
                    ]
                    sentence_key = SentenceKey(pair.tex_path, pair.sentence_id)
                    symbol_id = SymbolId(tex_path, equation_index,
                                         symbol_index)
                    symbol_sentences[symbol_id] = sentence_key

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_sentences,
                matches,
            )
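Both this loader and the contexts loader in Example #8 split compound entity ids of the form "<equation index>-<symbol index>". A tiny helper capturing that convention (the id format is inferred from the two call sites):

from typing import Tuple

def parse_symbol_entity_id(entity_id: str) -> Tuple[int, int]:
    # Ids like "3-12" encode (equation index, symbol index).
    equation_index, symbol_index = (int(t) for t in entity_id.split("-"))
    return equation_index, symbol_index

assert parse_symbol_entity_id("3-12") == (3, 12)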
Example #10
def upload_symbol_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    " Upload symbols and their definitions. "

    # Associate definitions with symbols as follows:
    # Definitions will be associated with entire equations as per the current implementation
    # of the definition detector. Conservatively, associate a definition for an equation
    # with a single symbol only if that symbol is the *only* top-level symbol in that equation.

    # Load symbols from files. Group symbols by equation to make it easy to detect whether a
    # symbol is the only top-level symbol in the equation.
    symbols_by_equation: Dict[
        Tuple[TexPath, EquationIndex], List[Symbol]
    ] = defaultdict(list)
    symbols: List[Symbol] = []

    symbols_with_ids = file_utils.load_symbols(processing_summary.arxiv_id)
    if symbols_with_ids is None:
        logging.info(  # pylint: disable=logging-not-lazy
            "No symbols were loaded for paper %s. Therefore, no definitions for symbols "
            + "will be uploaded for this paper.",
            processing_summary.arxiv_id,
        )
        return

    for _, symbol in symbols_with_ids:
        symbols_by_equation[symbol.tex_path, symbol.equation_index].append(symbol)
        symbols.append(symbol)

    # Group symbols by their MathML. These groups will be used to propagate definitions from
    # one defined symbol to all other appearances of that symbol.
    symbols_by_mathml: Dict[MathML, List[Symbol]] = defaultdict(list)
    for symbol in symbols:
        symbols_by_mathml[symbol.mathml].append(symbol)

    # Construct map from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Fetch rows for all entities for this paper that have already been uploaded to the database.
    # This allows lookup of the row IDs for the sentences that contain definitions of symbols.
    entity_models = fetch_entity_models(processing_summary.s2_id, data_version)

    # Create a list of rows to insert into the database containing definition data.
    entity_data_models: List[EntityDataModel] = []
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        if not entity.id_.startswith("definiendum"):
            continue

        # Attempt to match definienda (defined terms) to symbols that are being defined.
        definiendum = cast(Definiendum, entity)
        defined_symbol = None
        for symbol in symbols:
            # Skip definienda that are not symbols.
            if definiendum.type_ != "symbol":
                continue
            # Does the symbol fall within the range of characters being defined?
            if symbol.start < definiendum.start or symbol.end > definiendum.end:
                continue
            # Is the symbol a top-level symbol?
            if symbol.parent is not None:
                continue
            # Is it the *only* top-level symbol in its equation?
            top_level_symbols_in_equation = filter(
                lambda s: s.parent is None,
                symbols_by_equation[(symbol.tex_path, symbol.equation_index)],
            )
            if len(list(top_level_symbols_in_equation)) > 1:
                continue

            defined_symbol = symbol
            logging.debug(  # pylint: disable=logging-not-lazy
                "Matched definiedum %s at position (%d, %d) to symbol %s at position "
                + "(%s, %s) for paper %s. A definition for this symbol will be uploaded.",
                definiendum.tex,
                definiendum.start,
                definiendum.end,
                symbol.tex,
                symbol.start,
                symbol.end,
                processing_summary.arxiv_id,
            )
            break

        if defined_symbol is None:
            continue

        # Assemble data about definitions for the symbol.
        definitions = definiendum.definitions
        definition_texs = definiendum.definition_texs
        sources = definiendum.sources
        definition_sentence_ids: List[Optional[str]] = []
        for definition_id in definiendum.definition_ids:
            context = contexts_by_definition.get(definition_id)
            if context is None:
                definition_sentence_ids.append(None)
            else:
                definition_sentence_ids.append(
                    f"{context.tex_path}-{context.sentence_id}"
                )

        # Find all symbols that are the same (i.e., that have the same MathML representation).
        # Then save definition data so that it applies to all of those symbols.
        matching_symbols = symbols_by_mathml.get(defined_symbol.mathml)
        if matching_symbols is not None:
            for s in matching_symbols:
                entity_model = entity_models.get(("symbol", sid(s)))
                data: EntityData = {
                    "definitions": definitions,
                    "definition_texs": definition_texs,
                    "sources": sources,
                }
                entity_data_models.extend(make_data_models(None, entity_model, data))

                relationships: EntityRelationships = {
                    "definition_sentences": [
                        EntityReference(type_="sentence", id_=id_)
                        for id_ in definition_sentence_ids
                    ],
                }
                entity_data_models.extend(
                    make_relationship_models(
                        ("symbol", sid(s)), relationships, entity_models
                    )
                )

    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)
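The final `bulk_create(entity_data_models, 200)` call (peewee-style, with 200 as the batch size) inserts the rows in chunks so no single INSERT exceeds the database's parameter limits. A standalone sketch of the chunking this implies:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def batches(items: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield successive chunks of at most `size` items.
    iterator = iter(items)
    while chunk := list(islice(iterator, size)):
        yield chunk

assert list(batches(range(5), 2)) == [[0, 1], [2, 3], [4]]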