def test_extract_phrases_from_formatted_text():
    extractor = PhraseExtractor(["two-token phrase"])
    phrases = list(
        extractor.parse(
            "main.tex",
            r"In this \textbf{two-token phrase}, something happens."))
    assert len(phrases) == 1
def test_extract_phrases_starting_with_symbol():
    # This example is from arXiv paper 1811.11889.
    extractor = PhraseExtractor(["+D&M"])
    phrases = list(
        extractor.parse("main.tex", r"This sentence contains +D\&M."))
    assert len(phrases) == 1
    assert phrases[0].text == "+D&M"
Example #3
def test_extract_phrases_containing_ampersands():
    # This example is from arXiv paper 1811.11889.
    extractor = PhraseExtractor(["D&M"])
    phrases = list(extractor.parse("main.tex", r"This sentence contains D\&M."))
    assert len(phrases) == 1
    assert phrases[0].text == "D&M"
    assert phrases[0].tex == r"D\&M"
def test_extract_phrases():
    extractor = PhraseExtractor(["word", "two-token phrase"])
    phrases = list(
        extractor.parse("main.tex",
                        "This sentence contains word and a two-token phrase."))

    phrase1 = phrases[0]
    assert phrase1.start == 23
    assert phrase1.end == 27
    assert phrase1.text == "word"

    phrase2 = phrases[1]
    assert phrase2.start == 34
    assert phrase2.end == 50
    assert phrase2.text == "two-token phrase"
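# A standalone sanity check (not part of the test suite) showing that the
# expected offsets above match plain Python string indexing on the same input:
SENTENCE = "This sentence contains word and a two-token phrase."
assert SENTENCE[23:27] == "word"
assert SENTENCE[34:50] == "two-token phrase"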
Example #5
    def parse(self, tex_path: str, tex: str) -> Iterator[Term]:
        phrase_extractor = PhraseExtractor(list(self.glossary.keys()))
        for i, phrase in enumerate(phrase_extractor.parse(tex_path, tex)):
            entries = self.glossary[phrase.text]
            definitions = [e.definition for e in entries]
            sources = [e.source for e in entries]
            yield Term(
                id_=f"glossary-term-{i}",
                start=phrase.start,
                end=phrase.end,
                tex=phrase.tex,
                text=phrase.text,
                type_=None,
                tex_path=tex_path,
                context_tex=phrase.context_tex,
                definitions=definitions,
                sources=sources,
                sentence_id=None,
            )
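# A hedged usage sketch for the parse() method above. The wrapper class name
# 'GlossaryTermExtractor' and the 'GlossaryEntry' record are assumptions made
# for illustration; the entry shape (.definition, .source) is inferred from
# the loop body above.
from dataclasses import dataclass

@dataclass
class GlossaryEntry:  # hypothetical record; fields inferred from parse() above
    definition: str
    source: str

glossary = {
    "two-token phrase": [GlossaryEntry("An example definition.", "demo")],
}
extractor = GlossaryTermExtractor(glossary)  # hypothetical class that owns parse()
for term in extractor.parse("main.tex", r"This has a \emph{two-token phrase}."):
    print(term.id_, term.text, term.definitions, term.sources)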
def test_extract_phrase_containing_single_letter():
    extractor = PhraseExtractor(["T"])
    phrases = list(
        extractor.parse("main.tex", "This sentence contains the letter T."))
    assert len(phrases) == 1
    assert phrases[0].text == "T"
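# Taken together, the assertions in these tests imply a small result record
# for each extracted phrase. A hedged reconstruction of its shape (field names
# come from the assertions and from the glossary parse() method above; the
# actual class in the project may differ):
from dataclasses import dataclass

@dataclass
class Phrase:
    start: int        # offset of the match in the input string
    end: int          # end offset (exclusive)
    text: str         # normalized text of the phrase, e.g. "D&M"
    tex: str          # the matched TeX span, e.g. r"D\&M"
    context_tex: str  # surrounding TeX, as read by parse() in Example #5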
Example #7
    def process(
        self, item: DetectDefinitionsTask
    ) -> Iterator[Union[Definiendum, Definition, TermReference]]:
        sentences_ordered = sorted(item.sentences, key=lambda s: s.start)
        num_sentences = len(sentences_ordered)

        if num_sentences == 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences found for arXiv ID %s. Skipping detection of sentences "
                + "that contain entities.",
                item.arxiv_id,
            )
            return

        end_position_of_last_sentence = sentences_ordered[-1].end

        # Load the pre-trained definition detection model.
        prediction_type = "DocDef2+AI2020+W00"
        model = DefinitionDetectionModel(prediction_type)

        definition_index = 0
        features = []
        sentences: List[EmbellishedSentence] = []

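        # Paper-wide accumulators: definienda grouped by term text, the raw
        # phrases of each definiendum type (used at the end of this method to
        # find other references to defined terms), and definitions keyed by ID.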
        definiendums: Dict[TermName, List[Definiendum]] = defaultdict(list)
        term_phrases: List[str] = []
        abbreviations: List[str] = []
        symbol_nicks: List[str] = []
        definitions: Dict[DefinitionId, Definition] = {}

        with tqdm(total=num_sentences,
                  disable=(not self.args.show_progress)) as progress:

            for sentence_index, sentence in enumerate(sentences_ordered):
                progress.update(1)

                # Only attempt to process sentences that have been marked as likely to be proper
                # plaintext. Note that this means some sentences may be skipped that didn't pass
                # heuristics in the sentence extractor.
                if not sentence.validity_guess:
                    continue

                # Extract features from raw text.
                featurized_text = model.featurize(
                    sentence.legacy_definition_input)
                features.append(featurized_text)
                sentences.append(sentence)

                # Process sentences in batches.
                if (len(features) >= self.args.batch_size
                        or sentence_index == num_sentences - 1):

                    # Detect terms and definitions in each sentence with a pre-trained definition
                    # extraction model, from the featurized text.

                    (_, slots, slots_confidence) = model.predict_batch(
                        cast(List[Dict[Any, Any]], features))

                    # Package extracted terms and definitions into a representation that's
                    # easier to process.
                    for (
                            s,
                            sentence_features,
                            termdef_sentence_slots,
                            termdef_sentence_slots_confidence,
                            abbrexp_sentence_slots,
                            abbrexp_sentence_slots_confidence,
                            symnick_sentence_slots,
                            symnick_sentence_slots_confidence,
                    ) in zip(
                            sentences,
                            features,
                            slots["W00"],
                            slots_confidence["W00"],
                            slots["AI2020"],
                            slots_confidence["AI2020"],
                            slots["DocDef2"],
                            slots_confidence["DocDef2"],
                    ):
                        # Extract TeX for each symbol from a parallel representation of the
                        # sentence, so that the TeX for symbols can be saved.
                        symbol_texs = get_symbol_texs(
                            s.legacy_definition_input, s.with_formulas_marked)

                        # Three types of [term, definition] pairs are extracted:
                        #   [nickname, definition] for symbols (DocDef2).
                        #   [abbreviation, expansion] for abbreviations (AI2020).
                        #   [term, definition] for all other terms (W00).

                        # Only process slots when they include both 'TERM' and 'DEF'.
                        if ("TERM" not in termdef_sentence_slots
                                or "DEF" not in termdef_sentence_slots):
                            term_definition_pairs = []
                        else:
                            term_definition_pairs = consolidate_keyword_definitions(
                                s.legacy_definition_input,
                                sentence_features["tokens"],
                                termdef_sentence_slots,
                                termdef_sentence_slots_confidence,
                                "W00",
                            )

                        if ("TERM" not in abbrexp_sentence_slots
                                or "DEF" not in abbrexp_sentence_slots):
                            abbreviation_expansion_pairs = []
                        else:
                            abbreviation_expansion_pairs = consolidate_keyword_definitions(
                                s.legacy_definition_input,
                                sentence_features["tokens"],
                                abbrexp_sentence_slots,
                                abbrexp_sentence_slots_confidence,
                                "AI2020",
                            )

                        if ("TERM" not in symnick_sentence_slots
                                or "DEF" not in symnick_sentence_slots):
                            symbol_nickname_pairs = []
                        else:
                            symbol_nickname_pairs = consolidate_keyword_definitions(
                                s.legacy_definition_input,
                                sentence_features["tokens"],
                                symnick_sentence_slots,
                                symnick_sentence_slots_confidence,
                                "DocDef2",
                            )

                        pairs = (term_definition_pairs +
                                 symbol_nickname_pairs +
                                 abbreviation_expansion_pairs)
                        for pair in pairs:
                            tex_path = s.tex_path
                            definiendum_id = (
                                f"definiendum-{tex_path}-{definition_index}")
                            definition_id = f"definition-{tex_path}-{definition_index}"
                            definiendum_text = pair.term_text
                            definiendum_type = pair.term_type
                            definition_type = pair.definition_type

                            definiendum_confidence = pair.term_confidence
                            definition_confidence = pair.definition_confidence

                            # Map definiendum and definition start and end positions back to
                            # their original positions in the TeX.
                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.term_start, pair.term_end)
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definiendum %s in original TeX "
                                    +
                                    "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.term_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definiendum_start = s.start + offsets[0]
                            definiendum_end = s.start + offsets[1]

                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.definition_start, pair.definition_end)
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definition %s in original TeX "
                                    +
                                    "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.definition_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definition_start = s.start + offsets[0]
                            definition_end = s.start + offsets[1]

                            # Extract document-level features from sentence.
                            position_ratio = (definiendum_start /
                                              end_position_of_last_sentence)
                            section_name = s.section_name

                            try:
                                tex = item.tex_by_file[tex_path]
                            except KeyError:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find TeX for %s. TeX will not be included in "
                                    +
                                    "the output data for definition '%s' for term '%s'",
                                    tex_path,
                                    pair.definition_text,
                                    definiendum_text,
                                )
                                definiendum_tex = "NOT AVAILABLE"
                                definition_tex = "NOT AVAILABLE"
                            else:
                                if (definiendum_type == "symbol"
                                        and symbol_texs is not None
                                        and pair.term_start in symbol_texs):
                                    definiendum_tex = symbol_texs[
                                        pair.term_start]
                                    definiendum_text = definiendum_tex
                                else:
                                    definiendum_tex = tex.contents[
                                        definiendum_start:definiendum_end]
                                definition_tex = tex.contents[
                                    definition_start:definition_end]

                            # Save the definition to file.
                            definition = Definition(
                                id_=definition_id,
                                start=definition_start,
                                end=definition_end,
                                definiendum=definiendum_text,
                                type_=definition_type,
                                tex_path=tex_path,
                                tex=definition_tex,
                                text=pair.definition_text,
                                context_tex=s.context_tex,
                                sentence_id=s.id_,
                                intent=True,
                                confidence=definition_confidence,
                            )
                            definitions[definition_id] = definition
                            yield definition

                            # Don't save the definiendum to file yet. Hold it in memory
                            # until processing finishes: it still needs to be associated
                            # with the other definitions of the same term, and other
                            # references to the term will be detected before this method
                            # returns.
                            definiendum = Definiendum(
                                id_=definiendum_id,
                                text=definiendum_text,
                                type_=definiendum_type,
                                confidence=definiendum_confidence,
                                # Link the definiendum to the text that defined it.
                                definition_id=definition_id,
                                # Because a term can be defined multiple places in the paper, these
                                # three lists of definition data will be filled out once all of the
                                # definitions have been found.
                                definition_ids=[],
                                definitions=[],
                                definition_texs=[],
                                sources=[],
                                start=definiendum_start,
                                end=definiendum_end,
                                tex_path=tex_path,
                                tex=definiendum_tex,
                                context_tex=s.context_tex,
                                sentence_id=s.id_,
                                # Document-level features below.
                                position_ratio=position_ratio,
                                position_ratios=[],
                                section_name=section_name,
                                section_names=[],
                            )
                            definiendums[definiendum_text].append(definiendum)
                            if definiendum.type_ == "term":
                                term_phrases.append(definiendum.text)
                            if definiendum.type_ == "abbreviation":
                                abbreviations.append(definiendum.text)
                            if definiendum.type_ == "symbol":
                                symbol_nicks.append(definiendum.text)

                            definition_index += 1

                    features = []
                    sentences = []

        logging.debug(
            "Finished detecting definitions for paper %s. Now finding references to defined terms.",
            item.arxiv_id,
        )

        all_definiendums: List[Definiendum] = []
        for _, definiendum_list in definiendums.items():
            all_definiendums.extend(definiendum_list)

        definition_ids: Dict[TermName, List[DefinitionId]] = {}
        definition_texs: Dict[TermName, List[str]] = {}
        definition_texts: Dict[TermName, List[str]] = {}
        sources: Dict[TermName, List[str]] = {}
        position_ratios: Dict[TermName, List[float]] = {}
        section_names: Dict[TermName, List[str]] = {}

        # Associate terms with all definitions that apply to them.
        for term, definiendum_list in definiendums.items():
            definition_ids[term] = [
                definiendum.definition_id for definiendum in definiendum_list
            ]
            definition_texs[term] = [
                definitions[definiendum.definition_id].tex
                for definiendum in definiendum_list
            ]
            definition_texts[term] = [
                definitions[definiendum.definition_id].text
                for definiendum in definiendum_list
            ]
            sources[term] = ["model"] * len(definition_ids[term])
            position_ratios[term] = [
                definiendum.position_ratio for definiendum in definiendum_list
            ]
            section_names[term] = [
                definiendum.section_name for definiendum in definiendum_list
                if definiendum.section_name is not None
            ]

        # Associate each definiendum with all applicable definitions, and save them to file.
        for _, definiendum_list in definiendums.items():
            for definiendum in definiendum_list:
                definiendum.definition_ids.extend(
                    definition_ids[definiendum.text])
                definiendum.definition_texs.extend(
                    definition_texs[definiendum.text])
                definiendum.definitions.extend(
                    definition_texts[definiendum.text])
                definiendum.sources.extend(sources[definiendum.text])
                definiendum.position_ratios.extend(
                    position_ratios[definiendum.text])
                definiendum.section_names.extend(
                    section_names[definiendum.text])
                yield definiendum

        # Detect all other references to the defined terms. Detect references to textual
        # terms and abbreviations. References to symbols need not be found here; they
        # will be detected automatically in the symbol extraction code.
        term_index = 0

        for tex_path, file_contents in item.tex_by_file.items():
            term_extractor = PhraseExtractor(term_phrases + abbreviations)
            for t in term_extractor.parse(tex_path, file_contents.contents):

                # Don't save term references if they are already in the definiendums.
                if any(overlaps(definiendum, t)
                       for definiendum in all_definiendums):
                    continue

                logging.debug(
                    "Found reference to term %s at (%d, %d) in %s for arXiv ID %s",
                    t.text,
                    t.start,
                    t.end,
                    t.tex_path,
                    item.arxiv_id,
                )
                type_ = ("abbreviation" if t.text in abbreviations else
                         "term" if t.text in term_phrases else
                         "symbol" if t.text in symbol_nicks else "unknown")
                yield TermReference(
                    id_=f"term-{t.tex_path}-{term_index}",
                    text=t.text,
                    type_=type_,
                    definition_ids=definition_ids[t.text],
                    definitions=definition_texts[t.text],
                    definition_texs=definition_texs[t.text],
                    sources=sources[t.text],
                    position_ratios=position_ratios[t.text],
                    section_names=section_names[t.text],
                    start=t.start,
                    end=t.end,
                    tex_path=t.tex_path,
                    tex=t.tex,
                    context_tex=t.context_tex,
                )
                term_index += 1
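# The 'overlaps' helper used in the example above is not shown. A minimal
# sketch of the span test it plausibly performs, assuming (as the code above
# does) that entities carry 'tex_path', 'start', and 'end':
def overlaps(a, b) -> bool:
    # Entities overlap if they sit in the same TeX file and their
    # [start, end) character spans intersect.
    return a.tex_path == b.tex_path and a.start < b.end and b.start < a.end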
Example #8
    def process(
        self, item: DetectDefinitionsTask
    ) -> Iterator[Union[Definiendum, Definition, TermReference]]:
        sentences_ordered = sorted(item.sentences, key=lambda s: s.start)
        num_sentences = len(sentences_ordered)

        if len(item.sentences) == 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences found for arXiv ID %s. Skipping detection of sentences "
                + "that contain entities.",
                item.arxiv_id,
            )
            return

        # Load the pre-trained definition detection model.
        model = DefinitionDetectionModel()

        definition_index = 0
        features = []
        sentences = []

        definiendums: Dict[TermName, List[Definiendum]] = defaultdict(list)
        definitions: Dict[DefinitionId, Definition] = {}

        with tqdm(
            total=num_sentences, disable=(not self.args.show_progress)
        ) as progress:

            for si, sentence in enumerate(sentences_ordered):
                progress.update(1)

                # Only attempt to process sentences that have been marked as likely to be proper
                # plaintext. Note that this means some sentences may be skipped that didn't pass
                # heuristics in the sentence extractor.
                if not sentence.validity_guess:
                    continue

                # Extract features from raw text.
                featurized_text = model.featurize(sentence.legacy_definition_input)
                features.append(featurized_text)
                sentences.append(sentence)

                # Process sentences in batches.
                if len(features) >= self.args.batch_size or si == num_sentences - 1:

                    # Detect terms and definitions in each sentence with a pre-trained definition
                    # extraction model, from the featurized text.
                    intents, slots = model.predict_batch(
                        cast(List[Dict[Any, Any]], features)
                    )

                    for s, sentence_features, intent, sentence_slots in zip(
                        sentences, features, intents, slots
                    ):
                        # Only process slots when they include both 'TERM' and 'DEF'.
                        if "TERM" not in sentence_slots or "DEF" not in sentence_slots:
                            continue

                        # Package extracted terms and definitions into a representation that's
                        # easier to process.
                        pairs = get_term_definition_pairs(
                            s.legacy_definition_input,
                            sentence_features,
                            sentence_slots,
                        )

                        # Extract TeX for each symbol from a parallel representation of the
                        # sentence, so that the TeX for symbols can be saved.
                        symbol_texs = get_symbol_texs(
                            s.legacy_definition_input, s.with_equation_tex
                        )

                        for pair in pairs:

                            tex_path = s.tex_path
                            definiendum_id = (
                                f"definiendum-{tex_path}-{definition_index}"
                            )
                            definition_id = f"definition-{tex_path}-{definition_index}"
                            definiendum_text = pair.term_text
                            definiendum_type = (
                                "symbol" if "SYMBOL" in definiendum_text else "term"
                            )

                            # Map definiendum and definition start and end positions back to
                            # their original positions in the TeX.
                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.term_start, pair.term_end
                            )
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definiendum %s in original TeX "
                                    + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.term_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definiendum_start = s.start + offsets[0]
                            definiendum_end = s.start + offsets[1]

                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.definition_start, pair.definition_end
                            )
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definition %s in original TeX "
                                    + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.definition_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definition_start = s.start + offsets[0]
                            definition_end = s.start + offsets[1]

                            try:
                                tex = item.tex_by_file[tex_path]
                            except KeyError:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find TeX for %s. TeX will not be included in "
                                    + "the output data for definition '%s' for term '%s'",
                                    tex_path,
                                    pair.definition_text,
                                    definiendum_text,
                                )
                                definiendum_tex = "NOT AVAILABLE"
                                definition_tex = "NOT AVAILABLE"
                            else:
                                if (
                                    definiendum_type == "symbol"
                                    and symbol_texs is not None
                                    and pair.term_start in symbol_texs
                                ):
                                    definiendum_tex = symbol_texs[pair.term_start]
                                    definiendum_text = definiendum_tex
                                else:
                                    definiendum_tex = tex.contents[
                                        definiendum_start:definiendum_end
                                    ]
                                definition_tex = tex.contents[
                                    definition_start:definition_end
                                ]

                            # Save the definition to file.
                            definition = Definition(
                                id_=definition_id,
                                start=definition_start,
                                end=definition_end,
                                definiendum=definiendum_text,
                                type_=None,
                                tex_path=tex_path,
                                tex=definition_tex,
                                text=pair.definition_text,
                                context_tex=s.context_tex,
                                sentence_id=s.id_,
                                intent=bool(intent),
                                confidence=None,
                            )
                            definitions[definition_id] = definition
                            yield definition

                            # Don't save the definiendum to file yet. Hold it in memory
                            # until processing finishes: it still needs to be associated
                            # with the other definitions of the same term, and other
                            # references to the term will be detected before this method
                            # returns.
                            definiendums[definiendum_text].append(
                                Definiendum(
                                    id_=definiendum_id,
                                    text=definiendum_text,
                                    type_=definiendum_type,
                                    confidence=None,
                                    # Link the definiendum to the text that defined it.
                                    definition_id=definition_id,
                                    # Because a term can be defined multiple places in the paper, these
                                    # three lists of definition data will be filled out once all of the
                                    # definitions have been found.
                                    definition_ids=[],
                                    definitions=[],
                                    definition_texs=[],
                                    sources=[],
                                    start=definiendum_start,
                                    end=definiendum_end,
                                    tex_path=tex_path,
                                    tex=definiendum_tex,
                                    context_tex=s.context_tex,
                                    sentence_id=s.id_,
                                )
                            )
                            definition_index += 1

                    features = []
                    sentences = []

        logging.debug(
            "Finished detecting definitions for paper %s. Now finding references to defined terms.",
            item.arxiv_id,
        )

        all_definiendums: List[Definiendum] = []
        for _, definiendum_list in definiendums.items():
            all_definiendums.extend(definiendum_list)
        term_phrases: List[TermName] = list(definiendums.keys())
        definition_ids: Dict[TermName, List[DefinitionId]] = {}
        definition_texs: Dict[TermName, List[str]] = {}
        definition_texts: Dict[TermName, List[str]] = {}
        sources: Dict[TermName, List[str]] = {}

        # Associate terms with all definitions that apply to them.
        for term, definiendum_list in definiendums.items():
            definition_ids[term] = [d.definition_id for d in definiendum_list]
            definition_texs[term] = [
                definitions[d.definition_id].tex for d in definiendum_list
            ]
            definition_texts[term] = [
                definitions[d.definition_id].text for d in definiendum_list
            ]
            sources[term] = ["model"] * len(definition_ids[term])

        # Associate each definiendum with all applicable definitions, and save them to file.
        for _, definiendum_list in definiendums.items():
            for d in definiendum_list:
                d.definition_ids.extend(definition_ids[d.text])
                d.definition_texs.extend(definition_texs[d.text])
                d.definitions.extend(definition_texts[d.text])
                d.sources.extend(sources[d.text])
                yield d

        # Detect all other references to the defined terms.
        term_index = 0
        sentence_entities: List[SerializableEntity] = cast(
            List[SerializableEntity], item.sentences
        )

        for tex_path, file_contents in item.tex_by_file.items():
            term_extractor = PhraseExtractor(term_phrases)
            for t in term_extractor.parse(tex_path, file_contents.contents):
                t_sentence = get_containing_entity(t, sentence_entities)

                # Don't save term references if they are already in the definiendums.
                if any(overlaps(d, t) for d in all_definiendums):
                    continue

                logging.debug(
                    "Found reference to term %s at (%d, %d) in %s for arXiv ID %s",
                    t.text,
                    t.start,
                    t.end,
                    t.tex_path,
                    item.arxiv_id,
                )
                yield TermReference(
                    id_=f"term-{t.tex_path}-{term_index}",
                    text=t.text,
                    type_=None,
                    definition_ids=definition_ids[t.text],
                    definitions=definition_texts[t.text],
                    definition_texs=definition_texs[t.text],
                    sources=sources[t.text],
                    start=t.start,
                    end=t.end,
                    tex_path=t.tex_path,
                    tex=t.tex,
                    context_tex=t.context_tex,
                    sentence_id=t_sentence.id_ if t_sentence is not None else None,
                )
                term_index += 1
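# 'get_containing_entity' is called in the example above without its
# definition. A hedged sketch of the containment lookup it implies: return the
# first sentence in the same file whose span encloses the term, else None.
from typing import List, Optional

def get_containing_entity(
    entity: SerializableEntity, entities: List[SerializableEntity]
) -> Optional[SerializableEntity]:
    for e in entities:
        if (e.tex_path == entity.tex_path
                and e.start <= entity.start
                and entity.end <= e.end):
            return e
    return None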