コード例 #1
0
    def acronyms_and_definitions(self, **kwargs):
        """
        Map the acronyms found in this doc to their most likely definitions.

        When an acronym has several candidate definitions, only the one that
        occurs most frequently is kept.

        This is a thin wrapper: it passes ``self.spacy_doc`` plus any keyword
        arguments, unchanged, to the extraction function.

        .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
        for all function kwargs.
        """
        acro_defs = extract.acronyms_and_definitions(self.spacy_doc, **kwargs)
        return acro_defs
コード例 #2
0
ファイル: texts.py プロジェクト: EricSchles/textacy
    def acronyms_and_definitions(self, **kwargs):
        """
        Collect acronyms from this doc together with their likeliest definitions.

        If more than one definition is found for an acronym, the most
        frequently occurring one wins.

        All keyword arguments are forwarded as-is, along with
        ``self.spacy_doc``, to the underlying extraction function.

        .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
        for all function kwargs.
        """
        found = extract.acronyms_and_definitions(self.spacy_doc, **kwargs)
        return found
コード例 #3
0
ファイル: texts.py プロジェクト: kevntao/textacy
    def acronyms_and_definitions(self, known_acro_defs=None):
        """
        Map the acronyms found in this doc to their most likely definitions.

        When an acronym has several candidate definitions, only the one that
        occurs most frequently is kept.

        Args:
            known_acro_defs (dict, optional): acronym/definition pairs that are
                already known, given as {acronym (str): definition (str)};
                no new definition is searched for these acronyms

        Returns:
            dict: unique acronyms (keys) with matched definitions (values)

        .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
        for all function kwargs.
        """
        acro_defs = extract.acronyms_and_definitions(
            self.spacy_doc,
            known_acro_defs=known_acro_defs,
        )
        return acro_defs
コード例 #4
0
    def acronyms_and_definitions(self, known_acro_defs=None):
        """
        Collect acronyms from this doc together with their likeliest definitions.

        If more than one definition is found for an acronym, the most
        frequently occurring one wins.

        Args:
            known_acro_defs (dict, optional): pre-established pairs of the form
                {acronym (str): definition (str)}; the algorithm does not try
                to find new definitions for acronyms supplied here

        Returns:
            dict: unique acronyms (keys) with matched definitions (values)

        .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
        for all function kwargs.
        """
        # Pure delegation: the wrapped spacy doc is the only state read here.
        found = extract.acronyms_and_definitions(self.spacy_doc, known_acro_defs=known_acro_defs)
        return found
コード例 #5
0
ファイル: test_extract.py プロジェクト: EricSchles/textacy
 def test_acronyms_and_definitions(self):
     # With no known definitions supplied, the fixture doc is expected to
     # yield the acronym 'I.M.F.' paired with an empty definition.
     result = extract.acronyms_and_definitions(self.spacy_doc)
     self.assertEqual(result, {'I.M.F.': ''})
コード例 #6
0
def get_acronyms_and_definitions(doc, known_acro_defs=None):
    """
    Check the document type, then extract acronyms and their definitions.

    Args:
        doc: a ``textacy.Doc`` or ``spacy.tokens.Doc`` instance.
        known_acro_defs: optional value forwarded positionally as the second
            argument of ``extract.acronyms_and_definitions``.

    Returns:
        Whatever ``extract.acronyms_and_definitions`` returns for ``doc``.

    Raises:
        AssertionError: if ``doc`` is neither of the supported types. The check
            is an ``assert`` statement, so it is skipped when Python runs
            with ``-O``.
    """
    is_supported = isinstance(doc, textacy.Doc) or isinstance(doc, spacy.tokens.Doc)
    # The message is only formatted when the assertion actually fails.
    assert is_supported, "Only {} are supported".format(possible_docs)
    acro_defs = extract.acronyms_and_definitions(doc, known_acro_defs)
    return acro_defs
コード例 #7
0
ファイル: test_extract.py プロジェクト: jakemcc/textacy
 def test_acronyms_and_definitions(self):
     # Default extraction on the fixture doc should report 'I.M.F.' with an
     # empty string as its definition.
     actual = extract.acronyms_and_definitions(self.spacy_doc)
     self.assertEqual(actual, {'I.M.F.': ''})
コード例 #8
0
ファイル: test_extract.py プロジェクト: henridwyer/textacy
 def test_acronyms_and_definitions_known(self):
     # A definition supplied through known_acro_defs should come back
     # unchanged for the matching acronym.
     known = {'I.M.F.': 'International Monetary Fund'}
     observed = extract.acronyms_and_definitions(self.spacy_doc, known_acro_defs=known)
     # The expected value is a separate literal so the check does not depend
     # on the identity of the dict passed in.
     self.assertEqual(observed, {'I.M.F.': 'International Monetary Fund'})
コード例 #9
0
 def test_known(self, spacy_doc):
     """A definition passed via ``known_acro_defs`` is returned for its acronym."""
     known = {"I.M.F.": "International Monetary Fund"}
     observed = extract.acronyms_and_definitions(spacy_doc, known_acro_defs=known)
     # Compare against an independent literal, not the dict that was passed in.
     assert observed == {"I.M.F.": "International Monetary Fund"}
コード例 #10
0
 def test_default(self, spacy_doc):
     """Extraction without known definitions yields an empty definition for 'I.M.F.'."""
     # TODO: figure out if this function no longer works; the expectation
     # used to be {"I.M.F.": "International Monetary Fund"}.
     observed = extract.acronyms_and_definitions(spacy_doc)
     assert observed == {"I.M.F.": ""}
コード例 #11
0
ファイル: model.py プロジェクト: wd6/concept-tagging-training
def extract_from_doc(doc):
    """
    Build a mapping from features found in a spacy doc to a tag per feature.

    Four groups of features are collected and merged in the order below; on a
    key collision a later group overwrites an earlier one:

    1. lemmas of tokens accepted by ``should_keep`` for the parts of speech
       "NOUN" and "PROPN", tagged with the token's part of speech;
    2. noun chunks, written as the space-joined lemmas of their non-stop-word
       tokens, that are not already single-token terms and are neither
       "-PRON-" nor empty, tagged "NOUN_CHUNK";
    3. entity lemmas whose label is not ORDINAL, CARDINAL, QUANTITY, DATE or
       PERCENT and which are not already terms or noun chunks, tagged "ENT";
    4. acronyms with a non-empty definition, keyed as
       "<acronym> - <definition>" and tagged "ACRONYM".

    Args:
        doc (spacy.doc): a doc processed by the spacy 'en' model

    Returns:
        terms_tagged (dict): features with their respective tags

    Examples:
        >>> from dsconcept.model import extract_from_doc
        >>> import spacy
        >>> nlp = spacy.load('en_core_web_sm')
        >>> txt = "The ship hung in the sky much the same way bricks don't."
        >>> doc = nlp(txt)
        >>> features = extract_from_doc(doc)  # dict of feature -> tag
    """
    # TODO: change this function such that it processes better but maintains the same interface.

    # Single-token terms. Which tokens qualify is decided by should_keep
    # (defined elsewhere); the original note says it is meant to drop % signs
    # and stop words -- confirm against that helper.
    wanted_pos = ["NOUN", "PROPN"]
    unigram_terms = {}
    for token in doc:
        if should_keep(token, wanted_pos):
            unigram_terms[token.lemma_] = token.pos_

    # Noun chunks: lemmatize each non-stop-word token and join with a space.
    chunk_labels = {}
    for chunk in doc.noun_chunks:
        chunk_text = " ".join([tok.lemma_ for tok in chunk if not tok.is_stop])
        chunk_labels[chunk_text] = chunk.label_

    # Filter out noun chunks that duplicate a single-token term or are excluded.
    excluded_chunks = ["-PRON-", ""]
    chunk_terms = {}
    for chunk_text in chunk_labels:
        if chunk_text in unigram_terms or chunk_text in excluded_chunks:
            continue
        chunk_terms[chunk_text.strip()] = "NOUN_CHUNK"

    # TODO: entities take precedence over noun chunks
    # Entities, minus those colliding with terms or noun chunks.
    skipped_ent_labels = ["ORDINAL", "CARDINAL", "QUANTITY", "DATE", "PERCENT"]
    entity_labels = {}
    for entity in doc.ents:
        if entity.label_ not in skipped_ent_labels:
            entity_labels[entity.lemma_] = entity.label_
    entity_terms = {}
    for entity_text in entity_labels:
        if entity_text not in unigram_terms and entity_text not in chunk_terms:
            entity_terms[entity_text] = "ENT"

    # Add acronyms which have definitions.
    # These acronyms could create noise if they are not good. Maybe better to use their definitions.
    # This schema will only pull out identical definitions. No lemmatizing, no fuzzy matching.
    # TODO: add lemmatizing and fuzzy matching for acronyms. This code exists in acronyms project.
    acronym_terms = {}
    for acronym, definition in acronyms_and_definitions(doc).items():
        if definition != "":
            acronym_terms["{} - {}".format(acronym, definition)] = "ACRONYM"

    # Merge in the same order as the groups were built so that later groups
    # overwrite earlier ones on a collision.
    terms_tagged = dict(unigram_terms)
    for group in (chunk_terms, entity_terms, acronym_terms):
        terms_tagged.update(group)
    return terms_tagged
コード例 #12
0
 def test_default(self, spacy_lang, text, known, exp):
     """Extraction on ``spacy_lang(text)`` with ``known`` definitions equals ``exp``."""
     doc = spacy_lang(text)
     obs = extract.acronyms_and_definitions(doc, known_acro_defs=known)
     assert obs == exp
コード例 #13
0
 def test_default(self, spacy_lang, text, exp):
     """Extraction on ``spacy_lang(text)`` with default arguments equals ``exp``."""
     doc = spacy_lang(text)
     obs = extract.acronyms_and_definitions(doc)
     assert obs == exp
コード例 #14
0
ファイル: test_extract.py プロジェクト: wangziyi2016/textacy
def test_acronyms_and_definitions_known(spacy_doc):
    """A definition passed via ``known_acro_defs`` is returned for its acronym."""
    known = {'I.M.F.': 'International Monetary Fund'}
    observed = extract.acronyms_and_definitions(spacy_doc, known_acro_defs=known)
    # Compare against an independent literal, not the dict that was passed in.
    assert observed == {'I.M.F.': 'International Monetary Fund'}
コード例 #15
0
ファイル: test_extract.py プロジェクト: wangziyi2016/textacy
def test_acronyms_and_definitions(spacy_doc):
    """Default extraction on the fixture doc gives 'I.M.F.' an empty definition."""
    observed = extract.acronyms_and_definitions(spacy_doc)
    assert observed == {'I.M.F.': ''}