Example #1
    def run(self, input_doc: Doc) -> Doc:
        output_doc = deepcopy(input_doc)
        output_doc.words = []
        tokenizer_obj = self.algorithm

        tokens = tokenizer_obj.tokenize(output_doc.raw)
        indices = tokenizer_obj.compute_indices(output_doc.raw, tokens)
        for index, token in enumerate(tokens):
            word_obj = Word(
                string=token,
                index_token=index,
                index_char_start=indices[index],
                index_char_stop=indices[index] + len(token),
            )
            output_doc.words.append(word_obj)
        return output_doc
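
The pairing of ``tokenize`` with ``compute_indices`` is what lets each ``Word`` carry character offsets back into the raw text. A minimal, self-contained sketch of such offset computation, scanning left to right with ``str.find`` (this illustrates the idea only and is not the CLTK's actual ``compute_indices``):

# Offset computation by left-to-right scanning; illustrative only,
# not CLTK's actual compute_indices implementation.
raw_text = "Gallia est omnis divisa in partes tres"
tokens = raw_text.split(" ")

indices = []
cursor = 0
for token in tokens:
    start = raw_text.find(token, cursor)  # search from the last match onward
    indices.append(start)
    cursor = start + len(token)

# Each start/stop pair mirrors Word.index_char_start / Word.index_char_stop.
for token, start in zip(tokens, indices):
    assert raw_text[start : start + len(token)] == token
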
Example #2
    def test_nlp_latin_stops(self):
        lang = "lat"  # type: str
        cltk_nlp = NLP(language=lang)  # type: NLP
        self.assertIsInstance(cltk_nlp, NLP)
        lat_pipeline = cltk_nlp.pipeline  # type: Pipeline
        pipeline_just_stops = [
            proc for proc in lat_pipeline.processes
            if proc.__name__ == "StopsProcess"
        ]  # type: List[Process]
        self.assertEqual(len(pipeline_just_stops), 1)
        stops_class = pipeline_just_stops[0]  # type: StopsProcess
        self.assertIs(stops_class, StopsProcess)
        words = [
            Word(string=token)
            for token in split_punct_ws(get_example_text(lang))
        ]
        doc = Doc(words=words)
        stops_obj = stops_class(language=lang)
        output_doc = stops_obj.run(input_doc=doc)
        is_stops = [w.stop for w in output_doc.words]  # type: List[bool]
        self.assertEqual(len(words), len(is_stops))
        self.assertIsInstance(is_stops[0], bool)
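
The test above relies on ``split_punct_ws`` for naive tokenization. For readers running the snippet without CLTK installed, a hypothetical regex stand-in (an assumption, not the CLTK implementation):

import re

# Hypothetical stand-in for split_punct_ws: split on whitespace and
# common punctuation, dropping empty strings. Not CLTK's actual code.
def split_punct_ws(text: str) -> list:
    return [t for t in re.split(r"[\s.,;:!?]+", text) if t]

print(split_punct_ws("Gallia est omnis divisa in partes tres."))
# ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']
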
Example #3
    def test_embeddings_processes(self):
        language = "arc"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = AramaicEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )

        language = "got"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = GothicEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )

        language = "grc"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = GreekEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )

        language = "lat"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = LatinEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )

        language = "ang"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = OldEnglishEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )

        language = "pli"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = PaliEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )

        language = "san"
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = SanskritEmbeddingsProcess(
            input_doc=Doc(raw=get_example_text(language), words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )
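
Since the seven blocks differ only in the language code and the process class, the same test can be written table-driven. A sketch, assuming the same imports and placement inside the test method as above:

# Table-driven variant of the blocks above; assumes the same process
# classes, Word, Doc, numpy, and get_example_text imports as the test.
cases = [
    ("arc", AramaicEmbeddingsProcess),
    ("got", GothicEmbeddingsProcess),
    ("grc", GreekEmbeddingsProcess),
    ("lat", LatinEmbeddingsProcess),
    ("ang", OldEnglishEmbeddingsProcess),
    ("pli", PaliEmbeddingsProcess),
    ("san", SanskritEmbeddingsProcess),
]
for language, process_class in cases:
    tokens = [Word(string=t) for t in get_example_text(language).split(" ")]
    a_process = process_class(
        input_doc=Doc(raw=get_example_text(language), words=tokens)
    )
    a_process.run()
    self.assertIsInstance(
        a_process.output_doc.words[1].embedding, numpy.ndarray
    )
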
Example #4
    def stanza_to_cltk_word_type(stanza_doc):
        """Take an entire ``stanza`` document, extract
        each word, and encode it in the way expected by
        the CLTK's ``Word`` type.

        >>> from cltk.dependency.processes import StanzaProcess
        >>> from cltk.languages.example_texts import get_example_text
        >>> process_stanza = StanzaProcess(language="lat")
        >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
        >>> isinstance(cltk_words, list)
        True
        >>> isinstance(cltk_words[0], Word)
        True
        >>> cltk_words[0]
        Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos='NOUN', \
lemma='mallis', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=3, \
features={'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, embedding=None, stop=None, named_entity=None)
        """
        words_list = list()  # type: List[Word]

        for sentence_index, sentence in enumerate(stanza_doc.sentences):
            sent_words = dict()  # type: Dict[int, Word]
            indices = list()  # type: List[Tuple[int, int]]

            for token_index, token in enumerate(sentence.tokens):
                stanza_word = token.words[0]  # type: stanza.pipeline.doc.Word
                # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
                cltk_word = Word(
                    index_token=int(stanza_word.id)
                    - 1,  # subtract 1 from id b/c Stanza starts its index at 1
                    index_sentence=sentence_index,
                    string=stanza_word.text,  # same as ``token.text``
                    pos=stanza_word.pos,
                    xpos=stanza_word.xpos,
                    upos=stanza_word.upos,
                    lemma=stanza_word.lemma,
                    dependency_relation=stanza_word.deprel,
                    governor=stanza_word.head - 1
                    if stanza_word.head
                    else -1,  # note: if val becomes ``-1`` then there is no governor, i.e. the word is root; ``fro`` sometimes gives None (unclear why)
                    features=dict()
                    if not stanza_word.feats
                    else dict([f.split("=") for f in stanza_word.feats.split("|")]),
                )  # type: Word
                # sent_words[cltk_word.index_token] = cltk_word
                words_list.append(cltk_word)

                # # TODO: Fix this, I forget what we were tracking in this
                # indices.append(
                #     (
                #         int(stanza_word.governor)
                #         - 1,  # -1 to match CLTK Word.index_token
                #         int(stanza_word.parent_token.index)
                #         - 1,  # -1 to match CLTK Word.index_token
                #     )
                # )
            # # TODO: Confirm that cltk_word.parent is ever getting filled out. Only for some lang models?
            # for idx, cltk_word in enumerate(sent_words.values()):
            #     governor_index, parent_index = indices[idx]  # type: int, int
            #     cltk_word.governor = governor_index if governor_index >= 0 else None
            #     if cltk_word.index_token != sent_words[parent_index].index_token:
            #         cltk_word.parent = parent_index

        return words_list
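
At the heart of the features handling above is the one-liner that parses the UD ``feats`` string, which joins ``Name=Value`` pairs with ``|``. In isolation:

# UD encodes morphological features as "Name=Value" pairs joined by "|".
feats = "Case=Nom|Degree=Pos|Gender=Fem|Number=Sing"
features = dict(f.split("=") for f in feats.split("|"))
print(features)
# {'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}
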
Example #5
    def stanza_to_cltk_word_type(stanza_doc):
        """Take an entire ``stanza`` document, extract
        each word, and encode it in the way expected by
        the CLTK's ``Word`` type.

        >>> from cltk.dependency.processes import StanzaProcess
        >>> from cltk.languages.example_texts import get_example_text
        >>> process_stanza = StanzaProcess(language="lat")
        >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
        >>> isinstance(cltk_words, list)
        True
        >>> isinstance(cltk_words[0], Word)
        True
        >>> cltk_words[0]
        Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos=noun, \
lemma='Gallia', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=1, \
features={Case: [nominative], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, stop=None, \
named_entity=None, syllables=None, phonetic_transcription=None, definition=None)

        """

        words_list = list()  # type: List[Word]

        for sentence_index, sentence in enumerate(stanza_doc.sentences):
            sent_words = dict()  # type: Dict[int, Word]
            indices = list()  # type: List[Tuple[int, int]]

            for token_index, token in enumerate(sentence.tokens):
                stanza_word = token.words[0]  # type: stanza.pipeline.doc.Word
                # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
                pos: Optional[MorphosyntacticFeature] = from_ud(
                    "POS", stanza_word.pos)
                cltk_word = Word(
                    index_token=int(stanza_word.id) -
                    1,  # subtract 1 from id b/c Stanza starts their index at 1
                    index_sentence=sentence_index,
                    string=stanza_word.text,  # same as ``token.text``
                    pos=pos,
                    xpos=stanza_word.xpos,
                    upos=stanza_word.upos,
                    lemma=stanza_word.lemma,
                    dependency_relation=stanza_word.deprel,
                    governor=stanza_word.head - 1 if stanza_word.head else
                    -1,  # note: if val becomes ``-1`` then there is no governor, i.e. the word is root
                )  # type: Word

                # convert UD features to the normalized CLTK features
                raw_features = ([
                    tuple(f.split("=")) for f in stanza_word.feats.split("|")
                ] if stanza_word.feats else [])
                cltk_features = [
                    from_ud(feature_name, feature_value)
                    for feature_name, feature_value in raw_features
                ]
                cltk_word.features = MorphosyntacticFeatureBundle(
                    *cltk_features)
                cltk_word.category = to_categorial(cltk_word.pos)
                cltk_word.stanza_features = stanza_word.feats

                # sent_words[cltk_word.index_token] = cltk_word
                words_list.append(cltk_word)

                # # TODO: Fix this, I forget what we were tracking in this
                # indices.append(
                #     (
                #         int(stanza_word.governor)
                #         - 1,  # -1 to match CLTK Word.index_token
                #         int(stanza_word.parent_token.index)
                #         - 1,  # -1 to match CLTK Word.index_token
                #     )
                # )
            # # TODO: Confirm that cltk_word.parent is ever getting filled out. Only for some lang models?
            # for idx, cltk_word in enumerate(sent_words.values()):
            #     governor_index, parent_index = indices[idx]  # type: int, int
            #     cltk_word.governor = governor_index if governor_index >= 0 else None
            #     if cltk_word.index_token != sent_words[parent_index].index_token:
            #         cltk_word.parent = parent_index

        return words_list
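
The difference from the previous example is the normalization step: raw UD name/value strings become typed CLTK feature objects via ``from_ud`` and are collected into a ``MorphosyntacticFeatureBundle``. A sketch of that step in isolation; the import path below is an assumption, since the snippet does not show its imports:

# The import path here is assumed, not shown in the snippet above.
from cltk.morphology.morphosyntax import MorphosyntacticFeatureBundle, from_ud

raw_feats = "Case=Nom|Gender=Fem|Number=Sing"
pairs = [tuple(f.split("=")) for f in raw_feats.split("|")]
bundle = MorphosyntacticFeatureBundle(*[from_ud(n, v) for n, v in pairs])
print(bundle)
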
Example #6
    def test_main_analyze(self):
        """Testing methods from ``cltk/nlp.py``. Note that we
        change ``first_word.embedding`` into an empty list because
        otherwise we would have to add a long vector into our tests.
        """
        lang = "grc"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="ὅτι",
            pos="ADV",
            lemma="ὅτι",
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=6,
            features={},
            embedding=[],
            stop=True,
            named_entity=False,
        )
        self.assertEqual(first_word, target)

        lang = "chu"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="отьчє",
            pos="NOUN",
            lemma="отьць",
            scansion=None,
            xpos="Nb",
            upos="NOUN",
            dependency_relation="vocative",
            governor=7,
            features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
            embedding=None,
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)

        lang = "fro"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="Une",
            pos="DET",
            lemma=None,
            scansion=None,
            xpos="DETndf",
            upos="DET",
            dependency_relation=None,
            governor=-1,
            features={"Definite": "Ind", "PronType": "Art"},
            embedding=None,
            stop=False,
            named_entity=False,
        )
        self.assertEqual(first_word, target)

        lang = "got"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="swa",
            pos="ADV",
            lemma="swa",
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=1,
            features={},
            embedding=[],
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)
        self.assertEqual(len(cltk_doc.sentences), 3)

        # TODO: Re-enable coptic
        # raises ``KeyError: 'pretrain_path'`` from ``_set_up_model``
        # lang = "cop"
        # cltk_nlp = NLP(language=lang)
        # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        # first_word = cltk_doc.words[0]
        # target = Word(
        #     index_char_start=None,
        #     index_char_stop=None,
        #     index_token=0,
        #     index_sentence=0,
        #     string="ⲧⲏⲛ",
        #     pos="VERB",
        #     lemma="ⲧⲏⲛ",
        #     scansion=None,
        #     xpos="VSTAT",
        #     upos="VERB",
        #     dependency_relation="root",
        #     governor=-1,
        #     features={"VerbForm": "Fin"},
        #     embedding=None,
        #     stop=None,
        #     named_entity=None,
        # )
        # self.assertEqual(first_word, target)

        lang = "lzh"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="黃",
            pos="NOUN",
            lemma="黃",
            scansion=None,
            xpos="n,名詞,描写,形質",
            upos="NOUN",
            dependency_relation="nmod",
            governor=1,
            features={},
            embedding=None,
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)
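
Across these targets, ``governor`` is a token index within the sentence, with ``-1`` marking the root, per the ``stanza_to_cltk_word_type`` conversion shown earlier. A small sketch of reading such annotations back into dependency edges, using a stand-in tuple rather than CLTK's ``Word``; the toy parse is illustrative, not gold-standard:

from collections import namedtuple

# Stand-in with only the fields this sketch needs; not CLTK's Word.
W = namedtuple("W", "string dependency_relation governor")

def dependency_edges(words):
    """Return (dependent, relation, head) triples; governor == -1 is root."""
    return [
        (w.string, w.dependency_relation,
         words[w.governor].string if w.governor >= 0 else "ROOT")
        for w in words
    ]

sent = [W("Gallia", "nsubj", 3), W("est", "cop", 3),
        W("omnis", "amod", 0), W("divisa", "root", -1)]
print(dependency_edges(sent))
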
Example #7
    def test_embeddings_processes(self):

        language = "ang"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = OldEnglishEmbeddingsProcess(
        )  # type: OldEnglishEmbeddingsProcess
        a_doc = a_process.run(input_doc=Doc(raw=get_example_text(language),
                                            words=word_objs))  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)

        language = "arc"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = AramaicEmbeddingsProcess(
        )  # type: AramaicEmbeddingsProcess
        a_doc = a_process.run(input_doc=Doc(raw=get_example_text(language),
                                            words=word_objs))  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)

        language = "got"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = GothicEmbeddingsProcess()  # type: GothicEmbeddingsProcess
        a_doc = a_process.run(input_doc=Doc(raw=get_example_text(language),
                                            words=word_objs))  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)

        language = "grc"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = GreekEmbeddingsProcess()  # type: GreekEmbeddingsProcess
        a_doc = a_process.run(input_doc=Doc(raw=get_example_text(language),
                                            words=word_objs))  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)

        language = "lat"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = LatinEmbeddingsProcess()  # type: LatinEmbeddingsProcess
        a_doc = a_process.run(input_doc=Doc(raw=get_example_text(language),
                                            words=word_objs))  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)

        language = "pli"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = PaliEmbeddingsProcess()  # type: PaliEmbeddingsProcess
        a_doc = a_process.run(
            input_doc=Doc(raw=get_example_text(language), words=word_objs))
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)

        language = "san"  # type: str
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = SanskritEmbeddingsProcess(
        )  # type: SanskritEmbeddingsProcess
        a_doc = a_process.run(input_doc=Doc(raw=get_example_text(language),
                                            words=word_objs))  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)
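
Note the API difference from Example #3: here each process is constructed without a document and the ``Doc`` is passed to ``run``, which returns the annotated copy. Either way, a stricter check would cover every token rather than only ``words[1]``; a one-loop sketch inside the same test method:

# Stricter variant: every word should have received an embedding.
for word in a_doc.words:
    self.assertIsInstance(word.embedding, numpy.ndarray)
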
Example #8
    def test_main_analyze(self):
        """Testing methods from ``cltk/nlp.py``. Note that we
        change ``first_word.embedding`` into an empty list because
        otherwise we would have to add a long vector into our tests.
        """

        lang = "chu"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="отьчє",
            pos="noun",
            lemma="отьць",
            stem=None,
            scansion=None,
            xpos="Nb",
            upos="NOUN",
            dependency_relation="vocative",
            governor=7,
            features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
            embedding=None,
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        # Re-enable later. Raises error upon run, at least on build server
        # Should probably be reported back to Stanza
        # https://travis-ci.org/github/cltk/cltk/jobs/721808293#L636
        # lang = "cop"
        # cltk_nlp = NLP(language=lang)
        # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        # first_word = cltk_doc.words[0]
        # target = Word(
        #     index_char_start=None,
        #     index_char_stop=None,
        #     index_token=0,
        #     index_sentence=0,
        #     string="ⲧⲏⲛ",
        #     pos="VERB",
        #     lemma="ⲧⲏⲛ",
        #     stem=None,
        #     scansion=None,
        #     xpos="VSTAT",
        #     upos="VERB",
        #     dependency_relation="root",
        #     governor=-1,
        #     features={"VerbForm": "Fin"},
        #     embedding=None,
        #     stop=None,
        #     named_entity=None,
        # )
        # self.assertEqual(first_word, target)

        lang = "fro"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="Une",
            pos="DET",
            lemma="Une",
            stem=None,
            scansion=None,
            xpos="DETndf",
            upos="DET",
            dependency_relation=None,
            governor=-1,
            features={"Definite": "Ind", "PronType": "Art"},
            embedding=None,
            stop=False,
            named_entity=False,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        lang = "got"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="swa",
            pos="ADV",
            lemma="swa",
            stem=None,
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=1,
            features={},
            embedding=[],
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)
        self.assertEqual(len(cltk_doc.sentences), 3)

        lang = "grc"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()  # clear out the array, for easier checking
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="ὅτι",
            pos="ADV",
            lemma="ὅτι",
            stem=None,
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=6,
            features={},
            embedding=[],
            stop=False,
            named_entity=False,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        lang = "lzh"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="黃",
            pos="NOUN",
            lemma="黃",
            stem=None,
            scansion=None,
            xpos="n,名詞,描写,形質",
            upos="NOUN",
            dependency_relation="nmod",
            governor=1,
            features={},
            embedding=None,
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)
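
The ``_word_assertions`` helper is not shown in this snippet. A hypothetical reconstruction of what it presumably does, comparing fields one by one while skipping the large embedding array (an assumption, not the CLTK test suite's actual code):

# Hypothetical reconstruction of the _word_assertions helper used above.
def _word_assertions(self, word, target):
    for field, expected in vars(target).items():
        if field == "embedding":
            continue  # large numpy array; checked separately in the tests
        self.assertEqual(getattr(word, field), expected)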