def run(self, input_doc: Doc) -> Doc:
    """Tokenize ``input_doc.raw`` with this process's algorithm.

    Returns a deep copy of ``input_doc`` whose ``words`` list is rebuilt
    from scratch: one ``Word`` per token, carrying the token index and the
    character offsets of the token within the raw text.
    """
    doc = deepcopy(input_doc)
    doc.words = []
    algo = self.algorithm
    raw_tokens = algo.tokenize(doc.raw)
    char_starts = algo.compute_indices(doc.raw, raw_tokens)
    for position, (token_str, char_start) in enumerate(zip(raw_tokens, char_starts)):
        doc.words.append(
            Word(
                string=token_str,
                index_token=position,
                index_char_start=char_start,
                index_char_stop=char_start + len(token_str),
            )
        )
    return doc
def test_nlp_latin_stops(self):
    """The Latin pipeline must contain exactly one ``StopsProcess``,
    and running it must attach a boolean ``stop`` flag to every word.
    """
    language = "lat"  # type: str
    nlp_obj = NLP(language=language)  # type: NLP
    self.assertIsInstance(nlp_obj, NLP)
    latin_pipeline = nlp_obj.pipeline  # type: Pipeline
    stops_processes = [
        process
        for process in latin_pipeline.processes
        if process.__name__ == "StopsProcess"
    ]  # type: List[Process]
    self.assertEqual(len(stops_processes), 1)
    stops_process_class = stops_processes[0]  # type: StopsProcess
    self.assertIs(stops_process_class, StopsProcess)
    tokens = split_punct_ws(get_example_text(language))
    word_objs = [Word(string=token) for token in tokens]
    input_doc = Doc(words=word_objs)
    stops_process = stops_process_class(language=language)
    output_doc = stops_process.run(input_doc=input_doc)
    stop_flags = [word.stop for word in output_doc.words]  # type: List[bool]
    self.assertEqual(len(word_objs), len(stop_flags))
    self.assertIsInstance(stop_flags[0], bool)
def test_embeddings_processes(self):
    """Run each language-specific embeddings ``Process`` over its example
    text and assert that an embedding vector is attached to the words.

    Fixes two issues in the original: (1) the seven near-identical
    copy-pasted stanzas are collapsed into a data-driven loop; (2) the
    bare ``isinstance(...)`` calls — whose return values were discarded
    and therefore asserted nothing — are replaced with
    ``self.assertIsInstance``.
    """
    processes_by_language = [
        ("arc", AramaicEmbeddingsProcess),
        ("got", GothicEmbeddingsProcess),
        ("grc", GreekEmbeddingsProcess),
        ("lat", LatinEmbeddingsProcess),
        ("ang", OldEnglishEmbeddingsProcess),
        ("pli", PaliEmbeddingsProcess),
        ("san", SanskritEmbeddingsProcess),
    ]
    for language, process_class in processes_by_language:
        example_text = get_example_text(language)
        tokens = [Word(string=token) for token in example_text.split(" ")]
        a_process = process_class(
            input_doc=Doc(raw=example_text, words=tokens)
        )
        a_process.run()
        self.assertIsInstance(
            a_process.output_doc.words[1].embedding, numpy.ndarray
        )
def stanza_to_cltk_word_type(stanza_doc):
    """Take an entire ``stanza`` document, extract each word, and encode it in the way expected by the CLTK's ``Word`` type.

    >>> from cltk.dependency.processes import StanzaProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> process_stanza = StanzaProcess(language="lat")
    >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
    >>> isinstance(cltk_words, list)
    True
    >>> isinstance(cltk_words[0], Word)
    True
    >>> cltk_words[0]
    Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos='NOUN', \
lemma='mallis', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=3, \
features={'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, embedding=None, stop=None, named_entity=None)
    """
    collected_words = list()  # type: List[Word]
    for sentence_index, sentence in enumerate(stanza_doc.sentences):
        # ``sent_words`` and ``indices`` feed only the commented-out TODO logic below.
        sent_words = dict()  # type: Dict[int, Word]
        indices = list()  # type: List[Tuple[int, int]]
        for token_index, token in enumerate(sentence.tokens):
            stanza_word = token.words[0]  # type: stanza.pipeline.doc.Word
            # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
            # Parse "Key=Val|Key=Val" UD feature strings into a dict.
            if stanza_word.feats:
                feature_map = dict(feat.split("=") for feat in stanza_word.feats.split("|"))
            else:
                feature_map = dict()
            # ``-1`` means no governor, i.e. the word is root;
            # ``fro`` gives None sometimes, what does this mean?
            if stanza_word.head:
                governor_index = stanza_word.head - 1
            else:
                governor_index = -1
            cltk_word = Word(
                index_token=int(stanza_word.id) - 1,  # subtract 1 b/c stanza starts its index at 1
                index_sentence=sentence_index,
                string=stanza_word.text,  # same as ``token.text``
                pos=stanza_word.pos,
                xpos=stanza_word.xpos,
                upos=stanza_word.upos,
                lemma=stanza_word.lemma,
                dependency_relation=stanza_word.deprel,
                governor=governor_index,
                features=feature_map,
            )  # type: Word
            # sent_words[cltk_word.index_token] = cltk_word
            collected_words.append(cltk_word)
            # # TODO: Fix this, I forget what we were tracking in this
            # indices.append(
            #     (
            #         int(stanza_word.governor)
            #         - 1,  # -1 to match CLTK Word.index_token
            #         int(stanza_word.parent_token.index)
            #         - 1,  # -1 to match CLTK Word.index_token
            #     )
            # )
        # # TODO: Confirm that cltk_word.parent is ever getting filled out. Only for some lang models?
        # for idx, cltk_word in enumerate(sent_words.values()):
        #     governor_index, parent_index = indices[idx]  # type: int, int
        #     cltk_word.governor = governor_index if governor_index >= 0 else None
        #     if cltk_word.index_token != sent_words[parent_index].index_token:
        #         cltk_word.parent = parent_index
    return collected_words
def stanza_to_cltk_word_type(stanza_doc):
    """Take an entire ``stanza`` document, extract each word, and encode it in the way expected by the CLTK's ``Word`` type.

    >>> from cltk.dependency.processes import StanzaProcess
    >>> from cltk.languages.example_texts import get_example_text
    >>> process_stanza = StanzaProcess(language="lat")
    >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
    >>> isinstance(cltk_words, list)
    True
    >>> isinstance(cltk_words[0], Word)
    True
    >>> cltk_words[0]
    Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos=noun, lemma='Gallia', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=1, features={Case: [nominative], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, stop=None, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
    """
    collected_words = list()  # type: List[Word]
    for sentence_index, sentence in enumerate(stanza_doc.sentences):
        # ``sent_words`` and ``indices`` feed only the commented-out TODO logic below.
        sent_words = dict()  # type: Dict[int, Word]
        indices = list()  # type: List[Tuple[int, int]]
        for token_index, token in enumerate(sentence.tokens):
            stanza_word = token.words[0]  # type: stanza.pipeline.doc.Word
            # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
            pos_feature = from_ud("POS", stanza_word.pos)  # type: Optional[MorphosyntacticFeature]
            # ``-1`` means no governor, i.e. the word is root.
            if stanza_word.head:
                governor_index = stanza_word.head - 1
            else:
                governor_index = -1
            cltk_word = Word(
                index_token=int(stanza_word.id) - 1,  # subtract 1 b/c Stanza starts its index at 1
                index_sentence=sentence_index,
                string=stanza_word.text,  # same as ``token.text``
                pos=pos_feature,
                xpos=stanza_word.xpos,
                upos=stanza_word.upos,
                lemma=stanza_word.lemma,
                dependency_relation=stanza_word.deprel,
                governor=governor_index,
            )  # type: Word
            # Convert UD features to the normalized CLTK features.
            if stanza_word.feats:
                raw_features = [
                    tuple(feat.split("=")) for feat in stanza_word.feats.split("|")
                ]
            else:
                raw_features = []
            cltk_features = [
                from_ud(feature_name, feature_value)
                for feature_name, feature_value in raw_features
            ]
            cltk_word.features = MorphosyntacticFeatureBundle(*cltk_features)
            cltk_word.category = to_categorial(cltk_word.pos)
            cltk_word.stanza_features = stanza_word.feats
            # sent_words[cltk_word.index_token] = cltk_word
            collected_words.append(cltk_word)
            # # TODO: Fix this, I forget what we were tracking in this
            # indices.append(
            #     (
            #         int(stanza_word.governor)
            #         - 1,  # -1 to match CLTK Word.index_token
            #         int(stanza_word.parent_token.index)
            #         - 1,  # -1 to match CLTK Word.index_token
            #     )
            # )
        # # TODO: Confirm that cltk_word.parent is ever getting filled out. Only for some lang models?
        # for idx, cltk_word in enumerate(sent_words.values()):
        #     governor_index, parent_index = indices[idx]  # type: int, int
        #     cltk_word.governor = governor_index if governor_index >= 0 else None
        #     if cltk_word.index_token != sent_words[parent_index].index_token:
        #         cltk_word.parent = parent_index
    return collected_words
def test_main_analyze(self):
    """Testing methods from ``cltk/nlp.py``. Note that we
    change ``first_word.embedding`` into an empty list because
    otherwise we would have to add a long vector into our tests.

    The per-language run/analyze boilerplate, identical in every stanza
    of the original, is factored into a nested helper; the expected
    ``Word`` targets are unchanged.
    """

    def analyze_first_word(lang, check_embedding=False):
        # Run the full pipeline for ``lang`` and return (doc, first word).
        # When ``check_embedding`` is set, assert the embedding is a numpy
        # array, then blank it so equality against a literal target works.
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        if check_embedding:
            self.assertIsInstance(first_word.embedding, numpy.ndarray)
            first_word.embedding = list()
        return cltk_doc, first_word

    _, first_word = analyze_first_word("grc", check_embedding=True)
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="ὅτι",
        pos="ADV",
        lemma="ὅτι",
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=6,
        features={},
        embedding=[],
        stop=True,
        named_entity=False,
    )
    self.assertEqual(first_word, target)

    _, first_word = analyze_first_word("chu")
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="отьчє",
        pos="NOUN",
        lemma="отьць",
        scansion=None,
        xpos="Nb",
        upos="NOUN",
        dependency_relation="vocative",
        governor=7,
        features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
        embedding=None,
        stop=None,
        named_entity=None,
    )
    self.assertEqual(first_word, target)

    _, first_word = analyze_first_word("fro")
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="Une",
        pos="DET",
        lemma=None,
        scansion=None,
        xpos="DETndf",
        upos="DET",
        dependency_relation=None,
        governor=-1,
        features={"Definite": "Ind", "PronType": "Art"},
        embedding=None,
        stop=False,
        named_entity=False,
    )
    self.assertEqual(first_word, target)

    cltk_doc, first_word = analyze_first_word("got", check_embedding=True)
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="swa",
        pos="ADV",
        lemma="swa",
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=1,
        features={},
        embedding=[],
        stop=None,
        named_entity=None,
    )
    self.assertEqual(first_word, target)
    self.assertEqual(len(cltk_doc.sentences), 3)

    # TODO: Re-enable coptic
    # raises ``KeyError: 'pretrain_path'`` from ``_set_up_model``
    # lang = "cop"
    # cltk_nlp = NLP(language=lang)
    # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    # first_word = cltk_doc.words[0]
    # target = Word(
    #     index_char_start=None,
    #     index_char_stop=None,
    #     index_token=0,
    #     index_sentence=0,
    #     string="ⲧⲏⲛ",
    #     pos="VERB",
    #     lemma="ⲧⲏⲛ",
    #     scansion=None,
    #     xpos="VSTAT",
    #     upos="VERB",
    #     dependency_relation="root",
    #     governor=-1,
    #     features={"VerbForm": "Fin"},
    #     embedding=None,
    #     stop=None,
    #     named_entity=None,
    # )
    # self.assertEqual(first_word, target)

    _, first_word = analyze_first_word("lzh")
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="黃",
        pos="NOUN",
        lemma="黃",
        scansion=None,
        xpos="n,名詞,描写,形質",
        upos="NOUN",
        dependency_relation="nmod",
        governor=1,
        features={},
        embedding=None,
        stop=None,
        named_entity=None,
    )
    self.assertEqual(first_word, target)
def test_embeddings_processes(self):
    """Run each language-specific embeddings ``Process`` over its example
    text and assert that an embedding vector is attached to the words.

    Fixes two issues in the original: (1) the seven near-identical
    copy-pasted stanzas are collapsed into a data-driven loop; (2) the
    bare ``isinstance(...)`` calls — whose return values were discarded
    and therefore asserted nothing — are replaced with
    ``self.assertIsInstance``.
    """
    processes_by_language = [
        ("ang", OldEnglishEmbeddingsProcess),
        ("arc", AramaicEmbeddingsProcess),
        ("got", GothicEmbeddingsProcess),
        ("grc", GreekEmbeddingsProcess),
        ("lat", LatinEmbeddingsProcess),
        ("pli", PaliEmbeddingsProcess),
        ("san", SanskritEmbeddingsProcess),
    ]
    for language, process_class in processes_by_language:
        example_text = get_example_text(language)  # type: str
        word_objs = [
            Word(string=word_obj) for word_obj in example_text.split(" ")
        ]  # type: List[Word]
        a_process = process_class()
        a_doc = a_process.run(
            input_doc=Doc(raw=example_text, words=word_objs)
        )  # type: Doc
        self.assertIsInstance(a_doc.words[1].embedding, numpy.ndarray)
def test_main_analyze(self):
    """Testing methods from ``cltk/nlp.py``. Note that we
    change ``first_word.embedding`` into an empty list because
    otherwise we would have to add a long vector into our tests.

    The per-language run/analyze boilerplate, identical in every stanza
    of the original, is factored into a nested helper; the expected
    ``Word`` targets are unchanged.
    """

    def analyze_first_word(lang, check_embedding=False):
        # Run the full pipeline for ``lang`` and return (doc, first word).
        # When ``check_embedding`` is set, assert the embedding is a numpy
        # array, then blank it so comparison against a literal target works.
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        if check_embedding:
            self.assertIsInstance(first_word.embedding, numpy.ndarray)
            first_word.embedding = list()
        return cltk_doc, first_word

    _, first_word = analyze_first_word("chu")
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="отьчє",
        pos="noun",
        lemma="отьць",
        stem=None,
        scansion=None,
        xpos="Nb",
        upos="NOUN",
        dependency_relation="vocative",
        governor=7,
        features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
        embedding=None,
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)

    # Re-enable later. Raises error upon run, at least on build server
    # Should probably be reported back to Stanza
    # https://travis-ci.org/github/cltk/cltk/jobs/721808293#L636
    # lang = "cop"
    # cltk_nlp = NLP(language=lang)
    # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    # first_word = cltk_doc.words[0]
    # target = Word(
    #     index_char_start=None,
    #     index_char_stop=None,
    #     index_token=0,
    #     index_sentence=0,
    #     string="ⲧⲏⲛ",
    #     pos="VERB",
    #     lemma="ⲧⲏⲛ",
    #     stem=None,
    #     scansion=None,
    #     xpos="VSTAT",
    #     upos="VERB",
    #     dependency_relation="root",
    #     governor=-1,
    #     features={"VerbForm": "Fin"},
    #     embedding=None,
    #     stop=None,
    #     named_entity=None,
    # )
    # self.assertEqual(first_word, target)

    _, first_word = analyze_first_word("fro")
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="Une",
        pos="DET",
        lemma="Une",
        stem=None,
        scansion=None,
        xpos="DETndf",
        upos="DET",
        dependency_relation=None,
        governor=-1,
        features={"Definite": "Ind", "PronType": "Art"},
        embedding=None,
        stop=False,
        named_entity=False,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)

    cltk_doc, first_word = analyze_first_word("got", check_embedding=True)
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="swa",
        pos="ADV",
        lemma="swa",
        stem=None,
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=1,
        features={},
        embedding=[],
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)
    self.assertEqual(len(cltk_doc.sentences), 3)

    _, first_word = analyze_first_word("grc", check_embedding=True)
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="ὅτι",
        pos="ADV",
        lemma="ὅτι",
        stem=None,
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=6,
        features={},
        embedding=[],
        stop=False,
        named_entity=False,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)

    _, first_word = analyze_first_word("lzh")
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="黃",
        pos="NOUN",
        lemma="黃",
        stem=None,
        scansion=None,
        xpos="n,名詞,描写,形質",
        upos="NOUN",
        dependency_relation="nmod",
        governor=1,
        features={},
        embedding=None,
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)