def test_nlp_latin(self):
    t0 = time.time()
    print("Starting complete `NLP()` test for 'lat' ...")
    lang = "lat"  # type: str
    self.assertIsInstance(LatinPipeline.description, str)
    self.assertIsInstance(LatinPipeline.language, Language)
    self.assertIsInstance(LatinPipeline.language.family_id, str)
    self.assertIsInstance(LatinPipeline.language.glottolog_id, str)
    self.assertIsInstance(LatinPipeline.language.iso_639_3_code, str)
    self.assertIsInstance(LatinPipeline.language.latitude, float)
    self.assertIsInstance(LatinPipeline.language.level, str)
    self.assertIsInstance(LatinPipeline.language.longitude, float)
    self.assertIsInstance(LatinPipeline.language.parent_id, str)
    self.assertIsInstance(LatinPipeline.language.type, str)
    text = get_example_text(iso_code=lang)
    self.assertIsInstance(text, str)
    cltk_nlp = NLP(language=lang)  # type: NLP
    self.assertIsInstance(cltk_nlp, NLP)
    cltk_doc = cltk_nlp.analyze(text=text)
    self.assertIsInstance(cltk_doc, Doc)
    self.assertIsInstance(cltk_doc.raw, str)
    self.assertEqual(cltk_doc.language, lang)
    self.assertIsInstance(cltk_doc.stanza_doc, Document)
    self.assertTrue(len(cltk_doc.words) > 0)
    all_words_pres = all(isinstance(word, Word) for word in cltk_doc.words)
    self.assertTrue(all_words_pres)
    word = cltk_doc.words[0]
    self.assertIsInstance(word.category, MorphosyntacticFeatureBundle)
    self.assertIsInstance(word.dependency_relation, str)
    self.assertIsInstance(word.embedding, np.ndarray)
    self.assertIsInstance(word.governor, int)
    self.assertIsInstance(word.index_token, int)
    self.assertIsInstance(word.lemma, str)
    self.assertIsInstance(word.named_entity, str)
    self.assertIsInstance(word.pos, POS)
    self.assertIsInstance(word.stanza_features, str)
    self.assertIsInstance(word.stop, bool)
    self.assertIsInstance(word.string, str)
    self.assertIsInstance(word.upos, str)
    self.assertIsInstance(word.xpos, str)
    print(f"Finished complete test of `NLP()` in {time.time() - t0} secs.")
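
# A minimal, self-contained sketch of the pipeline the test above exercises,
# runnable outside the test harness. The import paths below assume CLTK v1's
# public API; adjust them if your installed version differs.
from cltk import NLP
from cltk.languages.example_texts import get_example_text

nlp = NLP(language="lat")
doc = nlp.analyze(text=get_example_text("lat"))
for word in doc.words[:5]:
    print(word.string, word.lemma, word.upos)  # surface form, lemma, universal POS
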
def test_dependency_tree(self):
    cltk_nlp = NLP(language="lat")
    doc = cltk_nlp.analyze(text=get_example_text("lat"))
    one_word = doc.words[0]
    one_word.embedding = list()
    f = Form.to_form(word=one_word)
    form_str = f.full_str()
    target = "Gallia_0 [lemma=mallis,pos=noun,upos=NOUN,xpos=A1|grn1|casA|gen2,Case=nominative,Degree=positive,Gender=feminine,Number=singular]"
    self.assertEqual(form_str, target)
    t = DependencyTree.to_tree(doc.sentences[0])
    self.assertEqual(len(t.get_dependencies()), 28)
    t = DependencyTree.to_tree(doc.words[:25])
    self.assertIsInstance(t.findall("."), list)
    self.assertIsInstance(t.findall(".")[0], Form)
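
# A sketch reusing the objects from the test above: ``to_tree`` builds the
# parse tree, ``findall(".")`` returns its root ``Form``, and ``full_str()``
# renders a form in the same bracketed format as ``target``.
tree = DependencyTree.to_tree(doc.sentences[0])
root = tree.findall(".")[0]          # the root Form of the sentence's parse
print(root.full_str())               # bracketed form representation, as in ``target``
print(len(tree.get_dependencies()))  # how many dependency relations were found
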
def test_main_analyze(self):
    """Testing methods from ``cltk/nlp.py``.

    Note that we change ``first_word.embedding`` into an empty list
    because otherwise we would have to add a long vector into our tests.
    """
    lang = "grc"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    self.assertIsInstance(first_word.embedding, numpy.ndarray)
    first_word.embedding = list()
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="ὅτι",
        pos="ADV",
        lemma="ὅτι",
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=6,
        features={},
        embedding=[],
        stop=True,
        named_entity=False,
    )
    self.assertEqual(first_word, target)

    lang = "chu"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="отьчє",
        pos="NOUN",
        lemma="отьць",
        scansion=None,
        xpos="Nb",
        upos="NOUN",
        dependency_relation="vocative",
        governor=7,
        features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
        embedding=None,
        stop=None,
        named_entity=None,
    )
    self.assertEqual(first_word, target)

    lang = "fro"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="Une",
        pos="DET",
        lemma=None,
        scansion=None,
        xpos="DETndf",
        upos="DET",
        dependency_relation=None,
        governor=-1,
        features={"Definite": "Ind", "PronType": "Art"},
        embedding=None,
        stop=False,
        named_entity=False,
    )
    self.assertEqual(first_word, target)

    lang = "got"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    self.assertIsInstance(first_word.embedding, numpy.ndarray)
    first_word.embedding = list()
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="swa",
        pos="ADV",
        lemma="swa",
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=1,
        features={},
        embedding=[],
        stop=None,
        named_entity=None,
    )
    self.assertEqual(first_word, target)
    self.assertEqual(len(cltk_doc.sentences), 3)

    # TODO: Re-enable Coptic.
    # Raises ``KeyError: 'pretrain_path'`` from ``_set_up_model``.
    # lang = "cop"
    # cltk_nlp = NLP(language=lang)
    # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    # first_word = cltk_doc.words[0]
    # target = Word(
    #     index_char_start=None,
    #     index_char_stop=None,
    #     index_token=0,
    #     index_sentence=0,
    #     string="ⲧⲏⲛ",
    #     pos="VERB",
    #     lemma="ⲧⲏⲛ",
    #     scansion=None,
    #     xpos="VSTAT",
    #     upos="VERB",
    #     dependency_relation="root",
    #     governor=-1,
    #     features={"VerbForm": "Fin"},
    #     embedding=None,
    #     stop=None,
    #     named_entity=None,
    # )
    # self.assertEqual(first_word, target)

    lang = "lzh"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="黃",
        pos="NOUN",
        lemma="黃",
        scansion=None,
        xpos="n,名詞,描写,形質",
        upos="NOUN",
        dependency_relation="nmod",
        governor=1,
        features={},
        embedding=None,
        stop=None,
        named_entity=None,
    )
    self.assertEqual(first_word, target)
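
# The ``target`` objects above enumerate every field ``analyze`` fills in per
# word. A compact sketch, using the same API as the test, to eyeball those
# fields for any supported language:
cltk_doc = NLP(language="fro").analyze(text=get_example_text("fro"))
for w in cltk_doc.words[:5]:
    print(w.string, w.upos, w.features, w.governor, w.dependency_relation)
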
beta_text = []
for line in bs_content.findAll("p"):
    beta_text.append(line.text)
# for line in beta_text[:3]:
#     print(line)

text = []
for line in beta_text:
    text.append(beta_code.beta_code_to_greek(line))
# for line in text[:2]:
#     print(line)

one_string_text = "".join(text)
# print(one_string_text[:2000])

cltk_nlp_grc = NLP(language="grc")
cltk_doc_grc = cltk_nlp_grc.analyze(text=one_string_text)
# print(cltk_doc_grc.lemmata[:254])
# print(cltk_doc_grc.pos[:254])
# for i in range(2000):
#     print(cltk_doc_grc.lemmata[i], cltk_doc_grc.pos[i])
print("==== the end of tokenization ====")


# GEONAMES
def search_geonames(
    name,
    style="medium",
    exact=False,
    max_rows=10,
    fuzz=1,
    feature_classes=None,
    feature_codes=None,
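
# A one-line sanity check of the conversion used above, based on the
# ``beta_code`` package's ``beta_code_to_greek`` function; the sample Beta
# Code string (the opening of the Iliad) is illustrative.
import beta_code

print(beta_code.beta_code_to_greek("mh=nin a)/eide qea/"))  # expect: μῆνιν ἄειδε θεά
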
class Preprocesser:
    """Wrapper class for all morphology preprocessing.

    Attributes:
        language: language of corpus being processed
    """

    def __init__(self, language):
        self.language = language
        self.spacy_analyzers = {
            "shake": "en_core_web_sm",
            "ger": "de_core_news_sm",
            "ita": "it_core_news_sm",
            "span": "es_core_news_sm",
        }
        self.cltk_analyzers = {"rom": "lat", "greek": "grc"}
        if language == "rus":
            self.analyzer = MorphAnalyzer()
        elif language in self.spacy_analyzers:
            self.analyzer = spacy.load(self.spacy_analyzers[language])
        elif language in self.cltk_analyzers:
            self.analyzer = NLP(language=self.cltk_analyzers[language])

    def lemmatize(self, line):
        self.lemmas = []
        play_lemmas = []
        if self.language == "rus":
            play_lemmas = [
                self.analyzer.parse(token)[0].normal_form
                for token in simple_word_tokenize(line)
            ]
        elif self.language in self.spacy_analyzers:
            play_lemmas = [token.lemma_ for token in self.analyzer(line)]
        elif self.language in self.cltk_analyzers:
            print(self.language)
            play_lemmas = self.analyzer.analyze(text=line).lemmata
        self.lemmas += play_lemmas
        return " ".join(play_lemmas)

    def pos(self, line):
        self.pos_dict = {}
        # parsing
        if self.language == "rus":
            play_pos = [
                self.analyzer.parse(token)[0].tag.POS
                for token in simple_word_tokenize(line)
            ]
        elif self.language in self.spacy_analyzers:
            play_pos = [token.pos_ for token in self.analyzer(line)]
        elif self.language in self.cltk_analyzers:
            print(self.language)
            play_pos = self.analyzer.analyze(text=line).pos
        return play_pos

    def count_items(self, play_items):
        item_dict = {}
        tokens = play_items.split()
        for item in tokens:
            if item not in item_dict:
                item_dict[item] = 1
            else:
                item_dict[item] += 1
        # percentages/shares instead of absolute values; divide by the token
        # count, not the character count of the string
        for item in item_dict:
            item_dict[item] = item_dict[item] / len(tokens)
        return item_dict
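
# A sketch of driving the class above. The "rom" key routes through CLTK's
# Latin pipeline per the ``cltk_analyzers`` mapping in ``__init__``; the
# sample line is illustrative, and the CLTK models download on first use.
prep = Preprocesser(language="rom")
line = "Gallia est omnis divisa in partes tres"
lemmas = prep.lemmatize(line)      # space-joined lemmata
pos_tags = prep.pos(line)          # one POS tag per token
shares = prep.count_items(lemmas)  # relative frequency of each lemma
print(lemmas, pos_tags, shares, sep="\n")
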
def test_main_analyze(self):
    """Testing methods from ``cltk/nlp.py``.

    Note that we change ``first_word.embedding`` into an empty list
    because otherwise we would have to add a long vector into our tests.
    """
    lang = "chu"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="отьчє",
        pos="noun",
        lemma="отьць",
        stem=None,
        scansion=None,
        xpos="Nb",
        upos="NOUN",
        dependency_relation="vocative",
        governor=7,
        features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
        embedding=None,
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)

    # Re-enable later. Raises an error upon run, at least on the build
    # server; should probably be reported back to Stanza.
    # https://travis-ci.org/github/cltk/cltk/jobs/721808293#L636
    # lang = "cop"
    # cltk_nlp = NLP(language=lang)
    # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    # first_word = cltk_doc.words[0]
    # target = Word(
    #     index_char_start=None,
    #     index_char_stop=None,
    #     index_token=0,
    #     index_sentence=0,
    #     string="ⲧⲏⲛ",
    #     pos="VERB",
    #     lemma="ⲧⲏⲛ",
    #     stem=None,
    #     scansion=None,
    #     xpos="VSTAT",
    #     upos="VERB",
    #     dependency_relation="root",
    #     governor=-1,
    #     features={"VerbForm": "Fin"},
    #     embedding=None,
    #     stop=None,
    #     named_entity=None,
    # )
    # self.assertEqual(first_word, target)

    lang = "fro"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="Une",
        pos="DET",
        lemma="Une",
        stem=None,
        scansion=None,
        xpos="DETndf",
        upos="DET",
        dependency_relation=None,
        governor=-1,
        features={"Definite": "Ind", "PronType": "Art"},
        embedding=None,
        stop=False,
        named_entity=False,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)

    lang = "got"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    self.assertIsInstance(first_word.embedding, numpy.ndarray)
    first_word.embedding = list()
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="swa",
        pos="ADV",
        lemma="swa",
        stem=None,
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=1,
        features={},
        embedding=[],
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)
    self.assertEqual(len(cltk_doc.sentences), 3)

    lang = "grc"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    self.assertIsInstance(first_word.embedding, numpy.ndarray)
    first_word.embedding = list()  # clear out the array, for easier checking
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="ὅτι",
        pos="ADV",
        lemma="ὅτι",
        stem=None,
        scansion=None,
        xpos="Df",
        upos="ADV",
        dependency_relation="advmod",
        governor=6,
        features={},
        embedding=[],
        stop=False,
        named_entity=False,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)

    lang = "lzh"
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
    first_word = cltk_doc.words[0]
    target = Word(
        index_char_start=None,
        index_char_stop=None,
        index_token=0,
        index_sentence=0,
        string="黃",
        pos="NOUN",
        lemma="黃",
        stem=None,
        scansion=None,
        xpos="n,名詞,描写,形質",
        upos="NOUN",
        dependency_relation="nmod",
        governor=1,
        features={},
        embedding=None,
        stop=None,
        named_entity=None,
        syllables=None,
        phonetic_transcription=None,
    )
    self._word_assertions(first_word, target)