Example #1
    def test_nlp_latin(self):
        t0 = time.time()
        print("Starting complete `NLP()` test for 'lat' ...")

        lang = "lat"  # type: str

        self.assertIsInstance(LatinPipeline.description, str)
        self.assertIsInstance(LatinPipeline.language, Language)
        self.assertIsInstance(LatinPipeline.language.family_id, str)
        self.assertIsInstance(LatinPipeline.language.glottolog_id, str)
        self.assertIsInstance(LatinPipeline.language.iso_639_3_code, str)
        self.assertIsInstance(LatinPipeline.language.latitude, float)
        self.assertIsInstance(LatinPipeline.language.level, str)
        self.assertIsInstance(LatinPipeline.language.longitude, float)
        self.assertIsInstance(LatinPipeline.language.parent_id, str)
        self.assertIsInstance(LatinPipeline.language.type, str)

        text = get_example_text(iso_code=lang)
        self.assertIsInstance(text, str)

        cltk_nlp = NLP(language=lang)  # type: NLP
        self.assertIsInstance(cltk_nlp, NLP)

        cltk_doc = cltk_nlp.analyze(text=text)
        self.assertIsInstance(cltk_doc, Doc)
        self.assertIsInstance(cltk_doc.raw, str)
        self.assertEqual(cltk_doc.language, lang)
        self.assertIsInstance(cltk_doc.stanza_doc, Document)

        self.assertTrue(len(cltk_doc.words) > 0)
        all_words_pres = all(
            [isinstance(word, Word) for word in cltk_doc.words])
        self.assertTrue(all_words_pres)
        word = cltk_doc.words[0]
        self.assertIsInstance(word.category, MorphosyntacticFeatureBundle)
        self.assertIsInstance(word.dependency_relation, str)
        self.assertIsInstance(word.embedding, np.ndarray)
        self.assertIsInstance(word.governor, int)
        self.assertIsInstance(word.index_token, int)
        self.assertIsInstance(word.lemma, str)
        self.assertIsInstance(word.named_entity, str)
        self.assertIsInstance(word.pos, POS)
        self.assertIsInstance(word.stanza_features, str)
        self.assertIsInstance(word.stop, bool)
        self.assertIsInstance(word.string, str)
        self.assertIsInstance(word.upos, str)
        self.assertIsInstance(word.xpos, str)

        print(f"Finished complete test of `NLP()` in {time.time() - TO} secs.")
Example #2
    def __init__(self, language):
        self.language = language
        self.spacy_analyzers = {
            "shake": "en_core_web_sm",
            "ger": "de_core_news_sm",
            "ita": "it_core_news_sm",
            "span": "es_core_news_sm"
        }
        self.cltk_analyzers = {"rom": "lat", "greek": "grc"}
        if language == "rus":
            self.analyzer = MorphAnalyzer()
        elif language in self.spacy_analyzers.keys():
            self.analyzer = spacy.load(self.spacy_analyzers[language])
        elif language in self.cltk_analyzers.keys():
            self.analyzer = NLP(language=self.cltk_analyzers[language])
Example #3
    def test_dependency_tree(self):
        cltk_nlp = NLP(language="lat")
        doc = cltk_nlp.analyze(text=get_example_text("lat"))
        one_word = doc.words[0]
        one_word.embedding = list()
        f = Form.to_form(word=one_word)
        form_str = f.full_str()
        target = "Gallia_0 [lemma=mallis,pos=noun,upos=NOUN,xpos=A1|grn1|casA|gen2,Case=nominative,Degree=positive,Gender=feminine,Number=singular]"
        self.assertEqual(form_str, target)

        t = DependencyTree.to_tree(doc.sentences[0])
        self.assertEqual(len(t.get_dependencies()), 28)

        t = DependencyTree.to_tree(doc.words[:25])
        self.assertIsInstance(t.findall("."), list)
        self.assertIsInstance(t.findall(".")[0], Form)
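
A hypothetical follow-up to the tree built above, walking its dependencies directly; it assumes the same imports as the test (NLP, get_example_text, DependencyTree):

cltk_nlp = NLP(language="lat")
doc = cltk_nlp.analyze(text=get_example_text("lat"))
tree = DependencyTree.to_tree(doc.sentences[0])
for dependency in tree.get_dependencies():
    # each dependency pairs a governing Form with a dependent Form
    print(dependency)
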
Example #4
    def test_nlp_latin_stops(self):
        lang = "lat"  # type: str
        cltk_nlp = NLP(language=lang)  # type: NLP
        self.assertIsInstance(cltk_nlp, NLP)
        lat_pipeline = cltk_nlp.pipeline  # type: Pipeline
        pipeline_just_stops = [
            proc for proc in lat_pipeline.processes
            if proc.__name__ == "StopsProcess"
        ]  # type: List[Process]
        self.assertEqual(len(pipeline_just_stops), 1)
        stops_class = pipeline_just_stops[0]  # type: StopsProcess
        self.assertIs(stops_class, StopsProcess)
        words = [
            Word(string=token)
            for token in split_punct_ws(get_example_text(lang))
        ]
        doc = Doc(words=words)
        stops_obj = stops_class(language=lang)
        output_doc = stops_obj.run(input_doc=doc)
        is_stops = [w.stop for w in output_doc.words]  # type: List[bool]
        self.assertEqual(len(words), len(is_stops))
        self.assertIsInstance(is_stops[0], bool)
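
Once StopsProcess has run, each Word carries a boolean stop flag, so filtering is a one-line comprehension. A small sketch reusing the output_doc produced above:

# Keep only the tokens StopsProcess did not flag as stop words.
content_words = [w.string for w in output_doc.words if not w.stop]
print(content_words[:10])
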
Example #5
    def test_main_analyze(self):
        """Testing methods from ``cltk/nlp.py``. Note that we
        change ``first_word.embedding`` into an empty list because
        otherwise we would have to add a long vector into our tests.
        """
        lang = "grc"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="ὅτι",
            pos="ADV",
            lemma="ὅτι",
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=6,
            features={},
            embedding=[],
            stop=True,
            named_entity=False,
        )
        self.assertEqual(first_word, target)

        lang = "chu"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="отьчє",
            pos="NOUN",
            lemma="отьць",
            scansion=None,
            xpos="Nb",
            upos="NOUN",
            dependency_relation="vocative",
            governor=7,
            features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
            embedding=None,
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)

        lang = "fro"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="Une",
            pos="DET",
            lemma=None,
            scansion=None,
            xpos="DETndf",
            upos="DET",
            dependency_relation=None,
            governor=-1,
            features={"Definite": "Ind", "PronType": "Art"},
            embedding=None,
            stop=False,
            named_entity=False,
        )
        self.assertEqual(first_word, target)

        lang = "got"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="swa",
            pos="ADV",
            lemma="swa",
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=1,
            features={},
            embedding=[],
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)
        self.assertEqual(len(cltk_doc.sentences), 3)

        # TODO: Re-enable coptic
        # raises ``KeyError: 'pretrain_path'`` from ``_set_up_model``
        # lang = "cop"
        # cltk_nlp = NLP(language=lang)
        # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        # first_word = cltk_doc.words[0]
        # target = Word(
        #     index_char_start=None,
        #     index_char_stop=None,
        #     index_token=0,
        #     index_sentence=0,
        #     string="ⲧⲏⲛ",
        #     pos="VERB",
        #     lemma="ⲧⲏⲛ",
        #     scansion=None,
        #     xpos="VSTAT",
        #     upos="VERB",
        #     dependency_relation="root",
        #     governor=-1,
        #     features={"VerbForm": "Fin"},
        #     embedding=None,
        #     stop=None,
        #     named_entity=None,
        # )
        # self.assertEqual(first_word, target)

        lang = "lzh"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="黃",
            pos="NOUN",
            lemma="黃",
            scansion=None,
            xpos="n,名詞,描写,形質",
            upos="NOUN",
            dependency_relation="nmod",
            governor=1,
            features={},
            embedding=None,
            stop=None,
            named_entity=None,
        )
        self.assertEqual(first_word, target)
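
The same two calls repeat for every language exercised above; a compact sketch of that pattern (each pipeline loads its own models):

for lang in ("grc", "chu", "fro", "got", "lzh"):
    nlp = NLP(language=lang)
    doc = nlp.analyze(text=get_example_text(lang))
    print(lang, doc.words[0].string, doc.words[0].upos)
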
Example #6
bs_content = bs(content, "lxml")
beta_text = []
for line in bs_content.findAll("p"):
    beta_text.append(line.text)
# for line in beta_text[:3]:
#     print(line)

text = []
for line in beta_text:
    text.append(beta_code.beta_code_to_greek(line))
# for line in text[:2]:
#     print(line)
one_string_text = ''.join(text)
# print(one_string_text[:2000])

cltk_nlp_grc = NLP(language='grc')
cltk_doc_grc = cltk_nlp_grc.analyze(text=one_string_text)
# print(cltk_doc_grc.lemmata[:254])
# print(cltk_doc_grc.pos[:254])
# for i in range(2000):
#         print(cltk_doc_grc.lemmata[i], cltk_doc_grc.pos[i])
print("==== the end of tokenization ====")


# GEONAMES
def search_geonames(name,
                    style="medium",
                    exact=False,
                    max_rows=10,
                    fuzz=1,
                    feature_classes=None,
Example #7
class Preprocesser:
    """Wrapper class for all morphology preprocessing.

    Attributes:
        language: language of corpus being processed
    """
    def __init__(self, language):
        self.language = language
        self.spacy_analyzers = {
            "shake": "en_core_web_sm",
            "ger": "de_core_news_sm",
            "ita": "it_core_news_sm",
            "span": "es_core_news_sm"
        }
        self.cltk_analyzers = {"rom": "lat", "greek": "grc"}
        if language == "rus":
            self.analyzer = MorphAnalyzer()
        elif language in self.spacy_analyzers.keys():
            self.analyzer = spacy.load(self.spacy_analyzers[language])
        elif language in self.cltk_analyzers.keys():
            self.analyzer = NLP(language=self.cltk_analyzers[language])

    def lemmatize(self, line):
        self.lemmas = []
        play_lemmas = []
        if self.language == "rus":
            play_lemmas = [
                self.analyzer.parse(token)[0].normal_form
                for token in simple_word_tokenize(line)
            ]
        elif self.language in self.spacy_analyzers.keys():
            play_lemmas = [token.lemma_ for token in self.analyzer(line)]
        elif self.language in self.cltk_analyzers.keys():
            print(self.language)
            play_lemmas = self.analyzer.analyze(text=line).lemmata
        self.lemmas += play_lemmas
        return " ".join(play_lemmas)

    def pos(self, line):
        self.pos_dict = {}
        play_pos = []  # initialized so an unknown language returns an empty list
        # parsing
        if self.language == "rus":
            play_pos = [
                self.analyzer.parse(token)[0].tag.POS
                for token in simple_word_tokenize(line)
            ]
        elif self.language in self.spacy_analyzers.keys():
            play_pos = [token.pos_ for token in self.analyzer(line)]
        elif self.language in self.cltk_analyzers.keys():
            print(self.language)
            play_pos = self.analyzer.analyze(text=line).pos
        return play_pos

    def count_items(self, play_items):
        item_dict = {}
        tokens = play_items.split()
        for item in tokens:
            if item not in item_dict:
                item_dict[item] = 1
            else:
                item_dict[item] += 1
        # percentages/shares instead of absolute values
        # (divide by the token count, not the character length of the string)
        for item in item_dict:
            item_dict[item] = item_dict[item] / len(tokens)
        return item_dict
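
Hypothetical usage of the wrapper above; the sample line is arbitrary, and "greek" routes to the CLTK analyzer via the cltk_analyzers mapping:

prep = Preprocesser("greek")
line = "τὸν δ᾽ ἀπαμειβόμενος προσέφη πολύμητις Ὀδυσσεύς"
lemmatized = prep.lemmatize(line)            # space-joined lemmata
pos_tags = prep.pos(line)                    # list of POS tags
lemma_shares = prep.count_items(lemmatized)  # lemma -> share of tokens
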
Example #8
    def test_main_analyze(self):
        """Testing methods from ``cltk/nlp.py``. Note that we
        change ``first_word.embedding`` into an empty list because
        otherwise we would have to add a long vector into our tests.
        """

        lang = "chu"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="отьчє",
            pos="noun",
            lemma="отьць",
            stem=None,
            scansion=None,
            xpos="Nb",
            upos="NOUN",
            dependency_relation="vocative",
            governor=7,
            features={"Case": "Voc", "Gender": "Masc", "Number": "Sing"},
            embedding=None,
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        # Re-enable later. Raises error upon run, at least on build server
        # Should probably be reported back to Stanza
        # https://travis-ci.org/github/cltk/cltk/jobs/721808293#L636
        # lang = "cop"
        # cltk_nlp = NLP(language=lang)
        # cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        # first_word = cltk_doc.words[0]
        # target = Word(
        #     index_char_start=None,
        #     index_char_stop=None,
        #     index_token=0,
        #     index_sentence=0,
        #     string="ⲧⲏⲛ",
        #     pos="VERB",
        #     lemma="ⲧⲏⲛ",
        #     stem=None,
        #     scansion=None,
        #     xpos="VSTAT",
        #     upos="VERB",
        #     dependency_relation="root",
        #     governor=-1,
        #     features={"VerbForm": "Fin"},
        #     embedding=None,
        #     stop=None,
        #     named_entity=None,
        # )
        # self.assertEqual(first_word, target)

        lang = "fro"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="Une",
            pos="DET",
            lemma="Une",
            stem=None,
            scansion=None,
            xpos="DETndf",
            upos="DET",
            dependency_relation=None,
            governor=-1,
            features={"Definite": "Ind", "PronType": "Art"},
            embedding=None,
            stop=False,
            named_entity=False,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        lang = "got"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="swa",
            pos="ADV",
            lemma="swa",
            stem=None,
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=1,
            features={},
            embedding=[],
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)
        self.assertEqual(len(cltk_doc.sentences), 3)

        lang = "grc"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        self.assertIsInstance(first_word.embedding, numpy.ndarray)
        first_word.embedding = list()  # clear out the array, for easier checking
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="ὅτι",
            pos="ADV",
            lemma="ὅτι",
            stem=None,
            scansion=None,
            xpos="Df",
            upos="ADV",
            dependency_relation="advmod",
            governor=6,
            features={},
            embedding=[],
            stop=False,
            named_entity=False,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)

        lang = "lzh"
        cltk_nlp = NLP(language=lang)
        cltk_doc = cltk_nlp.analyze(text=get_example_text(lang))
        first_word = cltk_doc.words[0]
        target = Word(
            index_char_start=None,
            index_char_stop=None,
            index_token=0,
            index_sentence=0,
            string="黃",
            pos="NOUN",
            lemma="黃",
            stem=None,
            scansion=None,
            xpos="n,名詞,描写,形質",
            upos="NOUN",
            dependency_relation="nmod",
            governor=1,
            features={},
            embedding=None,
            stop=None,
            named_entity=None,
            syllables=None,
            phonetic_transcription=None,
        )
        self._word_assertions(first_word, target)