Example #1
 def test_latin_word_tokenizer(self):
     """Test Latin-specific word tokenizer."""
     word_tokenizer = WordTokenizer('latin')
     
     #Test sources:
     # - V. Aen. 1.1
     # - Prop. 2.5.1-2
     # - Ov. Am. 1.8.65-66
     # - Cic. Phillip. 13.14
     
     tests = ['Arma virumque cano, Troiae qui primus ab oris.',
                 'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
                 'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
                 'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.']
     
     results = []
     
     for test in tests:
         result = word_tokenizer.tokenize(test)
         results.append(result)
                 
     target = [['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
                 ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
                 ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
                 ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.']]
                 
     self.assertEqual(results, target)
Example #2
def stemmer_middle_high_german(text_l, rem_umlauts=True, exceptions=exc_dict):
    """text_l: text in string format
    rem_umlauts: choose whether to remove umlauts from the string
    exceptions: hard-coded dictionary for the cases where the algorithm fails"""

    # Normalize text
    text_l = normalize_middle_high_german(text_l, to_lower_all=False, to_lower_beginning=True)

    # Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    text_l = word_tokenizer.tokenize(text_l)
    text = []

    for word in text_l:
        try:
            # Use the hard-coded exception if the word is in the dictionary.
            text.append(exceptions[word])
        except KeyError:
            if word[0].isupper():
                # MHG only uses upper case for locations, people, etc., so any word that
                # starts with a capital letter while not being at the start of a sentence
                # is automatically excluded from stemming.
                text.append(word)
            elif word in MHG_STOPS:
                # Leave stop words unstemmed.
                text.append(word)
            else:
                text.append(stem_helper(word, rem_umlaut=rem_umlauts))
    return text
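A minimal usage sketch for the stemmer above, assuming it sits in a module where its helpers (normalize_middle_high_german, stem_helper, MHG_STOPS, exc_dict) are already defined; the sample line (the opening of Hartmann's Iwein) and the behaviour noted in the comment are illustrative only.

# Hypothetical call: stop words and capitalised proper nouns pass through unstemmed.
mhg_line = "Swer an rehte güete wendet sîn gemüete"
print(stemmer_middle_high_german(mhg_line, rem_umlauts=False))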
Example #3
    def test_latin_word_tokenizer_base(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('latin')

        #Test sources:
        # - V. Aen. 1.1
        # - Prop. 2.5.1-2
        # - Ov. Am. 1.8.65-66
        # - Cic. Phillip. 13.14
        # - Plaut. Capt. 937
        # - Lucr. DRN. 5.1351-53
        # - Plaut. Bacch. 837-38
        # - Plaut. Amph. 823
        # - Caes. Bel. 6.29.2

        tests = ['Arma virumque cano, Troiae qui primus ab oris.',
                    'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
                    'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
                    'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
                    'Quid opust verbis? lingua nullast qua negem quidquid roges.',
                    'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',  # pylint: disable=line-too-long
                    'Dic sodes mihi, bellan videtur specie mulier?',
                    'Cenavin ego heri in navi in portu Persico?',
                    'quae ripas Ubiorum contingebat in longitudinem pedum ducentorum rescindit']

        results = []

        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [['Arma', 'virumque', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris', '.'], ['Hoc', 'verumst', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'], ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'tecum', ',', 'pauper', 'amator', ',', 'avos', '!'], ['Neque', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'nec', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur', '.'], ['Quid', 'opust', 'verbis', '?', 'lingua', 'nullast', 'qua', 'negem', 'quidquid', 'roges', '.'], ['Textile', 'post', 'ferrumst', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'nec', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'scapique', 'sonantes', '.'], ['Dic', 'sodes', 'mihi', ',', 'bellan', 'videtur', 'specie', 'mulier', '?'], ['Cenavin', 'ego', 'heri', 'in', 'navi', 'in', 'portu', 'Persico', '?'], ['quae', 'ripas', 'Ubiorum', 'contingebat', 'in', 'longitudinem', 'pedum', 'ducentorum', 'rescindit']]

        self.assertEqual(results, target)
Example #4
 def test_middle_english_tokenizer(self):
     text = "    Fers am I ferd of oure fare;\n Fle we ful fast þer-fore. \n Can Y no cownsel bot care.\n\n"
     target = ['Fers', 'am', 'I', 'ferd', 'of', 'oure', 'fare', ';', 'Fle', 'we', 'ful', 'fast', 'þer', '-', 'fore', '.',
               'Can', 'Y', 'no', 'cownsel', 'bot', 'care', '.']
     tokenizer = WordTokenizer('middle_english')
     tokenized = tokenizer.tokenize(text)
     self.assertEqual(tokenized, target)
Example #5
    def test_tokenize_arabic_words(self):
        word_tokenizer = WordTokenizer('arabic')
        tests = ['اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.',
                 'انما الْمُؤْمِنُونَ اخوه فاصلحوا بَيْنَ اخويكم',
                 'الْعَجُزُ عَنِ الْإِدْرَاكِ إِدْرَاكٌ، وَالْبَحْثَ فِي ذاتِ اللَّه اشراك.',
                 'اللَّهُمُّ اُسْتُرْ عُيُوبَنَا وَأَحْسَنَ خَوَاتِيمَنَا الْكَاتِبِ: نَبِيلُ جلهوم',
                 'الرَّأْي قَبْلَ شَجَاعَة الشّجعَانِ',
                 'فَأَنْزَلْنَا مِنْ السَّمَاء مَاء فَأَسْقَيْنَاكُمُوهُ',
                 'سُئِلَ بَعْضُ الْكُتَّابِ عَنِ الْخَطّ، مَتَى يَسْتَحِقُّ أَنْ يُوصَفَ بِالْجَوْدَةِ ؟'
                ]

        results = []
        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [['اللُّغَةُ', 'الْعَرَبِيَّةُ', 'جَمِيلَةٌ', '.'],
                  ['انما', 'الْمُؤْمِنُونَ', 'اخوه', 'فاصلحوا', 'بَيْنَ', 'اخويكم'],
                  ['الْعَجُزُ', 'عَنِ', 'الْإِدْرَاكِ', 'إِدْرَاكٌ', '،', 'وَالْبَحْثَ', 'فِي', 'ذاتِ', 'اللَّه', 'اشراك', '.'],  # pylint: disable=line-too-long
                  ['اللَّهُمُّ', 'اُسْتُرْ', 'عُيُوبَنَا', 'وَأَحْسَنَ', 'خَوَاتِيمَنَا', 'الْكَاتِبِ', ':', 'نَبِيلُ', 'جلهوم'],  # pylint: disable=line-too-long
                  ['الرَّأْي', 'قَبْلَ', 'شَجَاعَة', 'الشّجعَانِ'],
                  ['فَأَنْزَلْنَا', 'مِنْ', 'السَّمَاء', 'مَاء', 'فَأَسْقَيْنَاكُمُوهُ'],
                  ['سُئِلَ', 'بَعْضُ', 'الْكُتَّابِ', 'عَنِ', 'الْخَطّ', '،', 'مَتَى', 'يَسْتَحِقُّ', 'أَنْ', 'يُوصَفَ', 'بِالْجَوْدَةِ', '؟']  # pylint: disable=line-too-long
                 ]
        self.assertEqual(results, target)
Example #6
 def test_latin_word_tokenizer(self):
     """Test Latin-specific word tokenizer."""
     word_tokenizer = WordTokenizer('latin')
     text = 'atque haec abuterque nihil'
     tokens = word_tokenizer.tokenize(text)
     target = ['atque', 'haec', 'abuter', '-que', 'nihil']
     self.assertEqual(tokens, target)
Example #7
 def test_old_norse_word_tokenizer(self):
     text = "Gylfi konungr var maðr vitr ok fjölkunnigr. " \
            "Hann undraðist þat mjök, er ásafólk var svá kunnigt, at allir hlutir gengu at vilja þeira."
     target = ['Gylfi', 'konungr', 'var', 'maðr', 'vitr', 'ok', 'fjölkunnigr', '.', 'Hann', 'undraðist', 'þat',
               'mjök', ',', 'er', 'ásafólk', 'var', 'svá', 'kunnigt', ',', 'at', 'allir', 'hlutir', 'gengu', 'at',
               'vilja', 'þeira', '.']
     word_tokenizer = WordTokenizer('old_norse')
     result = word_tokenizer.tokenize(text)
     self.assertEqual(result, target)
Example #8
 def test_french_lemmatizer(self):
     text = "Li rois pense que par folie, Sire Tristran, vos aie amé ; Mais Dé plevis ma loiauté, Qui sor mon cors mete flaele, S'onques fors cil qui m’ot pucele Out m'amistié encor nul jor !"
     text = str.lower(text)
     tokenizer = WordTokenizer('french')
     lemmatizer = LemmaReplacer()
     tokens = tokenizer.tokenize(text)
     lemmas = lemmatizer.lemmatize(tokens)
     target = [('li', 'li'), ('rois', 'rois'), ('pense', 'pense'), ('que', 'que'), ('par', 'par'), ('folie', 'folie'), (',', ['PUNK']), ('sire', 'sire'), ('tristran', 'None'), (',', ['PUNK']), ('vos', 'vos'), ('aie', ['avoir']), ('amé', 'amer'), (';', ['PUNK']), ('mais', 'mais'), ('dé', 'dé'), ('plevis', 'plevir'), ('ma', 'ma'), ('loiauté', 'loiauté'), (',', ['PUNK']), ('qui', 'qui'), ('sor', 'sor'), ('mon', 'mon'), ('cors', 'cors'), ('mete', 'mete'), ('flaele', 'flaele'), (',', ['PUNK']), ("s'", "s'"), ('onques', 'onques'), ('fors', 'fors'), ('cil', 'cil'), ('qui', 'qui'), ("m'", "m'"), ('ot', 'ot'), ('pucele', 'pucele'), ('out', ['avoir']), ("m'", "m'"), ('amistié', 'amistié'), ('encor', 'encor'), ('nul', 'nul'), ('jor', 'jor'), ('!', ['PUNK'])]
     self.assertEqual(lemmas, target)
Example #9
    def test_middle_high_german_stopwords(self):
        """Test filtering  Middle High German stopwords."""

        sentence = "Swer was ze Bêârosche komn, doch hete Gâwân dâ genomn den prîs ze bêder sît al ein wan daz dervor ein ritter schein, bî rôtem wâpen unrekant, des prîs man in die hœhe bant."
        lowered = sentence.lower()
        tokenizer = WordTokenizer('middle_high_german')
        tokens = tokenizer.tokenize(lowered)
        no_stops = [w for w in tokens if w not in MHG_STOPS]
        target_list = ['swer', 'bêârosche', 'komn', ',', 'gâwân', 'genomn', 'prîs', 'bêder', 'sît', 'dervor', 'ritter', 'schein', ',', 'rôtem', 'wâpen', 'unrekant', ',', 'prîs', 'hœhe', 'bant', '.']
        self.assertEqual(no_stops, target_list)
Example #10
def normalize_fr(string):
    string = string.lower()
    word_tokenizer = WordTokenizer('french')
    tokens = word_tokenizer.tokenize(string)
    normalized_text = []
    for token in tokens:
        for matches_rule, apply_rule in rules:
            if matches_rule(token):
                normalized = apply_rule(token)
                normalized_text.append(normalized)
    return normalized_text
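A sketch of how normalize_fr might be driven; the module-level rules list of (predicate, transform) pairs below is purely hypothetical. Note that, as written, the function emits one output per matching rule and silently drops any token that no rule matches.

# Hypothetical rule set: keep alphabetic tokens unchanged, so punctuation is dropped.
rules = [(lambda tok: tok.isalpha(), lambda tok: tok)]

print(normalize_fr("S'a table te veulz maintenir, Honnestement te dois tenir."))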
Example #11
    def test_middle_high_german_tokenize(self):
        """
        Test tokenizing Middle High German
        """
        word_tokenizer = WordTokenizer('middle_high_german')
        text = "Mīn ougen   wurden liebes alsō vol, \n\n\ndō ich die minneclīchen ērst gesach,\ndaȥ eȥ mir hiute und   iemer mē tuot wol."

        tokenized = word_tokenizer.tokenize(text)
        target = ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō', 'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', 'eȥ', 'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.']

        self.assertEqual(tokenized, target)
Example #12
 def test_akkadian_word_tokenizer(self):
     """
     Tests word_tokenizer.
     """
     tokenizer = WordTokenizer('akkadian')
     line = 'u2-wa-a-ru at-ta e2-kal2-la-ka _e2_-ka wu-e-er'
     output = tokenizer.tokenize(line)
     goal = [('u2-wa-a-ru', 'akkadian'), ('at-ta', 'akkadian'),
             ('e2-kal2-la-ka', 'akkadian'),
             ('_e2_-ka', 'sumerian'), ('wu-e-er', 'akkadian')]
     self.assertEqual(output, goal)
Example #13
 def test_bigram_pos_lemmatizer(self):
     train = [[('dixissem', 'dico', 'v')], [('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'), (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'), ('probant', 'probo', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'), ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'), (',', 'punc', 'u'), ('clamant', 'clamo', 'v'), (',', 'punc', 'u'), ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'), ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'), ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'), ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'), (',', 'punc', 'u'), ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'), ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'), ('illi', 'ille', 'p'), ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'), (',', 'punc', 'u'), ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'), ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'), ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'), ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'), ('senatum', 'senatus', 'n'), (',', 'punc', 'u'), ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'), ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'), ('et', 'et', 'c'), ('studia', 'studium', 'n'), ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'), ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'), ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')]]
     lemmatizer = BigramPOSLemmatizer(train=train, include=['cum'])
     test_str = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
     target = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None), ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None), ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #14
def stopwords_filter(string):

    text = string
    # strip tashkeel because the stop words list contains voweled words
    text = araby.strip_tashkeel(text)
    word_tokenizer = WordTokenizer("arabic")
    tokens = word_tokenizer.tokenize(text)

    # filter stop words
    no_stops = [w for w in tokens if w not in ARABIC_STOPS]

    return no_stops
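A usage sketch, assuming pyarabic's araby module and the ARABIC_STOPS list are importable exactly as the function above expects; the sample sentence is reused from the Arabic tokenizer test earlier in this collection.

sample = 'اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.'
print(stopwords_filter(sample))  # tokens with tashkeel stripped and stop words removed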
Example #15
 def test_identity_lemmatizer(self):
     """Test identity_lemmatizer()"""
     lemmatizer = IdentityLemmatizer()
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', 'ceterum'), ('antequam', 'antequam'), ('destinata', 'destinata'), ('componam', 'componam')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #16
 def test_latin_lemmata(self):
     """Test Lemmata class lookup() method"""
     lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     self.assertEqual(lemmas, target)
Example #17
 def test_roman_numeral_lemmatizer_with_default(self):
     """Test roman_numeral_lemmatizer()"""
     rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'), (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]
     lemmatizer = RomanNumeralLemmatizer(rn_patterns, default="RN")
     test_str = 'i ii'
     target = [('i', 'RN'), ('ii', 'RN')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #18
 def test_backoff_latin_lemmatizer_verbose(self):
     """Test backoffLatinLemmatizer"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #19
 def test_model_lemmatizer(self):
     """Test model_lemmatizer()"""
     model = {'ceterum': 'ceterus', 'antequam': 'antequam', 'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
     lemmatizer = TrainLemmatizer(model=model)
     test_str = 'Ceterum antequam destinata componam'
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #20
 def test_unigram_lemmatizer(self):
     """Test unigram_lemmatizer()"""
     train = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = UnigramLemmatizer(train=train)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #21
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     sub = [('(.)ab(o|is|it|imus|itis|unt)$', r'\1o')]
     lemmatizer = RegexpLemmatizer(sub)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #22
 def test_regex_lemmatizer(self):
     """Test regex_lemmatizer()"""
     pattern = [(r'(\w*)abimus', 'o')]
     lemmatizer = RegexpLemmatizer(pattern)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #23
    def test_greek_word_tokenizer(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('greek')
        
        # Test sources:
        # - Thuc. 1.1.1       
        
        test = "Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν Πελοποννησίων καὶ Ἀθηναίων, ὡς ἐπολέμησαν πρὸς ἀλλήλους, ἀρξάμενος εὐθὺς καθισταμένου καὶ ἐλπίσας μέγαν τε ἔσεσθαι καὶ ἀξιολογώτατον τῶν προγεγενημένων, τεκμαιρόμενος ὅτι ἀκμάζοντές τε ᾖσαν ἐς αὐτὸν ἀμφότεροι παρασκευῇ τῇ πάσῃ καὶ τὸ ἄλλο Ἑλληνικὸν ὁρῶν ξυνιστάμενον πρὸς ἑκατέρους, τὸ μὲν εὐθύς, τὸ δὲ καὶ διανοούμενον."

        target = ['Θουκυδίδης', 'Ἀθηναῖος', 'ξυνέγραψε', 'τὸν', 'πόλεμον', 'τῶν', 'Πελοποννησίων', 'καὶ', 'Ἀθηναίων', ',', 'ὡς', 'ἐπολέμησαν', 'πρὸς', 'ἀλλήλους', ',', 'ἀρξάμενος', 'εὐθὺς', 'καθισταμένου', 'καὶ', 'ἐλπίσας', 'μέγαν', 'τε', 'ἔσεσθαι', 'καὶ', 'ἀξιολογώτατον', 'τῶν', 'προγεγενημένων', ',', 'τεκμαιρόμενος', 'ὅτι', 'ἀκμάζοντές', 'τε', 'ᾖσαν', 'ἐς', 'αὐτὸν', 'ἀμφότεροι', 'παρασκευῇ', 'τῇ', 'πάσῃ', 'καὶ', 'τὸ', 'ἄλλο', 'Ἑλληνικὸν', 'ὁρῶν', 'ξυνιστάμενον', 'πρὸς', 'ἑκατέρους', ',', 'τὸ', 'μὲν', 'εὐθύς', ',', 'τὸ', 'δὲ', 'καὶ', 'διανοούμενον', '.']

        result = word_tokenizer.tokenize(test)

        self.assertEqual(result, target)
Example #25
 def test_old_norse_word_tokenizer(self):
     """Word tokenization"""
     text = "Gylfi konungr var maðr vitr ok fjölkunnigr. " \
            "Hann undraðist þat mjök, er ásafólk var svá kunnigt, at allir hlutir gengu at vilja þeira."
     target = [
         'Gylfi', 'konungr', 'var', 'maðr', 'vitr', 'ok', 'fjölkunnigr',
         '.', 'Hann', 'undraðist', 'þat', 'mjök', ',', 'er', 'ásafólk',
         'var', 'svá', 'kunnigt', ',', 'at', 'allir', 'hlutir', 'gengu',
         'at', 'vilja', 'þeira', '.'
     ]
     word_tokenizer = WordTokenizer('old_norse')
     result = word_tokenizer.tokenize(text)
     # print(result)
     self.assertEqual(result, target)
Example #26
 def test_latin_pp_lemmatizer(self):
     """Test latin_pp_lemmatizer()"""
     pattern = [(r'(\w*)[a|ie]bimus\b', 1)]
     pps = { 'amo': [1, 'am', 'amare', 'amau', 'amat'] }
     lemmatizer = PPLemmatizer(pattern, pps=pps)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #27
    def test_word_tokenizer_french(self):
        word_tokenizer = WordTokenizer('french')

        tests = ["S'a table te veulz maintenir, Honnestement te dois tenir Et garder les enseignemens Dont cilz vers sont commancemens."]  # pylint: disable=line-too-long

        results = []

        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [["S'", 'a', 'table', 'te', 'veulz', 'maintenir', ',', 'Honnestement', 'te', 'dois', 'tenir', 'Et', 'garder', 'les', 'enseignemens', 'Dont', 'cilz', 'vers', 'sont', 'commancemens', '.']]  # pylint: disable=line-too-long

        self.assertEqual(results, target)
Example #28
 def test_latin_pp_lemmatizer(self):
     """Test latin_pp_lemmatizer()"""
     pattern = [(r'(\w*)[a|ie]bimus\b', 1)]
     pps = {'amo': [1, 'am', 'amare', 'amau', 'amat']}
     lemmatizer = PPLemmatizer(pattern, pps=pps)
     test_str = 'amabimus'
     target = [('amabimus', 'amo')]
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #30
    def test_middle_high_german_tokenize(self):
        """
        Test tokenizing Middle High German
        """
        word_tokenizer = WordTokenizer('middle_high_german')
        text = "Mīn ougen   wurden liebes alsō vol, \n\n\ndō ich die minneclīchen ērst gesach,\ndaȥ eȥ mir hiute und   iemer mē tuot wol."

        tokenized = word_tokenizer.tokenize(text)
        target = [
            'Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō',
            'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', 'eȥ',
            'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.'
        ]

        self.assertEqual(tokenized, target)
Example #31
def createCorpus(text, save=True):
    '''
    :param text: the raw text

    returns: + the corpus, a list of lists of tokenized sentences (stop words removed)
             + the vocab, a dictionary mapping each token to a subsampling weight derived from its frequency
    '''
    with open('../../data/stopwords.txt', 'r', encoding="UTF-8") as src:
        stopwords = src.read()

    stopwords = stopwords.split('\n')
    stopwords.extend([".", ",", "?", "!", "-", ":", ";", "·"])

    Stokenizer = TokenizeSentence('greek')
    Wtokenizer = WordTokenizer('greek')
    sentences = Stokenizer.tokenize(text)
    new_sentences = []
    vocab = dict()
    print('Building corpus and freqDictionary')
    for sent in tqdm(sentences, desc="Sentences"):
        new_sent = Wtokenizer.tokenize(sent)
        # Stopword deletion
        new_sent = [w for w in new_sent if w not in stopwords]
        new_sentences.append(new_sent)
        for w in new_sent:
            if w not in vocab:
                vocab[w] = 1
            else:
                vocab[w] += 1

    vocab_size = len(vocab)
    for k, v in vocab.items():
        # Subsampling, see paper by Goldberg & Levy
        frac = v / vocab_size
        p_w = (1 + np.sqrt(frac * 0.001)) * 0.001 / frac
        # update the value for the word
        vocab[k] = p_w
    if save:
        print('Saving the frequencies')
        with open('../../data/vocabularies/Homer_word_frequencies.json',
                  'w',
                  encoding='utf-8') as fp:
            json.dump(vocab, fp, ensure_ascii=False)
        print('Saving the corpus')
        arr = np.array(new_sentences, dtype=object)
        np.save('../../data/Homer_tokenized_corpus.npy', arr)
    return new_sentences, vocab
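A sketch of calling createCorpus on a short Greek passage with save=False, so nothing is written to the hard-coded output paths; it still assumes the relative ../../data/stopwords.txt file exists. The passage is a shortened form of the Thucydides sentence used in the Greek tokenizer test above.

thuc = ("Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον "
        "τῶν Πελοποννησίων καὶ Ἀθηναίων, ὡς ἐπολέμησαν πρὸς ἀλλήλους.")
corpus, vocab = createCorpus(thuc, save=False)
print(len(corpus), "sentences,", len(vocab), "distinct tokens")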
Example #32
    def test_syllabification_old_norse(self):
        """Syllabification"""
        s = Syllabifier(language="old_norse", break_geminants=True)
        text = "Gefjun dró frá Gylfa glöð djúpröðul óðla, svá at af rennirauknum rauk, Danmarkar auka. Báru öxn ok " \
               "átta ennitungl, þars gengu fyrir vineyjar víðri valrauf, fjögur höfuð."
        tokenizer = WordTokenizer('old_norse')
        words = tokenizer.tokenize(text)
        s.set_invalid_onsets(invalid_onsets)
        syllabified_words = [s.syllabify_ssp(word.lower())
                             for word in words if word not in ",."]

        target = [['gef', 'jun'], ['dró'], ['frá'], ['gyl', 'fa'], ['glöð'], ['djúp', 'rö', 'ðul'], ['óðl', 'a'],
                  ['svá'], ['at'], ['af'], ['ren', 'ni', 'rauk', 'num'], ['rauk'], ['dan', 'mar', 'kar'], ['auk', 'a'],
                  ['bár', 'u'], ['öxn'], ['ok'], ['át', 'ta'], ['en', 'ni', 'tungl'], ['þars'], ['geng', 'u'],
                  ['fy', 'rir'], ['vi', 'ney', 'jar'], ['víðr', 'i'], ['val', 'rauf'], ['fjö', 'gur'], ['hö', 'fuð']]
        self.assertListEqual(syllabified_words, target)
Example #33
def tokenize(text, language="latin"):
    jv_replacer = JVReplacer()
    text = jv_replacer.replace(text.lower())

    t = WordTokenizer(language)
    l = LemmaReplacer(language)

    text_word_tokens = t.tokenize(text)

    # Keep only tokens longer than three characters
    ## text_word_tokens = [token for token in text_word_tokens if token not in ['.', ',', ':', ';','*']]
    text_word_tokens = [token for token in text_word_tokens if len(token) > 3]

    text_word_tokens = l.lemmatize(text_word_tokens)

    return text_word_tokens
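A minimal sketch of the helper above on a Latin phrase taken from the lemmatizer tests; it assumes the pre-1.0 CLTK latin_models_cltk corpus is installed, since LemmaReplacer('latin') needs it.

print(tokenize("Ceterum antequam destinata componam"))
# lemmata of the tokens longer than three characters, with j/v normalised to i/u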
Example #34
def french_tokenizer(docs, MAX_NB_WORDS, max_seq_len):
    # tokenizing input data
    word_tokenizer = WordTokenizer('french')
    tokens = []
    for doc in docs:
        tokens.append(word_tokenizer.tokenize(doc))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
    tokenizer.fit_on_texts(tokens)
    word_seq = tokenizer.texts_to_sequences(tokens)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)

    return word_seq, word_index
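A sketch of driving french_tokenizer, assuming the Keras Tokenizer and sequence utilities are imported as the function above expects; the two documents reuse the Old French tokenizer test sentence, and the vocabulary size and sequence length are arbitrary.

docs = ["S'a table te veulz maintenir, Honnestement te dois tenir",
        "Et garder les enseignemens Dont cilz vers sont commancemens."]
word_seq, word_index = french_tokenizer(docs, MAX_NB_WORDS=1000, max_seq_len=20)
print(word_seq.shape)  # (2, 20) after padding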
Example #36
 def test_latin_translations(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     translations = Synonyms(dictionary = 'translations', language = 'latin')
     translations = translations.lookup_synonyms(lemmas)
     target = [('destino', [('σκοπός', 1.0)]), ('compono', [('συντίθημι', 1.0)])]
     self.assertEqual(translations, target)
Example #37
0
    def test_middle_high_german_stopwords(self):
        """
        Test filtering Middle High German stopwords.
        """

        sentence = "Swer was ze Bêârosche komn, doch hete Gâwân dâ genomn den prîs ze bêder sît al ein wan daz dervor ein ritter schein, bî rôtem wâpen unrekant, des prîs man in die hœhe bant."
        lowered = sentence.lower()
        tokenizer = WordTokenizer('middle_high_german')
        tokens = tokenizer.tokenize(lowered)
        no_stops = [w for w in tokens if w not in MIDDLE_HIGH_GERMAN_STOPS]
        target_list = [
            'swer', 'bêârosche', 'komn', ',', 'gâwân', 'genomn', 'prîs',
            'bêder', 'sît', 'dervor', 'ritter', 'schein', ',', 'rôtem',
            'wâpen', 'unrekant', ',', 'prîs', 'hœhe', 'bant', '.'
        ]

        self.assertEqual(no_stops, target_list)
Example #38
 def test_latin_synonyms(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary = 'lemmata', language = 'latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     synonyms = Synonyms(dictionary = 'synonyms', language = 'latin')
     syns = synonyms.lookup_synonyms(lemmas)
     target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]), ('compono', [('struo', 0.5), ('condo', 0.5)])]
     self.assertEqual(syns, target)
Example #39
 def test_bigram_pos_lemmatizer(self):
     train = [[('dixissem', 'dico', 'v')],
              [('de', 'de', 'r'), ('te', 'tu', 'p'),
               ('autem', 'autem', 'c'), (',', 'punc', 'u'),
               ('catilina', 'catilina', 'n'), (',', 'punc', 'u'),
               ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'),
               (',', 'punc', 'u'), ('probant', 'probo', 'v'),
               (',', 'punc', 'u'), ('cum', 'cum2', 'c'),
               ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'),
               ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'),
               ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'),
               (',', 'punc', 'u'), ('clamant', 'clamo', 'v'),
               (',', 'punc', 'u'), ('neque', 'neque', 'c'),
               ('hi', 'hic', 'p'), ('solum', 'solus', 'd'),
               ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'),
               ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'),
               ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'),
               (',', 'punc', 'u'), ('uita', 'uita', 'n'),
               ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'),
               ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'),
               ('illi', 'ille', 'p'), ('equites', 'eques', 'n'),
               ('romani', 'romanus', 'a'), (',', 'punc', 'u'),
               ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'),
               ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'),
               (',', 'punc', 'u'), ('ceteri', 'ceterus', 'a'),
               ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'),
               ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'),
               ('circumstant', 'circumsto', 'v'),
               ('senatum', 'senatus', 'n'), (',', 'punc', 'u'),
               ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'),
               ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'),
               ('et', 'et', 'c'), ('studia', 'studium', 'n'),
               ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'),
               ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'),
               ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'),
               ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')]]
     lemmatizer = BigramPOSLemmatizer(train=train, include=['cum'])
     test_str = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
     target = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None), ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None), ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #40
 def test_identity_lemmatizer(self):
     """Test identity_lemmatizer()"""
     lemmatizer = IdentityLemmatizer()
     test_str = "Ceterum antequam destinata componam"
     target = [
         ("ceterum", "ceterum"),
         ("antequam", "antequam"),
         ("destinata", "destinata"),
         ("componam", "componam"),
     ]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer("latin")
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #41
    def test_syllabification_old_norse(self):
        old_norse_syllabifier = Syllabifier(language="old_norse", break_geminants=True)
        text = (
            "Gefjun dró frá Gylfa glöð djúpröðul óðla, svá at af rennirauknum rauk, Danmarkar auka. Báru öxn ok "
            "átta ennitungl, þars gengu fyrir vineyjar víðri valrauf, fjögur höfuð."
        )
        tokenizer = WordTokenizer("old_norse")
        words = tokenizer.tokenize(text)
        old_norse_syllabifier.set_invalid_onsets(invalid_onsets)

        syllabified_words = [
            old_norse_syllabifier.syllabify_ssp(word.lower())
            for word in words
            if word not in ",."
        ]

        target = [
            ["gef", "jun"],
            ["dró"],
            ["frá"],
            ["gyl", "fa"],
            ["glöð"],
            ["djúp", "rö", "ðul"],
            ["óðl", "a"],
            ["svá"],
            ["at"],
            ["af"],
            ["ren", "ni", "rauk", "num"],
            ["rauk"],
            ["dan", "mar", "kar"],
            ["auk", "a"],
            ["bár", "u"],
            ["öxn"],
            ["ok"],
            ["át", "ta"],
            ["en", "ni", "tungl"],
            ["þars"],
            ["geng", "u"],
            ["fy", "rir"],
            ["vi", "ney", "jar"],
            ["víðr", "i"],
            ["val", "rauf"],
            ["fjö", "gur"],
            ["hö", "fuð"],
        ]
        self.assertListEqual(syllabified_words, target)
Example #42
def stem(text):
    """Stem each word of the French text."""
    # make string lower-case
    text = text.lower()

    stemmed_text = ""

    word_tokenizer = WordTokenizer("french")
    tokenized_text = word_tokenizer.tokenize(text)
    for word in tokenized_text:
        # remove the simple endings from the target word
        word, was_stemmed = matchremove_noun_endings(word)
        # if the word didn't match the simple endings, try verb endings
        if not was_stemmed:
            word = matchremove_verb_endings(word)
        # add the stemmed word to the text
        stemmed_text += word + " "
    return stemmed_text
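A usage sketch, assuming matchremove_noun_endings and matchremove_verb_endings are defined alongside stem(); the line reuses the Old French sentence from the tokenizer tests.

line = "S'a table te veulz maintenir, Honnestement te dois tenir"
print(stem(line))  # a single space-joined string of stemmed tokens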
Example #43
def preprocess(doc):
    assert isinstance(doc, str)
    word_tokenizer = WordTokenizer('latin')
    doc_word_tokens = word_tokenizer.tokenize(doc)
    doc_word_tokens_no_punt = [
        token.lower() for token in doc_word_tokens
        if token not in ['.', ',', ':', ';']
    ]

    # lemmatization
    corpus_importer = CorpusImporter('latin')
    corpus_importer.import_corpus('latin_models_cltk')
    jv_replacer = JVReplacer()

    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(" ".join(doc_word_tokens_no_punt))
    cleaned = remove_latin_library_items(" ".join(lemmata))
    return cleaned
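A sketch of calling preprocess on a Latin sentence from the tests above; note that it assumes network access (it re-imports latin_models_cltk on every call) and a remove_latin_library_items helper in scope.

cleaned = preprocess("Arma virumque cano, Troiae qui primus ab oris.")
print(cleaned)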
Example #44
 def test_roman_numeral_lemmatizer_with_default(self):
     """Test roman_numeral_lemmatizer()"""
     rn_patterns = [
         (r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)',
          'NUM'),
         (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)',
          'NUM')
     ]
     lemmatizer = RomanNumeralLemmatizer(rn_patterns, default="RN")
     test_str = 'i ii'
     target = [('i', 'RN'), ('ii', 'RN')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #45
 def test_latin_translations(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     translations = Synonyms(dictionary='translations', language='latin')
     translations = translations.lookup_synonyms(lemmas)
     target = [('destino', [('σκοπός', 1.0)]),
               ('compono', [('συντίθημι', 1.0)])]
     self.assertEqual(translations, target)
Example #46
 def test_latin_synonyms(self):
     """Test Synonym class lookup() function and Lemmata class isolate() method"""
     #first build the lemma list as in test_latin_lemmata()
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     test_str = 'Ceterum antequam destinata componam'
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lookup(tokens)
     #now isolate the list of lemmas
     lemmas = lemmatizer.isolate(lemmas)
     synonyms = Synonyms(dictionary='synonyms', language='latin')
     syns = synonyms.lookup_synonyms(lemmas)
     target = [('ceterus', [('ceteroqui', 0.5), ('perquiesco', 0.5)]),
               ('compono', [('struo', 0.5), ('condo', 0.5)])]
     self.assertEqual(syns, target)
Example #47
def stem(text):
    """Stem each word of the French text."""
    # make string lower-case
    text = text.lower()

    stemmed_text = ''

    word_tokenizer = WordTokenizer('french')
    tokenized_text = word_tokenizer.tokenize(text)
    for word in tokenized_text:
        # remove the simple endings from the target word
        word, was_stemmed = matchremove_noun_endings(word)
        # if the word didn't match the simple endings, try verb endings
        if not was_stemmed:
            word = matchremove_verb_endings(word)
        # add the stemmed word to the text
        stemmed_text += word + ' '
    return stemmed_text
Example #48
    def test_latin_word_tokenizer(self):
        """Test Latin-specific word tokenizer."""
        word_tokenizer = WordTokenizer('latin')

        #Test sources:
        # - V. Aen. 1.1
        # - Prop. 2.5.1-2
        # - Ov. Am. 1.8.65-66
        # - Cic. Phillip. 13.14
        # - Plaut. Capt. 937
        # - Lucr. DRN. 5.1351-53
        # - Plaut. Bacch. 837-38
        # - Plaut. Amph. 823
        # - Caes. Bel. 6.29.2

        tests = ['Arma virumque cano, Troiae qui primus ab oris.',
                    'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
                    'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
                    'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
                    'Quid opust verbis? lingua nullast qua negem quidquid roges.',
                    'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',  # pylint: disable=line-too-long
                    'Dic sodes mihi, bellan videtur specie mulier?',
                    'Cenavin ego heri in navi in portu Persico?',
                    'quae ripas Ubiorum contingebat in longitudinem pedum ducentorum rescindit']

        results = []

        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [['Arma', 'virum', '-que', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris', '.'],
                  ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],  # pylint: disable=line-too-long
                  ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae', '.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],  # pylint: disable=line-too-long
                  ['Neque', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'nec', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur', '.'],  # pylint: disable=line-too-long
                  ['Quid', 'opus', 'est', 'verbis', '?', 'lingua', 'nulla', 'est', 'qua', 'negem', 'quidquid', 'roges', '.'],  # pylint: disable=line-too-long
                  ['Textile', 'post', 'ferrum', 'est', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'nec', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'scapi', '-que', 'sonantes', '.'],  # pylint: disable=line-too-long
                  ['Dic', 'si', 'audes', 'mihi', ',', 'bella', '-ne', 'videtur', 'specie', 'mulier', '?'],
                  ['Cenavi', '-ne', 'ego', 'heri', 'in', 'navi', 'in', 'portu', 'Persico', '?'],
                  ['quae', "ripas", "Ubiorum", "contingebat", "in", "longitudinem", "pedum", "ducentorum", "rescindit"]
                  ]

        self.assertEqual(results, target)
Example #49
    def test_tokenize_arabic_words_base(self):
        word_tokenizer = WordTokenizer('arabic')
        tests = [
            'اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.',
            'انما الْمُؤْمِنُونَ اخوه فاصلحوا بَيْنَ اخويكم',
            'الْعَجُزُ عَنِ الْإِدْرَاكِ إِدْرَاكٌ، وَالْبَحْثَ فِي ذاتِ اللَّه اشراك.',
            'اللَّهُمُّ اُسْتُرْ عُيُوبَنَا وَأَحْسَنَ خَوَاتِيمَنَا الْكَاتِبِ: نَبِيلُ جلهوم',
            'الرَّأْي قَبْلَ شَجَاعَة الشّجعَانِ',
            'فَأَنْزَلْنَا مِنْ السَّمَاء مَاء فَأَسْقَيْنَاكُمُوهُ',
            'سُئِلَ بَعْضُ الْكُتَّابِ عَنِ الْخَطّ، مَتَى يَسْتَحِقُّ أَنْ يُوصَفَ بِالْجَوْدَةِ ؟'
        ]

        results = []
        for test in tests:
            result = word_tokenizer.tokenize(test)
            results.append(result)

        target = [
            ['اللُّغَةُ', 'الْعَرَبِيَّةُ', 'جَمِيلَةٌ', '.'],
            ['انما', 'الْمُؤْمِنُونَ', 'اخوه', 'فاصلحوا', 'بَيْنَ', 'اخويكم'],
            [
                'الْعَجُزُ', 'عَنِ', 'الْإِدْرَاكِ', 'إِدْرَاكٌ', '،',
                'وَالْبَحْثَ', 'فِي', 'ذاتِ', 'اللَّه', 'اشراك', '.'
            ],  # pylint: disable=line-too-long
            [
                'اللَّهُمُّ', 'اُسْتُرْ', 'عُيُوبَنَا', 'وَأَحْسَنَ',
                'خَوَاتِيمَنَا', 'الْكَاتِبِ', ':', 'نَبِيلُ', 'جلهوم'
            ],  # pylint: disable=line-too-long
            ['الرَّأْي', 'قَبْلَ', 'شَجَاعَة', 'الشّجعَانِ'],
            [
                'فَأَنْزَلْنَا', 'مِنْ', 'السَّمَاء', 'مَاء',
                'فَأَسْقَيْنَاكُمُوهُ'
            ],
            [
                'سُئِلَ', 'بَعْضُ', 'الْكُتَّابِ', 'عَنِ', 'الْخَطّ', '،',
                'مَتَى', 'يَسْتَحِقُّ', 'أَنْ', 'يُوصَفَ', 'بِالْجَوْدَةِ', '؟'
            ]
            # pylint: disable=line-too-long
        ]
        self.assertEqual(results, target)
Example #50
File: ner.py Project: cltk/cltk
    def tag_ner_fr(self, input_text, output_type=list):

        entities = self.entities

        for entity in entities:
            (name, kind) = entity

        word_tokenizer = WordTokenizer('french')
        tokenized_text = word_tokenizer.tokenize(input_text)
        ner_tuple_list = []

        match = False
        for word in tokenized_text:
            for name, kind in entities:
                if word == name:
                    named_things = ([(name, 'entity', kind)])
                    ner_tuple_list.append(named_things)
                    match = True
                    break
            else:
                ner_tuple_list.append((word,))
        return ner_tuple_list
Example #51
 def test_french_lemmatizer(self):
     text = "Li rois pense que par folie, Sire Tristran, vos aie amé ; Mais Dé plevis ma loiauté, Qui sor mon cors mete flaele, S'onques fors cil qui m’ot pucele Out m'amistié encor nul jor !"
     text = str.lower(text)
     tokenizer = WordTokenizer('french')
     lemmatizer = LemmaReplacer()
     tokens = tokenizer.tokenize(text)
     lemmas = lemmatizer.lemmatize(tokens)
     target = [('li', 'li'), ('rois', 'rois'), ('pense', 'pense'),
               ('que', 'que'), ('par', 'par'), ('folie', 'folie'),
               (',', ['PUNK']), ('sire', 'sire'), ('tristran', 'None'),
               (',', ['PUNK']), ('vos', 'vos'), ('aie', ['avoir']),
               ('amé', 'amer'), (';', ['PUNK']), ('mais', 'mais'),
               ('dé', 'dé'), ('plevis', 'plevir'), ('ma', 'ma'),
               ('loiauté', 'loiauté'), (',', ['PUNK']), ('qui', 'qui'),
               ('sor', 'sor'), ('mon', 'mon'), ('cors', 'cors'),
               ('mete', 'mete'), ('flaele', 'flaele'), (',', ['PUNK']),
               ("s'", "s'"), ('onques', 'onques'), ('fors', 'fors'),
               ('cil', 'cil'), ('qui', 'qui'), ("m'", "m'"), ('ot', 'ot'),
               ('pucele', 'pucele'), ('out', ['avoir']), ("m'", "m'"),
               ('amistié', 'amistié'), ('encor', 'encor'), ('nul', 'nul'),
               ('jor', 'jor'), ('!', ['PUNK'])]
     self.assertEqual(lemmas, target)
Example #52
def build_corpus(raw_text_path: str, processed_text_path: str) -> None:
    if not os.path.exists(raw_text_path):
        print("Downloading corpus...")
        zip_file_path: str = raw_text_path + ".zip"
        response: requests.Response = requests.get(
            "https://box.hu-berlin.de/f/056b874a12cf44de82ab/?dl=1",
            stream=True)
        total_length: int = int(response.headers.get("content-length"))
        done_count: int = 0
        chunk_size: int = 1024
        with open(zip_file_path, "wb+") as f:
            for data in tqdm(
                    response.iter_content(chunk_size=chunk_size),
                    total=math.ceil(total_length / chunk_size),
                    unit="MB",
                    unit_scale=0.001):
                done_count += len(data)
                f.write(data)
        print("Extracting corpus...")
        zip_file: ZipFile = ZipFile(zip_file_path)
        file_path_parts: Tuple[str, str] = os.path.split(raw_text_path)
        zip_file.extract(file_path_parts[1], file_path_parts[0])
        zip_file.close()
    print("Segmenting and tokenizing corpus...")
    raw_text: str
    with open(raw_text_path) as f:
        raw_text = f.read()
    language: str = "latin"
    raw_sentences: List[str] = nltk.sent_tokenize(raw_text, language=language)
    del raw_text
    word_tokenizer = WordTokenizer(language)
    with open(processed_text_path, "a+") as f:
        raw_text_tokenized = []
        for sent in tqdm(raw_sentences):
            raw_text_tokenized.append(word_tokenizer.tokenize(sent))
            if len(raw_text_tokenized) == 1000:
                for sentence in raw_text_tokenized:
                    f.write("\t".join(sentence) + "\n")
                raw_text_tokenized = []
        # flush the final partial batch so trailing sentences are not lost
        for sentence in raw_text_tokenized:
            f.write("\t".join(sentence) + "\n")
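A sketch of driving build_corpus; both file names are placeholders, and on the first run the raw corpus is downloaded and unzipped next to raw_text_path before segmentation and tokenization start.

# Hypothetical paths; the processed file receives one tab-separated sentence per line.
build_corpus("latin_raw.txt", "latin_raw_tokenized.txt")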
Example #53
    def tag_ner_fr(self, input_text, output_type=list):

        entities = self.entities

        for entity in entities:
            (name, kind) = entity

        word_tokenizer = WordTokenizer("french")
        tokenized_text = word_tokenizer.tokenize(input_text)
        ner_tuple_list = []

        match = False
        for word in tokenized_text:
            for name, kind in entities:
                if word == name:
                    named_things = [(name, "entity", kind)]
                    ner_tuple_list.append(named_things)
                    match = True
                    break
            else:
                ner_tuple_list.append((word, ))
        return ner_tuple_list
Example #54
def convert_to_toks(sents):

    sent_tokenizer = SentenceTokenizer()
    word_tokenizer = WordTokenizer('latin')

    all_sents = []

    for data in sents:
        text = data.lower()

        sents = sent_tokenizer.tokenize(text)
        for sent in sents:
            tokens = word_tokenizer.tokenize(sent)
            filt_toks = []
            filt_toks.append("[CLS]")
            for tok in tokens:
                if tok != "":
                    filt_toks.append(tok)
            filt_toks.append("[SEP]")

            all_sents.append(filt_toks)

    return all_sents
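A sketch feeding convert_to_toks a small list of raw Latin strings; the Latin SentenceTokenizer and WordTokenizer imports are assumed to be the same ones the function above relies on.

docs = ["Arma virumque cano, Troiae qui primus ab oris.",
        "Ceterum antequam destinata componam."]
for sent in convert_to_toks(docs):
    print(sent)  # e.g. ['[CLS]', 'arma', ..., '[SEP]'] for each sentence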
Example #55
def runTest(text):
   '''Test cltk tools for latin'''
   print('Test phrase:')
   print(' -> ' + text)
   print()

#   print('[1/3] Testing JVReplacer')
#   jv = JVReplacer()
#   text = jv.replace(text)
#   print(' -> ' + text)
#   print()

   print('[2/3] Testing WordTokenizer')
   tokenizer = WordTokenizer('latin')
   tok = tokenizer.tokenize(text)
   print(' -> ' + ', '.join(["'{}'".format(t) for t in tok]))
   print()

   print('[3/3] Testing LemmaReplacer')
   lemmatizer = LemmaReplacer('latin')
   lem = lemmatizer.lemmatize(tok)
   print(' -> ' + ', '.join(["'{}'".format(l) for l in lem]))
   print()
Example #56
def tokenize(request):
    language = request['Content-Language']
    src_data = request['Payload']
    print(language)

    word_tokenizer = WordTokenizer(language)
    data = word_tokenizer.tokenize(src_data)
    clean_data = list(map(cltk_normalize, [w for w in data if w.isalpha()]))
    # and not w in STOPS_LIST]

    # lemma = LemmaReplacer(language).lemmatize(clean_data)
    lemma = None
    if language == 'greek':
        lemma = BackoffGreekLemmatizer().lemmatize(clean_data)
    elif language == 'latin':
        lemma = BackoffLatinLemmatizer().lemmatize(clean_data)

    result = []
    for i, elem in enumerate(lemma):
        w, l = elem
        result.append({'index': i + 1, 'word': w, 'lemma': l})

    return result
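A sketch of the request dictionary this handler expects; the keys mirror the lookups in the function above, and the Latin backoff lemmatizer is assumed to have its CLTK models installed.

request = {
    'Content-Language': 'latin',
    'Payload': 'Ceterum antequam destinata componam',
}
for row in tokenize(request):
    print(row['index'], row['word'], row['lemma'])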
Example #57
 def test_dict_lemmatizer(self):
     """Test model_lemmatizer()"""
     lemmas = {
         "ceterum": "ceterus",
         "antequam": "antequam",
         "destinata": "destino",
         "componam": "compono",
     }  # pylint: disable=line-too-long
     lemmatizer = DictLemmatizer(lemmas=lemmas)
     test_str = "Ceterum antequam destinata componam"
     target = [
         ("ceterum", "ceterus"),
         ("antequam", "antequam"),
         ("destinata", "destino"),
         ("componam", "compono"),
     ]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer("latin")
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
Example #58
def read_file(filename):
    sent_tokenizer = SentenceTokenizer()
    word_tokenizer = WordTokenizer('latin')

    all_sents = []
    with open(filename, encoding="utf-8") as file:
        data = file.read()

        # BERT model is lowercase
        text = data.lower()

        sents = sent_tokenizer.tokenize(text)
        for sent in sents:
            tokens = word_tokenizer.tokenize(sent)
            filt_toks = []
            for tok in tokens:
                if tok != "":
                    filt_toks.append(tok)
            filt_toks.insert(0, "[CLS]")
            filt_toks.append("[SEP]")

            all_sents.append(filt_toks)

    return all_sents
Example #59
def word_tokenize(text):
    print("Word Tokenizer triggered")
    word_tokenizer = WordTokenizer('sanskrit')
    # print("word tokenize: ", word_tokenizer.tokenize(self.sentence))
    return word_tokenizer.tokenize(text)
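A one-line sketch with a Sanskrit sample (the opening words of the Bhagavad Gita); any Devanagari string works, since the helper simply delegates to WordTokenizer('sanskrit').

print(word_tokenize("धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः"))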
Example #60
from cltk.stop.latin import STOPS_LIST

# See http://docs.cltk.org/en/latest/latin.html#sentence-tokenization

cato_agri_praef = "Est interdum praestare mercaturis rem quaerere, nisi tam periculosum sit, et item foenerari, si tam honestum. Maiores nostri sic habuerunt et ita in legibus posiverunt: furem dupli condemnari, foeneratorem quadrupli. Quanto peiorem civem existimarint foeneratorem quam furem, hinc licet existimare. Et virum bonum quom laudabant, ita laudabant: bonum agricolam bonumque colonum; amplissime laudari existimabatur qui ita laudabatur. Mercatorem autem strenuum studiosumque rei quaerendae existimo, verum, ut supra dixi, periculosum et calamitosum. At ex agricolis et viri fortissimi et milites strenuissimi gignuntur, maximeque pius quaestus stabilissimusque consequitur minimeque invidiosus, minimeque male cogitantes sunt qui in eo studio occupati sunt. Nunc, ut ad rem redeam, quod promisi institutum principium hoc erit."
cato_agri_praef_lowered = cato_agri_praef.lower()
# create a tokenizer instance of the TokenizeSentence Class
latin_sentence_tokenizer = TokenizeSentence('latin')

#tokenize the text into sentence tokens
cato_sentence_tokens = latin_sentence_tokenizer.tokenize_sentences(
    cato_agri_praef)

# tokenize the text (or specific sentences) into specific words
latin_word_tokenizer = WordTokenizer('latin')
cato_word_tokens = latin_word_tokenizer.tokenize(cato_agri_praef_lowered)
cato_word_tokens_WO_punt = [
    token for token in cato_word_tokens if token not in ['.', ',', ':', ';']
]

#print the tokens and the number of tokens
num_of_sentences = len(cato_sentence_tokens)
num_of_words = len(cato_word_tokens_WO_punt)
#print("There are " + str(num_of_sentences) + " sentences in the text")
#print("There are " + str(num_of_words) + " words in the text")
# for sentence in cato_sentence_tokens:
#     print(sentence)
#     print()

#print(cato_word_tokens_WO_punt)