def test_is_dependent_vowel(self):
    """is_dependent_vowel must be false for the feature vector of 'न'."""
    hindi = IndianSyllabifier('hindi')
    features = hindi.get_phonetic_feature_vector('न', 'hi')
    verdict = hindi.is_dependent_vowel(features)
    self.assertFalse(verdict)
def test_is_anusvaar(self):
    """is_anusvaar must be false for the feature vector of 'न'."""
    hindi = IndianSyllabifier('hindi')
    features = hindi.get_phonetic_feature_vector('न', 'hi')
    verdict = hindi.is_anusvaar(features)
    self.assertFalse(verdict)
def test_is_valid(self):
    """is_valid must be true for the feature vector of 'न'."""
    hindi = IndianSyllabifier('hindi')
    features = hindi.get_phonetic_feature_vector('न', 'hi')
    verdict = hindi.is_valid(features)
    self.assertTrue(verdict)
def test_get_offset(self):
    """get_offset('न', 'hi') must return 40."""
    hindi = IndianSyllabifier('hindi')
    offset = hindi.get_offset('न', 'hi')
    expected = 40
    self.assertEqual(offset, expected)
def test_coordinated_range(self):
    """in_coordinated_range_offset must be truthy for the offset of 'न'."""
    hindi = IndianSyllabifier('hindi')
    offset = hindi.get_offset('न', 'hi')
    # The offset returned by get_offset is fed straight into the range check.
    self.assertTrue(hindi.in_coordinated_range_offset(offset))
def test_is_dependent_vowel(self):
    """is_dependent_vowel must be false for the feature vector of 'न'."""
    hindi = IndianSyllabifier('hindi')
    features = hindi.get_phonetic_feature_vector('न', 'hi')
    verdict = hindi.is_dependent_vowel(features)
    self.assertFalse(verdict)
def test_syllabify(self):
    """orthographic_syllabify must split 'नमस्ते' into ['न', 'म', 'स्ते']."""
    hindi = IndianSyllabifier('hindi')
    syllables = hindi.orthographic_syllabify('नमस्ते')
    expected = ['न', 'म', 'स्ते']
    self.assertEqual(syllables, expected)
def test_is_anusvaar(self):
    """is_anusvaar must be false for the feature vector of 'न'."""
    hindi = IndianSyllabifier('hindi')
    features = hindi.get_phonetic_feature_vector('न', 'hi')
    verdict = hindi.is_anusvaar(features)
    self.assertFalse(verdict)
def test_is_valid(self):
    """is_valid must be true for the feature vector of 'न'."""
    hindi = IndianSyllabifier('hindi')
    features = hindi.get_phonetic_feature_vector('न', 'hi')
    verdict = hindi.is_valid(features)
    self.assertTrue(verdict)
def test_coordinated_range(self):
    """in_coordinated_range_offset must be truthy for the offset of 'न'."""
    hindi = IndianSyllabifier('hindi')
    offset = hindi.get_offset('न', 'hi')
    # The offset returned by get_offset is fed straight into the range check.
    self.assertTrue(hindi.in_coordinated_range_offset(offset))
def test_get_offset(self):
    """get_offset('न', 'hi') must return 40."""
    hindi = IndianSyllabifier('hindi')
    offset = hindi.get_offset('न', 'hi')
    expected = 40
    self.assertEqual(offset, expected)
def test_syllabify(self):
    """orthographic_syllabify must split 'नमस्ते' into ['न', 'म', 'स्ते']."""
    hindi = IndianSyllabifier('hindi')
    syllables = hindi.orthographic_syllabify('नमस्ते')
    expected = ['न', 'म', 'स्ते']
    self.assertEqual(syllables, expected)
from cltk.corpus.sanskrit.itrans.unicode_transliterate import ItransTransliterator from cltk.tokenize.sentence import TokenizeSentence from cltk.stem.sanskrit.indian_syllabifier import Syllabifier lang = "hi" language = "hindi" tokenizer = TokenizeSentence("sanskrit") syl = Syllabifier(language) #List of phonemes that should not be counted as separate diphones while splitting check_phonemes_1 = ["ः", "ऽ", "ङ्\u200d"] check_phonemes_2 = ["\u200c"] #List of characters that should be taken to the left in case they are present to the right while splitting move_left_1 = ['म्', 'र्', 'न्'] #Checking for numbers and purna-viram def check_token(token): flag = True if token == "।": flag = False elif token.isdigit(): flag = False return flag #Checking for splitting position def check_proximity(split, pos, next_token): if len(split) - pos in range(1, 3):
""" @author: sourabh garg """ import itertools import re import words_tagging from cltk.stem.sanskrit.indian_syllabifier import Syllabifier from cltk.corpus.sanskrit.alphabet import * lang = 'hindi' h = Syllabifier(lang) VOWELS = [ INDEPENDENT_VOWELS_SIMPLE, INDEPENDENT_VOWELS_DIPTHONGS, INDEPENDENT_VOWELS ] VOWELS = list(itertools.chain(*VOWELS)) CONSONANTS = [ CONSONANT_GUTTURALS, CONSONANT_PALATALS, CONSONANT_CEREBRALS, CONSONANT_DENTALS, CONSONANT_LABIALS, SEMIVOWEL_CONSONANT, SIBILANT_CONSONANT, SONANT_ASPIRATE ] CONSONANTS = list(itertools.chain(*CONSONANTS)) CONSONANT_HALANTA = [x + '्' for x in CONSONANTS] CONS_TO_CONS = dict(zip(CONSONANT_HALANTA, CONSONANTS)) matraa_to_vowel = { '': 'अ', 'ा': 'आ', 'ि': 'इ', 'ी': 'ई', 'ु': 'उ',