예제 #1
0
파일: test_stem.py 프로젝트: vierth/cltk
 def test_is_dependent_vowel(self):
     """Check that is_dependent_vowel is false for the feature vector of 'न'."""
     hindi = IndianSyllabifier('hindi')
     features = hindi.get_phonetic_feature_vector('न', 'hi')
     verdict = hindi.is_dependent_vowel(features)
     self.assertFalse(verdict)
예제 #2
0
파일: test_stem.py 프로젝트: vierth/cltk
 def test_is_anusvaar(self):
     """Check that is_anusvaar is false for the feature vector of 'न'."""
     hindi = IndianSyllabifier('hindi')
     features = hindi.get_phonetic_feature_vector('न', 'hi')
     verdict = hindi.is_anusvaar(features)
     self.assertFalse(verdict)
예제 #3
0
파일: test_stem.py 프로젝트: vierth/cltk
 def test_is_valid(self):
     """Check that is_valid accepts the feature vector of 'न'."""
     hindi = IndianSyllabifier('hindi')
     features = hindi.get_phonetic_feature_vector('न', 'hi')
     verdict = hindi.is_valid(features)
     self.assertTrue(verdict)
예제 #4
0
파일: test_stem.py 프로젝트: vierth/cltk
 def test_get_offset(self):
     """Check that get_offset returns 40 for 'न' with language code 'hi'."""
     hindi = IndianSyllabifier('hindi')
     self.assertEqual(hindi.get_offset('न', 'hi'), 40)
예제 #5
0
파일: test_stem.py 프로젝트: vierth/cltk
 def test_coordinated_range(self):
     """Check that the offset of 'न' satisfies in_coordinated_range_offset."""
     hindi = IndianSyllabifier('hindi')
     offset = hindi.get_offset('न', 'hi')
     self.assertTrue(hindi.in_coordinated_range_offset(offset))
예제 #6
0
 def test_is_dependent_vowel(self):
     """Verify 'न' is not reported as a dependent vowel by the syllabifier."""
     indic = IndianSyllabifier('hindi')
     vector = indic.get_phonetic_feature_vector('न', 'hi')
     self.assertFalse(
         indic.is_dependent_vowel(vector)
     )
예제 #7
0
파일: test_stem.py 프로젝트: vierth/cltk
 def test_syllabify(self):
     """Check orthographic_syllabify splits 'नमस्ते' into the expected pieces."""
     hindi = IndianSyllabifier('hindi')
     syllables = hindi.orthographic_syllabify('नमस्ते')
     self.assertEqual(syllables, ['न', 'म', 'स्ते'])
예제 #8
0
 def test_is_anusvaar(self):
     """Verify 'न' is not reported as an anusvaar by the syllabifier."""
     indic = IndianSyllabifier('hindi')
     vector = indic.get_phonetic_feature_vector('न', 'hi')
     self.assertFalse(
         indic.is_anusvaar(vector)
     )
예제 #9
0
 def test_is_valid(self):
     """Verify the feature vector of 'न' is reported as valid."""
     indic = IndianSyllabifier('hindi')
     vector = indic.get_phonetic_feature_vector('न', 'hi')
     self.assertTrue(
         indic.is_valid(vector)
     )
예제 #10
0
 def test_coordinated_range(self):
     """Verify in_coordinated_range_offset is true for the offset of 'न'."""
     indic = IndianSyllabifier('hindi')
     in_range = indic.in_coordinated_range_offset(indic.get_offset('न', 'hi'))
     self.assertTrue(in_range)
예제 #11
0
 def test_get_offset(self):
     """Verify get_offset('न', 'hi') yields the expected offset of 40."""
     indic = IndianSyllabifier('hindi')
     offset = indic.get_offset('न', 'hi')
     expected_offset = 40
     self.assertEqual(offset, expected_offset)
예제 #12
0
 def test_syllabify(self):
     """Verify the orthographic syllabification of the word 'नमस्ते'."""
     indic = IndianSyllabifier('hindi')
     expected_syllables = ['न', 'म', 'स्ते']
     self.assertEqual(
         indic.orthographic_syllabify('नमस्ते'),
         expected_syllables,
     )
예제 #13
0
from cltk.corpus.sanskrit.itrans.unicode_transliterate import ItransTransliterator
from cltk.tokenize.sentence import TokenizeSentence
from cltk.stem.sanskrit.indian_syllabifier import Syllabifier

# Language identifiers in two spellings: a short code and a full name.
# NOTE(review): only `language` is used in this excerpt (to build `syl`);
# `lang` is presumably consumed further down the file — confirm.
lang = "hi"
language = "hindi"
# NOTE(review): the sentence tokenizer is created for "sanskrit" while the
# syllabifier is created for "hindi" — confirm this mix is intentional.
tokenizer = TokenizeSentence("sanskrit")
syl = Syllabifier(language)

# Phonemes that should not be counted as separate diphones while splitting.
# NOTE: these literals contain joiner characters (\u200d / \u200c) that are
# invisible in most editors; do not retype them by hand.
check_phonemes_1 = ["ः", "ऽ", "ङ्‍\u200d"]
check_phonemes_2 = ["\u200c"]

# Characters that should be moved to the left-hand piece when they turn up on
# the right-hand side of a split.
move_left_1 = ['म्', 'र्', 'न्']


# Filter out numbers and the purna-viram sentence terminator.
def check_token(token):
    """Tell whether *token* should be kept for further processing.

    Returns False when the token is exactly the purna-viram ("।") or when
    ``token.isdigit()`` is true; returns True for every other token.
    """
    is_purna_viram = token == "।"
    return not (is_purna_viram or token.isdigit())


#Checking for splitting position
def check_proximity(split, pos, next_token):
    if len(split) - pos in range(1, 3):
예제 #14
0
"""
@author: sourabh garg
"""
import itertools
import re
import words_tagging
from cltk.stem.sanskrit.indian_syllabifier import Syllabifier
from cltk.corpus.sanskrit.alphabet import *

lang = 'hindi'
h = Syllabifier(lang)

# Flat list of every independent-vowel character, concatenated from the
# alphabet groups in the order listed.
VOWELS = list(itertools.chain(
    INDEPENDENT_VOWELS_SIMPLE,
    INDEPENDENT_VOWELS_DIPTHONGS,
    INDEPENDENT_VOWELS,
))

# Flat list of every consonant character, concatenated group by group.
CONSONANTS = list(itertools.chain(
    CONSONANT_GUTTURALS,
    CONSONANT_PALATALS,
    CONSONANT_CEREBRALS,
    CONSONANT_DENTALS,
    CONSONANT_LABIALS,
    SEMIVOWEL_CONSONANT,
    SIBILANT_CONSONANT,
    SONANT_ASPIRATE,
))

# Each consonant followed by the '्' sign, in the same order as CONSONANTS.
CONSONANT_HALANTA = [consonant + '्' for consonant in CONSONANTS]

# Lookup from the '्'-suffixed form back to the bare consonant.
CONS_TO_CONS = dict(zip(CONSONANT_HALANTA, CONSONANTS))
matraa_to_vowel = {
    '': 'अ',
    'ा': 'आ',
    'ि': 'इ',
    'ी': 'ई',
    'ु': 'उ',