Python complete_tokenize Examples, Tokenize.complete_tokenize Python Examples

Example #1

0

Show file

File: Sandhi.py Project: codeSG/Lemmatizer

def rule2(first, sec):
    '''
    Special retroflexion rule applied before the two parts are joined.

    Each 'न्' token of ``sec`` is replaced by 'ण्' when, scanning ``first``
    from its end towards its start, a 'र्', 'ष्' or 'ऋ' token is reached
    before any blocking consonant. Tokens listed in ``transparent`` may lie
    in between without blocking the change. Scanning of ``sec`` stops at
    the first consonant token that is not 'न्'.

    Returns a tuple ``(first, sec)`` of the re-joined token sequences.
    '''
    # Tokens that may stand between the trigger and 'न्' without blocking.
    transparent = [
        '', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः', 'ह्',
        'य्', 'व्', 'न्', 'क्', 'ख्', 'ग्', 'घ्', 'ङ्', 'प्', 'फ्', 'ब्', 'भ्',
        'म्'
    ]
    triggers = ('र्', 'ष्', 'ऋ')

    sec_tokens = complete_tokenize(sec)
    first_tokens = complete_tokenize(first)

    for pos, token in enumerate(sec_tokens):
        if token != 'न्':
            if token in consonant:
                # Any other consonant in sec ends the search.
                break
            continue

        # Walk first backwards, i.e. outwards from the junction.
        for prev in reversed(first_tokens):
            if prev in transparent:
                continue
            if prev in triggers:
                sec_tokens[pos] = 'ण्'
                break
            if prev in consonant:
                # A non-transparent consonant blocks the change.
                break

    return join(first_tokens), join(sec_tokens)

Example #2

0

Show file

File: Stemming.py Project: codeSG/Lemmatizer

def stem(word):
    """
    Return the stem of the inflected noun ``word``, or None if none is found.

    The word is tokenized and progressively shorter token prefixes are
    looked up in the module-level trie ``mytrie``. Every trie entry returned
    for a prefix is joined back into a word and checked with
    ``search_noun(word, candidate)``; the first candidate that is accepted
    is returned. Prefixes are tried from the full token list down to a
    single token.
    """
    tokens = complete_tokenize(word)
    if ' ' in word:
        # Vocative forms carry a leading 'हे', which must be removed before
        # searching; the original code drops the first two tokens for it.
        tokens = tokens[2:]

    # Longest prefix first, truncating one token at a time.
    for end in range(len(tokens), 0, -1):
        prefix = tokens[:end]
        for trie_word in mytrie.find(prefix):
            candidate = join(trie_word)
            if search_noun(word, candidate):
                return candidate

    # No noun in the trie has a declension containing this word.
    return None

Example #3

0

Show file

def Declension(word, gender=''):
    '''
    Build the declension table of a noun ``word`` for the given ``gender``.

    Words listed in ``words_tagging.unique`` take their table from a name
    looked up with ``eval(word)``. For every other word the stem class is
    obtained from ``stem_class(word)``, the final token(s) of the word are
    stripped, and each suffix of the 8 x 3 table named by
    ``stem class + gender`` is attached with ``sandhi``.

    NOTE(review): the code visible here never appends ``case`` to ``Dec``
    and has no return statement; the listing looks truncated -- confirm
    against the full source before relying on the result.
    '''
    # Names of the eight cases; not referenced in the visible part of the body.
    cases = [
        'प्रथमा', 'द्वितीया', 'तृतीया', 'चर्तुथी', 'पन्चमी', 'षष्ठी', 'सप्तमी',
        'सम्बोधन'
    ]
    if word in words_tagging.unique:
        # NOTE(review): eval() resolves the word itself as a Python
        # expression; only safe while `word` is restricted to trusted names.
        Dec = eval(word)
    else:
        special_stem = ['as_stem_', 'an_stem_', 'en_stem_']
        stem_c = stem_class(word)
        # Name of the suffix table: stem class followed by the gender string.
        stem_type = stem_c + gender
        word_list = complete_tokenize(word)
        if stem_c in special_stem:
            print("special")
            # Special stem classes lose their last two tokens ...
            word_prefix = word_list[:-2]
        else:
            # ... every other class loses only the last token.
            word_prefix = word_list[:-1]

        word = join(word_prefix)
        # Fall back to the '_1' variant when the plain name is not among
        # the values of words_tagging.dict_noun.
        if stem_type not in words_tagging.dict_noun.values():
            stem_type = stem_type + '_1'
        Dec = []
        # 8 rows by 3 columns; the suffix table is looked up with
        # eval(stem_type) for every cell.
        for row in range(8):
            case = []
            for col in range(3):
                if row == 7:
                    # The last row (index 7) is prefixed with 'हे '.
                    case.append('हे ' + sandhi(word,
                                               eval(stem_type)[row][col]))
                else:
                    case.append(sandhi(word, eval(stem_type)[row][col]))

Example #4

0

Show file

def Declension_noun(word):
    '''
    Build the declension table of a noun ``word`` without asking for a gender.

    Words listed in ``words_tagging.unique`` return the object obtained
    from ``eval(word)``. For every other word the stem class and gender are
    obtained from ``stem_class(word)`` and ``gender(word)``, the final
    token(s) of the word are stripped, and each suffix of the 8 x 3 table
    named by ``stem class + gender`` is attached with ``sandhi``.

    NOTE(review): the code visible here never appends ``case`` to ``decl``
    and has no return statement in the else branch; the listing looks
    truncated -- confirm against the full source.
    '''
    if word in words_tagging.unique:
        # NOTE(review): eval() resolves the word itself as a Python
        # expression; only safe while `word` is restricted to trusted names.
        return eval(word)
    else:
        special_stem = ['as_stem_', 'an_stem_', 'en_stem_']
        stem_c = stem_class(word)
        gen = gender(word)
        # Name of the suffix table: stem class followed by the gender string.
        stem_type = stem_c + gen
        word_list = complete_tokenize(word)
        if stem_c in special_stem:
            # Special stem classes lose their last two tokens ...
            word_prefix = word_list[:-2]
        else:
            # ... every other class loses only the last token.
            word_prefix = word_list[:-1]
        word = join(word_prefix)

        decl = []
        # 8 rows by 3 columns; the suffix table is looked up with
        # eval(stem_type) for every cell.
        for row in range(8):
            case = []
            for col in range(3):
                if row == 7:
                    # The last row (index 7) is prefixed with 'हे '.
                    case.append('हे ' + sandhi(word,
                                               eval(stem_type)[row][col]))
                else:
                    case.append(sandhi(word, eval(stem_type)[row][col]))

Example #5

0

Show file

File: Sandhi.py Project: codeSG/Lemmatizer

def sandhi(first, second):
    """
    Join ``first`` and ``second`` using the applicable sandhi rule.

    ``rule2`` is applied to both parts first. The last token of the
    original ``first`` then selects the joining rule: ``rule1`` when it is
    a consonant, ``rule0`` otherwise.
    """
    # Tokenize before rule2 rebinds `first`; the choice is made on these tokens.
    first_tokens = complete_tokenize(first)
    first, second = rule2(first, second)
    combine = rule1 if first_tokens[-1] in consonant else rule0
    return combine(first, second)

Example #6

0

Show file

File: Sandhi.py Project: codeSG/Lemmatizer

def rule1(first, second):
    '''
    Consonant junction rule: {'स्', 'त्', 'थ्', 'द्', 'ध्', 'न्'} become
    {'श्', 'च्', 'छ्', 'ज्', 'झ्', 'ञ्'} respectively.

    If ``second`` starts with a token of the first set while ``first`` ends
    with a token of the second set, the start of ``second`` is converted.
    In the mirrored situation the end of ``first`` is converted. In every
    other case the two token sequences are concatenated unchanged.
    '''
    to_palatal = {
        'स्': 'श्',
        'त्': 'च्',
        'थ्': 'छ्',
        'द्': 'ज्',
        'ध्': 'झ्',
        'न्': 'ञ्'
    }
    palatals = tuple(to_palatal.values())

    head = complete_tokenize(first)
    tail = complete_tokenize(second)
    last, lead = head[-1], tail[0]

    if lead in to_palatal and last in palatals:
        tail[0] = to_palatal[lead]
    elif lead in palatals and last in to_palatal:
        head[-1] = to_palatal[last]

    return join(head + tail)

Example #7

0

Show file

File: Stemming.py Project: codeSG/Lemmatizer

def initialize():
    """
    Create a trie holding the tokenized form of every noun found in the
    collections of ``words_tagging.all_noun`` and return it.
    """
    trie = Trie.Trie()
    every_noun = (
        noun for stem_cls in words_tagging.all_noun for noun in stem_cls
    )
    for noun in every_noun:
        trie.insert(complete_tokenize(noun))
    return trie

Example #8

0

Show file

File: Sandhi.py Project: codeSG/Lemmatizer

def rule0(first, second):
    """
    Join ``first`` and ``second`` with the vowel sandhi (स्वर सन्धि) rules:
       ( 1.) दीर्घ सन्धि  - like vowels merge into the long vowel
       ( 2.) गुण सन्धि   - अ/आ + इ/ई, उ/ऊ, ऋ, ऌ -> ए, ओ, अर्, अल्
       ( 3.) वृद्धि सन्धि  - अ/आ + ए/ऐ, ओ/औ -> ऐ, औ
       ( 4.) यण् सन्धि   - इ/ई, उ/ऊ, ऋ, ऌ + unlike vowel -> य्, व्, र्, ल् + vowel
       ( 5.) अयादि सन्धि - ए, ऐ, ओ, औ + vowel -> अय्, आय्, अव्, आव् + vowel
    for detailed information visit: https://hi.wikipedia.org/wiki/संधि_(व्याकरण)

    The last token of ``first`` and the first token of ``second`` are
    replaced by the tokens the matching rule produces; when no rule
    matches, both tokens are kept unchanged. The joined word is returned.
    Both arguments must tokenize to at least one token (IndexError
    otherwise).
    """
    a_class = ('अ', 'आ')
    i_class = ('इ', 'ई')
    u_class = ('उ', 'ऊ')
    plain_vowels = ('अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ')

    # Result of a final अ/आ merging with the following vowel
    # (दीर्घ, गुण and वृद्धि sandhi), keyed by that following vowel.
    a_merge = {
        'अ': ['आ'], 'आ': ['आ'],
        'इ': ['ए'], 'ई': ['ए'],
        'उ': ['ओ'], 'ऊ': ['ओ'],
        'ऋ': ['अ', 'र्'],
        'ऌ': ['अ', 'ल्'],
        'ए': ['ऐ'], 'ऐ': ['ऐ'],
        'ओ': ['औ'], 'औ': ['औ'],
    }
    # Tokens that replace the final vowel of `first` in front of a vowel
    # (यण् and अयादि sandhi); the following vowel itself is kept.
    glide = {
        'इ': ['य्'], 'ई': ['य्'],
        # Fixed: उ/ऊ turn into the semivowel 'व्' (previously 'ऊ').
        'उ': ['व्'], 'ऊ': ['व्'],
        'ऋ': ['र्'],
        'ऌ': ['ल्'],
        'ए': ['अ', 'य्'],
        # Fixed: single independent vowel 'आ' (previously 'अ' followed by
        # the vowel sign 'ा', which is not a vowel token).
        'ऐ': ['आ', 'य्'],
        'ओ': ['अ', 'व्'],
        'औ': ['आ', 'व्'],
    }

    f = complete_tokenize(first)
    s = complete_tokenize(second)
    f_last = f.pop()
    s_start = s.pop(0)

    if f_last in a_class and s_start in a_merge:
        add = a_merge[s_start]
    elif f_last in i_class and s_start in i_class:
        add = ['ई']
    elif f_last in u_class and s_start in u_class:
        add = ['ऊ']
    elif f_last == 'ऋ' and s_start == 'ऋ':
        add = ['ऋ']
    elif f_last in glide and s_start in plain_vowels:
        # Like-vowel pairs were already handled above, so only unlike
        # vowels reach this branch.
        add = glide[f_last] + [s_start]
    else:
        # No vowel sandhi applies: keep both boundary tokens as they are.
        add = [f_last, s_start]

    return join(f + add + s)