def removeSuffix(self, word):
    """Strip the longest known suffix from @word.

    Returns (word, removed) where removed is True when a suffix was
    stripped; a replacement suffix from self.replace_suffixes may be
    appended in its place.
    """
    removed = False
    if not self.possible_suffixes:
        # lazy one-time initialization of the suffix tables
        self.setSuffixes()
        self.prepareSuffixes()
    word_lett = utf8.get_letters(word)
    rword_lett = copy.copy(word_lett)
    rword_lett.reverse()
    rword = u"".join(rword_lett)
    # scan for the longest matching reversed suffix (later ties win)
    longest_match = ""
    for suffix in self.reversed_suffixes:
        if rword.startswith(suffix) and len(longest_match) <= len(suffix):
            longest_match = suffix
    if len(longest_match) > 0:
        removed = True
        # pop the matched letters off the tail of the word
        sfx = []
        for _ in range(len(utf8.get_letters(longest_match))):
            sfx.append(word_lett.pop())
        word = u"".join(word_lett)
        sfx.reverse()
        sfx = u"".join(sfx)
        # rule to replace suffix
        alt_suffix = self.replace_suffixes.get(sfx, None)
        if alt_suffix:
            word = word + alt_suffix
    return word, removed
def test_reverse_words(self):
    """unittest for reverse a Tamil string"""
    # FIX: print-statement syntax was Python-2 only; print() works on Py3
    print(utf8.get_letters(u"இந்த"))
    print(u"".join(utf8.get_letters(u"இந்த")))
    for word in u"இந்த (C) tamil முத்தையா அண்ணாமலை 2013 இந்த ஒரு எழில் தமிழ் நிரலாக்க மொழி உதாரணம்".split():
        rword = utf8.reverse_word(word)
        print(word, rword)
        # first letter of the reversed word must equal the last of the original
        self.assertTrue(utf8.get_letters(rword)[0] == utf8.get_letters(word)[-1])
    return
def test_istamil(self):
    """istamil() must hold for every letter of pure-Tamil words only."""
    zz = u"முத்தையா அண்ணாமலை எந்த ஒரு தெரிந்த அல்லது தெரியாத எழுத்துருவாகவிருந்தாலும் அதனை மேல்தட்டில் உள்ளிட்டு கீழே உள்ள முடியும்"
    for z in zz.split(u" "):
        print("********** t/f ********")
        for x, y in zip(map(utf8.istamil, utf8.get_letters(z)), utf8.get_letters(z)):
            print("%s => %s" % (y, x))
        assert all(map(utf8.istamil, utf8.get_letters(z)))
    z = u"முத்தையா அண்ணாமலை"
    assert any(map(utf8.istamil, utf8.get_letters(z)))
    correct = [True, True, True, True, False, True, True, True, True, True,
               False, False, False, False, False]
    # FIX: in Python 3 a bare map object never equals a list, so the original
    # assertion could never pass; materialize with list() (as sibling test does)
    assert list(map(utf8.istamil, utf8.get_letters(u"முத்தையா அண்ணாமலை 2013"))) == correct
def test_entity(self):
    """WordEntity stores the word, its letters and its (row, col) position."""
    word = u"nuthin"
    entity = WordEntity(word, row=5, col=6)
    self.assertTrue(entity.isWord())
    self.assertEqual((entity.row, entity.col), (5, 6))
    self.assertEqual(entity.word, word)
    self.assertEqual(entity.letters, utf8.get_letters(u"nuthin"))
def test_classifier(self):
    """classify_letter buckets english/digit and Tamil letter classes."""
    expected = (['english'] * 3
                + ['digit'] * 4
                + ['kuril', 'nedil', 'uyirmei', 'vallinam', 'uyirmei'])
    data = [utf8.classify_letter(letter)
            for letter in utf8.get_letters(u"abc1230அஆரெட்டை")]
    self.assertEqual(data, expected)
def norvig_suggestor(word, alphabets=None, nedits=1, limit=float("inf")):
    """Generate Norvig-style spelling alternates of @word at edit distance @nedits.

    @word may be a string or a pre-split list of letters; @alphabets defaults
    to the Tamil alphabet. Returns a set of candidate strings; @limit loosely
    caps the recursive expansion.
    """
    if not alphabets:
        alphabets = tamil_letters
    if not type(word) is list:
        wordL = get_letters(word)
    else:
        wordL = word
    # recursive method for edit distance > 1
    if nedits > 1:
        result = []
        for nAlternate in norvig_suggestor(wordL, alphabets, nedits - 1, limit - len(result)):
            if len(result) > limit:
                break
            result.extend(norvig_suggestor(nAlternate, alphabets, 1, limit - len(result)))
        return set(result)
    # FIX: splits must be (word[:idx], word[idx:]); the previous word[:idx-1]
    # overlapped/dropped a letter, so e.g. deletes removed two letters and
    # even contained the original word itself.
    ta_splits = [[u"".join(wordL[:idx]), u"".join(wordL[idx:])]
                 for idx in range(len(wordL) + 1)]
    ta_deletes = [a + b[1:] for a, b in ta_splits if b]
    ta_transposes = [a + b[1] + b[0] + b[2:] for a, b in ta_splits if len(b) > 1]
    ta_replaces = [a + c + b[1:] for a, b in ta_splits for c in alphabets]
    ta_replaces2 = [c + b for a, b in ta_splits for c in alphabets]
    ta_inserts = [a + c + b for a, b in ta_splits for c in alphabets]
    # TODO: add a normalizing pass over words in vowel+consonant forms to eliminate dangling ligatures
    return set(ta_deletes + ta_transposes + ta_replaces + ta_replaces2 + ta_inserts)
def test_letter_extract_from_code_pts(self):
    """The sample phrase splits into 16 logical letters ending in ர்."""
    letters = utf8.get_letters(u"கூவிளம் என்பது என்ன சீர்")
    # FIX: was 15; the same phrase yields 16 letters (4+1+4+1+3+1+2,
    # spaces included), as the sibling yield/extract tests assert.
    assert len(letters) == 16
    for pos, letter in enumerate(letters):
        print(u"%d %s" % (pos, letter))
    # after the loop, `letter` holds the final extracted letter
    assert letter == u"ர்"
def test_letter_extract_with_ascii(self):
    """Mixed Tamil/ASCII phrase extracts per-letter, spaces included."""
    letters = utf8.get_letters(u"கூவிளம் is என்பது also என்ன a சீர்")
    # FIX: Python-2 print statement -> print(); and the phrase has 26
    # letters (20 letters + 6 spaces), matching the sibling test.
    print("len ==== > ", len(letters))
    assert len(letters) == 26
    for pos, letter in enumerate(letters):
        print(u"%d %s" % (pos, letter))
    assert letters[-4] == u"a"
def test_words_to_letters(self):
    """Per-word letter counts for a Tamil sentence."""
    k1 = u"இந்தக் குளிர்ல டெய்லி தலைக்கு குளிக்கற நல்லவங்க இருக்கறதாலதான் கோவை இப்படி சூப்பரா இருக்காம்"
    word_length = [4, 4, 3, 4, 5, 6, 9, 2, 4, 4, 5]
    for idx, kk in enumerate(k1.split(' ')):
        n_letters = len(get_letters(kk))
        print('w# ', idx, n_letters)
        self.assertEqual(word_length[idx], n_letters)
def getWordCount(self, word):
    """Return the stored occurrence count for @word; raise if absent."""
    isWord, ref_trie = self.isWord(word, ret_ref_trie=True)
    if not isWord:
        raise Exception(u"Word does not exist in Trie")
    letters = utf8.get_letters(word)
    # count is keyed on the word's final letter in the parent node
    return ref_trie.count[letters[-1]]
def get(word):
    """Build a Feature vector of letter-class statistics for @word.

    Counts each letter class, then normalizes every counter by the
    total letter count; also scores first/last letter categories.
    """
    word = word.strip()
    word = word.replace(u' ', u'')
    letters = utf8.get_letters(word)
    F = Feature()
    F.nletters = len(letters) * 1.0
    F.unigscore = unigram_score(letters)
    F.bigscore = max(bigram_scores(letters))
    for l in letters:
        try:
            rtl = reverse_transliterate(l)
            # FIX: comprehension variable renamed — it previously reused `l`,
            # which on Python 2 leaks and clobbers the loop letter before
            # classify_letter() below sees it.
            if any([rtl.startswith(v) for v in ['a', 'e', 'i', 'o', 'u']]):
                F.vowels += 1.0
        except Exception as ioe:
            # best effort: letters without a transliteration are skipped
            pass
        kind = utf8.classify_letter(l)
        if kind == 'kuril':
            F.kurils += 1
        elif kind == 'nedil':
            F.nedils += 1
        elif kind == 'ayudham':
            F.ayudhams += 1
        elif kind == 'vallinam':
            F.vallinams += 1
        elif kind == 'mellinam':
            F.mellinams += 1
        elif kind == 'idayinam':
            F.idayinams += 1
        elif kind in ['english', 'digit']:
            continue
        elif kind == 'tamil_or_grantham':
            F.granthams += 1
    F.kurils /= F.nletters
    F.nedils /= F.nletters
    F.ayudhams /= F.nletters
    # FIX: vallinams was divided by nletters TWICE, skewing that feature
    F.vallinams /= F.nletters
    F.mellinams /= F.nletters
    F.idayinams /= F.nletters
    F.granthams /= F.nletters
    F.vowels /= F.nletters
    if letters[0] in utf8.uyir_letters:
        F.first += 1.0
    if letters[0] in utf8.mei_letters:
        F.first += F.first + 0.25
    if letters[0] in utf8.uyirmei_letters:
        F.first += F.first + 0.05
    if letters[-1] in utf8.uyir_letters:
        F.last += 1.0
    if letters[-1] in utf8.mei_letters:
        F.last += F.last + 0.25
    if letters[-1] in utf8.uyirmei_letters:
        F.last += F.last + 0.05
    return F
def anagram(request, word):
    """Django view: JSON list of anagrams of @word from the TVU dictionary."""
    AllTrueDictionary = wordutils.DictionaryWithPredicate(lambda x: True)
    TVU, TVU_size = DictionaryBuilder.create(TamilVU)
    length = len(utf8.get_letters(word))
    actual = list(wordutils.anagrams(word, TVU))
    json_string = json.dumps(actual, ensure_ascii=False)
    # Response object carries the content type and the encoding
    response = HttpResponse(json_string, content_type="application/json; charset=utf-8")
    return response
def getAllWordsPrefix(self, prefix):
    """Broken prefix-enumeration stub; always raises before doing work."""
    raise Exception("NOT IMPLEMENTED RIGHT")
    # --- unreachable legacy code kept for reference ---
    all_words = []
    val, ref_trie, ref_word_limits = self.isWord(prefix, ret_ref_trie=True)
    if val:  # ignore val
        all_words.append(prefix)
    prefix_letters = utf8.get_letters(prefix)
    self.getAllWordsHelper(ref_trie, ref_word_limits, prefix_letters, all_words)
    return all_words
def test_tamil_only_words(self):
    """get_tamil_words drops the embedded English words."""
    s = u"உடனே உடனே seventh heaven எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
    words = s.replace(u"seventh heaven ", u"").split(u" ")
    outWords = utf8.get_tamil_words(utf8.get_letters(s))
    if LINUX:
        print(u"|".join(words))
        print(u"|".join(outWords))
    self.assertEqual(outWords, words)
def test_words(self):
    """get_words with tamil_only=False keeps every token."""
    _str = u"உடனே random elevator jazz உடனே எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
    words = _str.split(u" ")
    outWords = utf8.get_words(utf8.get_letters(_str), tamil_only=False)
    if LINUX:
        print(u"|".join(words))
        print(u"|".join(outWords))
    self.assertEqual(outWords, words)
def xkcd():
    """Smoke-test RemoveCaseSuffix on a few case-suffixed words."""
    obj = RemoveCaseSuffix()
    expected = [u"பதிவிற்", u"கட்டளைக", u"அவர்"]
    words_list = [u"பதிவிற்க்கு", u"கட்டளைகளை", u"அவர்கள்"]
    for w, x in zip(words_list, expected):
        trimmed, did_remove = obj.removeSuffix(w)
        assert did_remove
        print(utf8.get_letters(w), '->', did_remove)
        assert trimmed == x
    return
def keech(request, k1):
    """Django view: JSON map of word index -> letter count for @k1."""
    dic = {}
    for idx, kk in enumerate(k1.split(' ')):
        dic[idx] = len(get_letters(kk))
    json_string = json.dumps(dic, ensure_ascii=False)
    # Response object carries the content type and the encoding
    response = HttpResponse(json_string, content_type="application/json; charset=utf-8")
    return response
def getAllWordsPrefix(self, prefix):
    """Return all stored words beginning with @prefix."""
    all_words = []
    val, curr_trie = self.isWord(prefix, ret_ref_trie=True)
    prefix_letters = utf8.get_letters(prefix)
    # descend to the node for the final prefix letter (fall back to current)
    ref_trie = curr_trie.alphabets.get(prefix_letters[-1], curr_trie)
    if val:  # the prefix itself is a complete word
        all_words.append(prefix)
    self.getAllWordsHelper(ref_trie, prefix_letters, all_words=all_words)
    return all_words
def test_letter_extract_yield_with_ascii(self):
    """Iterator API agrees with get_letters on mixed Tamil/ASCII text."""
    ta_str = u"கூவிளம் is என்பது also என்ன a சீர்"
    letters = [l for l in utf8.get_letters_iterable(ta_str)]
    act_letters = utf8.get_letters(ta_str)
    print("len ==== > ", len(letters), "get_letters CALL = ", len(act_letters))
    assert len(letters) == len(act_letters)
    for pos, letter in enumerate(letters):
        if LINUX:
            print(u"%d %s" % (pos, letter))
    self.assertEqual(letters[-4], u"a")
def test_tamil_only_words(self):
    """get_tamil_words drops the embedded English words."""
    string = u"உடனே உடனே seventh heaven எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
    words = string.replace(u"seventh heaven ", u"").split(u" ")
    letters = utf8.get_letters(string)
    outWords = utf8.get_tamil_words(letters)
    # FIX: Python-2 print statement; print(single_arg) is valid on Py2 and Py3
    print(u"|".join(words))
    print(u"|".join(outWords))
    assert outWords == words
def test_words(self):
    """get_words keeps every token (Tamil and English)."""
    string = u"உடனே random elevator jazz உடனே எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
    words = string.split(u" ")
    letters = utf8.get_letters(string)
    outWords = utf8.get_words(letters)
    # FIX: Python-2 print statement; print(single_arg) is valid on Py2 and Py3
    print(u"|".join(words))
    print(u"|".join(outWords))
    assert outWords == words
def xkcd():
    """Smoke-test RemovePluralSuffix directly and via CaseFilter."""
    obj = RemovePluralSuffix()
    objf = CaseFilter(obj)
    expected = [u"பதிவி", u"கட்டளை", u"அவர்", u"பள்ளி"]
    words_list = [u"பதிவில்", u"கட்டளைகள்", u"அவர்கள்", u"பள்ளிகள்"]
    for w, x in zip(words_list, expected):
        rval = obj.removeSuffix(w)
        # the filter wrapper must agree with the direct call
        assert objf.apply(w) == rval[0]
        assert rval[1]
        print(utf8.get_letters(w), "->", rval[1])
        assert rval[0] == x
    return
def test_letter_extract_yield(self):
    """get_letters_iterable yields the same 16 letters as get_letters."""
    ta_str = u"கூவிளம் என்பது என்ன சீர்"
    act_letters = utf8.get_letters(ta_str)
    letters = [l for l in utf8.get_letters_iterable(ta_str)]
    print("len ==== > ", len(letters))
    assert len(letters) == 16
    print("len ==== > ", len(letters), "get_letters CALL = ", len(act_letters))
    assert len(letters) == len(act_letters)
    for pos, letter in enumerate(letters):
        if LINUX:
            print(u"%d %s" % (pos, letter))
    # after the loop, `letter` holds the last extracted letter
    assert letter == u"ர்"
def test_all_valid(self):
    """Every word in the Project Madurai sample passes the BadIME check."""
    DEBUG = False
    with codecs.open("data/project_madurai_utf8.txt", "r", "utf-8") as f:
        # keep only non-trivial lines
        data = filter(lambda x: len(x) > 2, f.readlines())
    obj = BadIME()
    for idx, line in enumerate(data):
        for col, word in enumerate(re.split(u'\s+', line)):
            if DEBUG:
                print(idx, col)
                print(utf8.get_letters(word))
            self.assertEqual(obj.apply(word), (True, None))
def test_ngram(request, ng):
    """Django view: JSON of bigram counts over @ng (whitespace removed)."""
    obj = DTrie()
    prev_letter = u''
    # per-line processor - remove spaces
    for char in get_letters(u"".join(re.split('\s+', ng)).lower()):
        # count a bigram only when both letters share a script
        if (prev_letter.isalpha() and char.isalpha()) or (
                utf8.is_tamil_unicode(prev_letter) and utf8.is_tamil_unicode(char)):
            obj.add(u"".join([prev_letter, char]))
        prev_letter = char  # update previous
    actual = obj.getAllWordsAndCount()
    json_string = json.dumps(actual, ensure_ascii=False)
    # Response object carries the content type and the encoding
    response = HttpResponse(json_string, content_type="application/json; charset=utf-8")
    return response
def isWordAndTrie(self, word):
    """Return (is_word, trie node owning the last matched letter)."""
    ref_trie = self.trie
    rval = False
    prev_trie = None
    for letter in utf8.get_letters(word):
        rval = ref_trie.is_word.get(letter, False)
        prev_trie = ref_trie
        ref_trie = ref_trie.alphabets.get(letter, None)
        if not ref_trie:
            # path breaks here; rval reflects the last reachable letter
            break
    return rval, prev_trie
def isWord(self, word):
    """Check whether @word is present in the current Trie."""
    letters = utf8.get_letters(word)
    last = len(letters) - 1
    ref_trie = self.trie
    ref_word_limits = self.word_limits
    for itr, letter in enumerate(letters):
        idx = self.getidx(letter)
        if itr == last:
            break
        if not ref_trie[idx][1]:
            # this branch of the Trie did not exist
            return False
        ref_trie = ref_trie[idx][1]
        ref_word_limits = ref_word_limits[idx][1]
    return ref_word_limits[idx][0]
def map_to_braille(tamil_string):
    """Map a Tamil string to its Braille cell sequence."""
    result = []
    for letter in get_letters(tamil_string):
        if letter in grantha_mei_letters:
            # mei: base agaram cell followed by the pulli cell
            agaram = grantha_agaram_letters[grantha_mei_letters.index(letter)]
            result.append(table[agaram])
            result.append(table[pulli_symbols[0]])
        elif letter in uyir_letters or letter == ayudha_letter:
            result.append(table[letter])
        else:
            # uyirmei: agaram cell plus the uyir cell (inherent 'அ' omitted)
            lMei, lUyir = splitMeiUyir(letter)
            agaram = grantha_agaram_letters[grantha_mei_letters.index(lMei)]
            result.append(table[agaram])
            if lUyir != 'அ':
                result.append(table[lUyir])
    return result
def main():
    """Plot the distribution of per-kural maaththirai totals with a Gaussian fit."""
    eq = Counter()
    eqd = {}
    kural = Thirukkural()
    for kural_no in range(1330):
        kural_words = get_tamil_words(get_letters(kural.get_kural_no(kural_no + 1).ta))
        mathirai = sum(total_maaththirai(word) for word in kural_words)
        # record which kurals share this maaththirai total
        if eq[mathirai] == 0:
            eqd[mathirai] = [kural_no + 1]
        else:
            eqd[mathirai].append(kural_no + 1)
        eq[mathirai] += 1
    eq_sorted = OrderedDict(sorted(eq.items(), key=lambda x: x))
    pprint(eq_sorted)
    pprint(eq_sorted.values())
    pprint(eqd)
    print("total = ", sum(eq.values()))
    plt.scatter(eq_sorted.keys(), eq_sorted.values())
    plt.ylabel(u"குறட்பாக்கள் எண்ணிக்கை", {"fontname": "Catamaran"})
    plt.xlabel(u"மாத்திரை அளவு", {"fontname": "Catamaran"})  # Arial Unicode MS'})
    # p0 is the initial guess for the fitting coefficients (A, mu and sigma above)
    p0 = [75.0, 20.0, 5.0]
    coeff, var_matrix = curve_fit(
        gauss, list(eq_sorted.keys()), list(eq_sorted.values()), p0=p0
    )
    # Get the fitted curve
    hist_fit = gauss(list(eq_sorted.keys()), *coeff)
    plt.plot(
        eq_sorted.keys(),
        hist_fit,
        label="Gaussian Fitted data (mean=%g, std=%g)" % (coeff[1], coeff[2]),
    )
    plt.title(
        r"குறள் மாத்திரை வரிசை (Gauss \mu=%g, \sigma=%g)" % (coeff[1], coeff[2]),
        {"fontname": "Catamaran"},
    )
    # Finally, lets get the fitting parameters, i.e. the mean and standard deviation:
    print("Fitted mean = ", coeff[1])
    print("Fitted standard deviation = ", coeff[2])
    plt.show()
def isWord(self, word, ret_ref_trie=False):
    """Check membership of @word; optionally also return internal trie refs."""
    letters = utf8.get_letters(word)
    last = len(letters) - 1
    ref_trie = self.trie
    ref_word_limits = self.word_limits
    for itr, letter in enumerate(letters):
        idx = self.getidx(letter)
        if itr == last:
            break
        if not ref_trie[idx][1]:
            # this branch of the Trie did not exist
            return False
        ref_trie = ref_trie[idx][1]
        ref_word_limits = ref_word_limits[idx][1]
    if ret_ref_trie:
        return ref_word_limits[idx][0], ref_trie, ref_word_limits
    return ref_word_limits[idx][0]
def test_istamil(self):
    """istamil() must hold for every letter of pure-Tamil words only."""
    zz = u"முத்தையா அண்ணாமலை எந்த ஒரு தெரிந்த அல்லது தெரியாத எழுத்துருவாகவிருந்தாலும் அதனை மேல்தட்டில் உள்ளிட்டு கீழே உள்ள முடியும்"
    for z in zz.split(u" "):
        print("********** t/f ********")
        for x, y in zip(map(utf8.istamil, utf8.get_letters(z)), utf8.get_letters(z)):
            if LINUX:
                print(u"%s => %s" % (y, x))
        assert all(map(utf8.istamil, utf8.get_letters(z)))
    z = u"முத்தையா அண்ணாமலை"
    assert any(map(utf8.istamil, utf8.get_letters(z)))
    correct = [True, True, True, True, False, True, True, True, True, True,
               False, False, False, False, False]
    flags = list(map(utf8.istamil, utf8.get_letters(u"முத்தையா அண்ணாமலை 2013")))
    print(flags)
    print(correct)
    assert flags == correct
def add(self, word):
    """Insert @word into the trie, marking and counting its final letter."""
    letters = utf8.get_letters(word)
    assert len(letters) >= 1
    node = self.trie
    parent = None
    for letter in letters:
        child = node.alphabets.get(letter, None)
        parent = node
        if not child:
            # grow a fresh branch for this letter
            node.alphabets[letter] = Node()
            node.is_word[letter] = False
            node.count[letter] = 0
            node = node.alphabets[letter]
        else:
            node = child
    # `letter` and `parent` refer to the word's last letter and its owner node
    parent.is_word[letter] = True
    parent.count[letter] += 1
    return
def add(self, word):
    """Insert @word into the trie; the last letter is flagged and counted."""
    letters = utf8.get_letters(word)
    assert len(letters) >= 1
    ref_trie = self.trie
    prev_trie = None
    for letter in letters:
        existing = ref_trie.alphabets.get(letter, None)
        prev_trie = ref_trie
        if not existing:
            # allocate the branch on first sight of this letter
            ref_trie.alphabets[letter] = Node()
            ref_trie.is_word[letter] = False
            ref_trie.count[letter] = 0
            ref_trie = ref_trie.alphabets[letter]
        else:
            ref_trie = existing
    # mark word end on the node that owns the final letter
    prev_trie.is_word[letter] = True
    prev_trie.count[letter] += 1
    return
def add(self, word):
    """Insert @word into the fixed-alphabet trie; unknown letters are skipped."""
    letters = utf8.get_letters(word)
    last = len(letters) - 1
    ref_trie = self.trie
    ref_word_limits = self.word_limits
    for itr, letter in enumerate(letters):
        try:
            idx = self.getidx(letter)
        except Exception:
            # letter outside the alphabet: skip it
            continue
        ref_trie[idx][0] = True
        if itr == last:
            break
        if not ref_trie[idx][1]:
            ref_trie[idx][1] = Trie.mk_empty_trie(self.L)
            ref_word_limits[idx][1] = Trie.mk_empty_trie(self.L)
        ref_trie = ref_trie[idx][1]
        ref_word_limits = ref_word_limits[idx][1]
    # mark the terminal position as a complete word
    ref_word_limits[idx][0] = True
def add(self, word):
    """Build the trie path for @word, skipping letters outside the alphabet."""
    letters = utf8.get_letters(word)
    wLen = len(letters)
    ref_trie = self.trie
    ref_word_limits = self.word_limits
    for itr, letter in enumerate(letters):
        try:
            idx = self.getidx(letter)
        except Exception:
            continue  # unknown letter: ignore and keep walking
        ref_trie[idx][0] = True
        if itr == (wLen - 1):
            break
        if not ref_trie[idx][1]:
            # materialize the child tries lazily
            ref_trie[idx][1] = Trie.mk_empty_trie(self.L)
            ref_word_limits[idx][1] = Trie.mk_empty_trie(self.L)
        ref_trie = ref_trie[idx][1]
        ref_word_limits = ref_word_limits[idx][1]
    # the final reachable position is a word boundary
    ref_word_limits[idx][0] = True
def isWordAndTrie(self, word, prefix=False):
    """Membership test; with prefix=True check @word is a full prefix path."""
    ref_trie = self.trie
    letters = utf8.get_letters(word)
    rval = False
    prev_trie = None
    for idx, letter in enumerate(letters):
        rval = ref_trie.is_word.get(letter, False)
        prev_trie = ref_trie
        ref_trie = ref_trie.alphabets.get(letter, None)
        if not ref_trie:
            break
    if prefix:
        if idx < (len(letters) - 1):
            return False  # walk stopped before consuming the whole prefix
        if not ref_trie:
            return False
        return True
    return rval, prev_trie
def setUp(self):
    """Prepare dictionaries and the sample word for the tests."""
    self.word = u"சவால்"
    self.length = len(utf8.get_letters(self.word))
    self.AllTrueDictionary = wordutils.DictionaryWithPredicate(lambda x: True)
    self.TVU, self.TVU_size = DictionaryBuilder.create(TamilVU)
def fun(e):
    """Length of @e measured in Tamil letters."""
    letters = utf8.get_letters(e)
    return len(letters)
def test_tamil_only_words(self):
    """English words must be filtered out by get_tamil_words."""
    src = u"உடனே உடனே seventh heaven எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
    expected = src.replace(u"seventh heaven ", u"").split(u" ")
    self.assertEqual(utf8.get_tamil_words(utf8.get_letters(src)), expected)
def next_tamil_letter(self):
    """Generator over the Tamil letters of self.filename (read as UTF-8).

    Non-Tamil letters are skipped. The file handle is kept on
    self.handle as before (presumably reused elsewhere — TODO confirm).
    """
    self.handle = codecs.open(self.filename, 'r', 'utf-8')
    for letter in utf8.get_letters(self.handle.read()):
        if utf8.istamil(letter):
            yield letter
    # FIX: `raise StopIteration` inside a generator becomes RuntimeError
    # under PEP 479 (Python 3.7+); simply returning ends the generator.
    return
def கடையெழுத்து(சொல்):
    """Return the last letter of the given word."""
    letters = tamilutf8.get_letters(சொல்)
    return letters[-1]
def test_word_length(self):
    """மென்பொருள் splits into exactly 5 logical letters."""
    letters = utf8.get_letters(u"மென்பொருள்")
    self.assertEqual(5, len(letters))
def test_letter_extract_from_code_pts(self):
    """The sample phrase splits into 16 letters, the last being ர்."""
    phrase = utf8.get_letters(u"கூவிளம் என்பது என்ன சீர்")
    assert len(phrase) == 16
    assert phrase[-1] == u"ர்"
def count_letter(self):
    """Accumulate the letter count of the current line into the toll."""
    line_letters = tamil.get_letters(self.line)
    self.letter_toll += len(line_letters)
def removePrefix(self, word):
    """Strip a prefix: reverse, delegate to removeSuffix, reverse back."""
    rev = utf8.get_letters(word)
    rev.reverse()
    stripped, removed = self.removeSuffix(u"".join(rev))
    return [utf8.reverse_word(stripped), removed]
#!/bin/env python3
# Count the fraction of Tamil vs English words in kuttistory.txt.
from codecs import open
from tamil import utf8
import re

with open("kuttistory.txt", "r", "utf-8") as fp:
    data = fp.readlines()


class Stats:
    # NOTE(review): '__fields__' has no special meaning in Python —
    # possibly '__slots__' was intended; behavior is unaffected either way.
    __fields__ = ("total_words", "tamil_words")


stats = Stats()
stats.total_words = 0.0
stats.tamil_words = 0.0
for line in data:
    all_words = re.split("\s+", line.strip())
    ta_words = list(utf8.get_tamil_words(utf8.get_letters(line)))
    print((all_words, len(ta_words)))
    stats.tamil_words += len(ta_words)
    stats.total_words += len(all_words)

# tamil word fraction
taf = float(stats.tamil_words) / stats.total_words
print(("English = {0}%, Tamil = {1}%".format(100.0 * (1 - taf), 100.0 * (taf))))
def get_letters(word):
    """Return @word as a list of letters; an already-split list passes through."""
    if isinstance(word, list):
        return word
    return utf8.get_letters(word)
def __init__(self, word, flagged=False, **kwargs):
    """Entity around @word with its letter split and a review flag."""
    super(Entity, self).__init__(**kwargs)
    self.word = word
    self.flagged = flagged
    self.letters = utf8.get_letters(word)
def முதலெழுத்து(சொல்):
    """Return the first letter of the given word."""
    letters = tamilutf8.get_letters(சொல்)
    return letters[0]
def test_letter_extract_with_ascii(self):
    """Mixed Tamil/ASCII phrase: 26 letters; 4th from last is 'a'."""
    phrase = utf8.get_letters(u"கூவிளம் is என்பது also என்ன a சீர்")
    assert len(phrase) == 26
    assert phrase[-4] == u"a"
def test_shamikshu(self):
    """Grantha-cluster word passes istamil and istamil_alnum per letter."""
    word_letters = utf8.get_letters(u"க்ஷமிக்ஷூ")
    self.assertTrue(all(map(utf8.istamil, word_letters)))
    self.assertTrue(all(map(utf8.istamil_alnum, word_letters)))
def setUp(self):
    """Build the test dictionaries and cache the sample word length."""
    self.AllTrueDictionary = wordutils.DictionaryWithPredicate(lambda x: True)
    self.TVU, self.TVU_size = DictionaryBuilder.create(TamilVU)
    # sample word used across the test methods
    self.word = u"சவால்"
    self.length = len(utf8.get_letters(self.word))
def test_odd_case(self):
    """Truly malformed inputs get mangled (not normalized) by get_letters."""
    self.assertEqual(utf8.get_letters(u"ஆாள்"), [u"ஆா", u"ள்"])
    self.assertEqual(utf8.get_letters(u"ஆள்்ஆ"), [u"ஆ", u"ள்்", u"ஆ"])
def get_letters_impl(self, word):
    """Split @word into letters: char-wise for English, utf8-aware otherwise.

    FIX: the previous `cond and a or b` idiom fell through to the Tamil
    branch whenever the English split was falsy (i.e. an empty word),
    so an empty English string was mis-routed; use a real conditional.
    """
    if self.is_english:
        return [l for l in word]
    return utf8.get_letters(word)
def test_word_no2_length(self):
    """A regex-like string with Tamil code points splits into 6 letters."""
    letters = utf8.get_letters(u'[\u0baa-\u0baa\u0bcc]+')
    self.assertEqual(6, len(letters))
def எழுத்தாக்கு(சொல்):
    """Split the given word into its logical letters."""
    letters = tamilutf8.get_letters(சொல்)
    return letters
def test_get_letters2(self):
    """Mixed English/Tamil phrase: 27 letters; index 13 is தெ."""
    letters = utf8.get_letters(u"hello world தெரிந்த அல்லது தெரியாத")
    assert len(letters) == 27
    self.assertTrue(letters[13] == u"தெ")
def tamil2eng(text):
    """Transliterate @text via tam2eng_map, passing unmapped chars through."""
    mapped = [tam2eng_map.get(char, char) for char in utf8.get_letters(text)]
    return ''.join(mapped)
a = a.replace(',', '', 10000000000) a = a.replace('?', '', 10000000000) a = a.replace('ஏற்றப்படுகின்றது', '', 10000000000) if (utf8.all_tamil(a)): if a not in new: new.append(a) fil.write(a) fil.write('\n') #f.write(a) #f.write("\n") else: a = '' cnt = count() final = sorted(new, key=lambda w: (len(utf8.get_letters(w)), next(cnt)), reverse=True)[:10] print(final) URL = href['href'] for fa in final: f.write(fa) f.write('\t\t') f.write(str(len(utf8.get_letters(fa)))) f.write('\n') fil2.write('\n') fil2.write(href['href']) f.write('\n') c = c + 1 f.close() winsound.Beep(2000, 1000)
def joinWords(word_a, word_b):
    """Join two Tamil words applying sandhi (punarchi) conjunction rules.

    Falls back to plain space-joining when no rule applies.
    FIX: `rule` may be None when no rule exists for the first word's final
    uyir (the later branches already guard `if rule:`), and the same-words
    rule list may be empty — both previously crashed; they are now guarded.
    """
    word_a = word_a.strip()
    word_b = word_b.strip()
    # get readable letters of first word
    first_word_letters = get_letters(word_a)
    if first_word_letters[-1] in mei_letters:
        # first word last char is mei letter. so just return as it is.
        # todo : apply special conditions also
        rval = word_a + " " + word_b
        return rval
    # get mei & uyir characters of first word's last char
    first_word_last_chars = splitMeiUyir(first_word_letters[-1])
    if len(first_word_last_chars) == 2:
        first_word_last_mei_char, first_word_last_uyir_char = first_word_last_chars
    else:
        first_word_last_mei_char, first_word_last_uyir_char = (
            first_word_last_chars[0],
            first_word_last_chars[0],
        )
    # get rule sub dictionary from all dictionary (may be None)
    rule = all_rules.get(first_word_last_uyir_char, None)
    if word_a == word_b:
        # both input words are same
        same_word_rule = rule.get("same_words", []) if rule else []
        if same_word_rule and word_a in same_word_rule[0]:
            # get conjuction char and insert it between the input words
            jn = same_word_rule[1]
            rval = first_word_letters[0] + jn + word_b
            return rval
        elif len(first_word_letters) == 3:
            # both words are same but length is 3
            disappear_lastchar = (
                rule.get("same_word_disappear_lastchar", []) if rule else []
            )
            if disappear_lastchar:
                disappear_lastchar = disappear_lastchar[0]
                if first_word_last_uyir_char == disappear_lastchar:
                    first_word_first_char = first_word_letters[0]
                    # get uyir char of second word's first char
                    first_word_first_uyir_char = splitMeiUyir(first_word_first_char)[-1]
                    # conjuction char: first word's last mei + first uyir
                    jn = joinMeiUyir(first_word_last_mei_char, first_word_first_uyir_char)
                    # first word till pre-last char
                    first_word = u"".join(first_word_letters[:-1])
                    # second word from second char till end
                    second_word = u"".join(first_word_letters[1:])
                    # join first word, conjuction, second word
                    rval = first_word + jn + second_word
                    return rval
    if rule:
        if word_a in rule.get("first_solo_words", []):
            # todo : fine-tune this first-solo-word check (startswith, endswith, ...)
            rval = word_a + " " + word_b
            return rval
        for diff_jn in rule.get("diff_jn_words", []):
            if word_a in diff_jn[0]:
                for last in diff_jn[1]:
                    if word_b.startswith(last):
                        # apply different conjuction char rule
                        rval = word_a + diff_jn[2] + word_b
                        return rval
    # get readable letters of second word
    second_word_letters = get_letters(word_b)
    # second word from its second char to the end
    second_word_after_first_char = u"".join(second_word_letters[1:])
    # get mei & uyir characters of second word's first char
    second_word_first_chars = splitMeiUyir(second_word_letters[0])
    if len(second_word_first_chars) == 2:
        (
            second_word_first_mei_char,
            second_word_first_uyir_char,
        ) = second_word_first_chars
    else:
        second_word_first_mei_char, second_word_first_uyir_char = (
            second_word_first_chars[0],
            second_word_first_chars[0],
        )
    if rule:
        if second_word_first_mei_char in rule.get("secondword_first_chars", []):
            # apply major conjuction rule
            return word_a + second_word_first_mei_char + " " + word_b
        firstword_double_special_secondword = rule.get(
            "firstword_double_special_secondword", None)
        if firstword_double_special_secondword:
            if len(first_word_letters) == 4:
                # check whether the first word repeats twice within itself
                if first_word_letters[:2] == first_word_letters[2:]:
                    # root second word with its prefix removed
                    sec_word = (second_word_first_uyir_char
                                + second_word_after_first_char)
                    if sec_word in firstword_double_special_secondword[0]:
                        # conjuction: special char + second root word's uyir
                        jn = joinMeiUyir(
                            firstword_double_special_secondword[1],
                            second_word_first_uyir_char,
                        )
                        return word_a + jn + second_word_after_first_char
        special_secondword_first_chars = rule.get(
            "special_secondword_first_chars", None)
        if special_secondword_first_chars:
            if second_word_first_uyir_char in special_secondword_first_chars[0]:
                # special conjuction char joined with second word's first uyir
                jn = special_secondword_first_chars[1]
                second_word_first_schar = joinMeiUyir(jn, second_word_first_uyir_char)
                second_word = second_word_first_schar + second_word_after_first_char
                return word_a + second_word
    # if all above rules are not applicable, then just return as it is!
    return word_a + " " + word_b