def test_number_in_text(self):
    """Check a sentence containing a bare digit token.

    The tokens returned by ``check_sandhi`` are re-joined with single
    spaces; the result must equal the expected sentence and must still
    contain the digit ``5``.
    """
    sentence = u"அவள் ஒரு தேவதை - 5 அப்படியே தலையில் மாவு கொட்டியது"
    expected = u'அவள் ஒரு தேவதை - 5 அப்படியே தலையில் மாவு கொட்டியது'

    tokens, _ = check_sandhi(sentence)
    rejoined = u" ".join(tokens)

    self.assertEqual(rejoined, expected)
    self.assertIn('5', rejoined)
def test_integration(self):
    """Run ``check_sandhi`` over a multi-clause passage.

    The returned tokens, joined with single spaces, must match the golden
    passage; the second return value must be a ``Results`` instance whose
    ``counter`` attribute equals 46.
    """
    # Input passage, kept as three adjacent literals (implicit concatenation).
    raw_passage = (
        u"அங்குக் கண்டான் அந்த பையன். எத்தனை பழங்கள்? கண்டவாறு சொன்னான், ஐந்து சிறுவர்கள், கத்தியோடு நின்றான்,"
        u"கத்திகொண்டு குத்தினான், வீட்டிலிருந்து சென்றான், கை குழந்தை, கற்று கொடுத்தான், குரங்கு குட்டி, விறகு கடை, பொது பணி, தேர்வு கட்டணம், கனியை தின்றான்,"
        u"எனக்கு கொடு, வீட்டினின்று வெளியேறினான், வர சொன்னான், என்னுடைய புத்தகம், எனது புத்தகம், குறிஞ்சி தலைவன், தேங்காய் சட்னி, தயிர் குடம், தீரா சிக்கல், மரம் தலைவன்."
    )
    # Expected text after the checker has run.
    expected_passage = u"அங்குக் கண்டான் அந்த பையன். எத்தனைப் பழங்கள்? கண்டவாறு சொன்னான், ஐந்து சிறுவர்கள், கத்தியோடு நின்றான்,கத்திகொண்டு குத்தினான், வீட்டிலிருந்து சென்றான், கைக் குழந்தை, கற்று கொடுத்தான், குரங்கு குட்டி, விறகு கடை, பொது பணி, தேர்வு கட்டணம், கனியைத் தின்றான்,எனக்கு கொடு, வீட்டினின்று வெளியேறினான், வர சொன்னான், என்னுடைய புத்தகம், எனது புத்தகம், குறிஞ்சி தலைவன், தேங்காய் சட்னி, தயிர் குடம், தீராச் சிக்கல், மரம் தலைவன்."

    corrected_tokens, stats = check_sandhi(raw_passage)
    corrected_passage = u" ".join(corrected_tokens)

    self.assertEqual(corrected_passage, expected_passage)
    self.assertIsInstance(stats, Results)
    self.assertEqual(stats.counter, 46)
def call_sandhi_check(request):
    """Run the sandhi checker on the ``tamiltext`` GET parameter.

    Args:
        request: request object exposing ``GET.get(name, default)``.

    Returns:
        An ``HttpResponse`` whose body is a JSON object with two keys:
        ``old`` (the HTML-escaped input text) and ``new`` (the checker's
        tokens joined by spaces, each token that differs from the input
        token at the same position wrapped in
        ``<span class='highlight'>``).
    """
    # Local import keeps this block self-contained.
    import html

    default_text = u'அங்குக் கண்டான் அந்த பையன் எத்தனை பழங்கள் '
    # The result carries HTML markup, so the user-supplied text is escaped
    # before it can end up next to the <span> tags.
    k1 = html.escape(request.GET.get('tamiltext', default_text))

    text, _stats = check_sandhi(k1)

    # zip() stops at the shorter sequence, so a checker result with fewer
    # tokens than the whitespace split can no longer raise IndexError.
    for i, (before, after) in enumerate(zip(k1.split(), text)):
        if before != after:
            text[i] = "<span class='highlight'>" + after + "</span>"

    dic = {'old': k1, 'new': u" ".join(text)}
    json_string = json.dumps(dic, ensure_ascii=False)
    # Explicit content type and charset so the UTF-8 JSON is decoded correctly.
    return HttpResponse(json_string, content_type="application/json; charset=utf-8")
def call_sandhi_check(request):
    """Run the sandhi checker on the ``tamiltext`` POST parameter.

    Args:
        request: request object exposing ``POST.get(name, default)``.

    Returns:
        An ``HttpResponse`` whose body is a JSON object with two keys:
        ``old`` (the HTML-escaped input text) and ``new`` (the checker's
        tokens joined by spaces, each token that differs from the input
        token at the same position wrapped in
        ``<span class='highlight'>``).
    """
    # Local import keeps this block self-contained.
    import html

    default_text = u'அங்குக் கண்டான் அந்த பையன் எத்தனை பழங்கள் '
    # cgi.escape() was removed in Python 3.8. html.escape(..., quote=False)
    # escapes exactly the same characters (&, <, >) as cgi.escape(s) did.
    k1 = html.escape(request.POST.get('tamiltext', default_text), quote=False)

    text, _stats = check_sandhi(k1)

    # zip() bounds the walk by the shorter sequence; this replaces the
    # per-iteration try/except IndexError of the earlier implementation.
    for i, (before, after) in enumerate(zip(k1.split(), text)):
        if before != after:
            text[i] = "<span class='highlight'>" + after + "</span>"

    dic = {'old': k1, 'new': u" ".join(text)}
    json_string = json.dumps(dic, ensure_ascii=False)
    # Explicit content type and charset so the UTF-8 JSON is decoded correctly.
    return HttpResponse(json_string, content_type="application/json; charset=utf-8")
def call_sandhi_check(request):
    """Run the sandhi checker on the ``tamiltext`` GET parameter.

    The parameter (or a built-in sample sentence when it is absent) is
    HTML-escaped and handed to ``check_sandhi``. Every returned token that
    differs from the input token at the same position is wrapped in
    ``<span class='highlight'>``. The response body is a JSON object with
    the escaped input under ``old`` and the space-joined tokens under
    ``new``.
    """
    escaped_input = html.escape(
        request.GET.get("tamiltext", "அங்குக் கண்டான் அந்த பையன் எத்தனை பழங்கள் ")
    )
    corrected, _ = check_sandhi(escaped_input)

    # Pairing with zip() visits exactly the positions that exist in both
    # sequences; positions missing from the checker output are skipped.
    for position, (before, after) in enumerate(zip(escaped_input.split(), corrected)):
        if before != after:
            corrected[position] = "<span class='highlight'>" + after + "</span>"

    payload = {"old": escaped_input, "new": " ".join(corrected)}
    # Content type and charset are set explicitly on the response.
    return HttpResponse(
        json.dumps(payload, ensure_ascii=False),
        content_type="application/json; charset=utf-8",
    )
def gpathil11(mword, opt=True, mode='exe'):
    """Entry point of the spell-checker.

    Walks ``mword`` token by token and fills a parallel table with a
    suggestion count and a suggestion/verdict text for every token.

    Args:
        mword: mutable list of string tokens. Tokens are rewritten in place
            (typo normalisation in step 5, sandhi handling in step 7). The
            sandhi steps pair token ``i`` with token ``i + 2``.
            NOTE(review): presumably every other token is a separator -
            confirm against the callers.
        opt: when true (the default) the translation step (6) and the
            suggestion step (11) are executed.
        mode: ``'web'`` returns the table serialised as a string; any other
            value returns the table itself.

    Returns:
        For ``mode == 'web'`` a string; otherwise a list with one
        ``[count, text]`` pair per token. ``text`` starts as ``"wrong"``,
        becomes ``"correct"`` for accepted tokens, ``""`` (with count -1)
        for empty tokens, and otherwise holds whatever ``addparinthu`` or
        the suggestion cache stored.
    """
    user_file = os.path.join(get_data_dir("koppu"), "user.txt")
    if os.path.exists(user_file):
        if (not _Cached._g_userOword) and (not _Cached._g_usergword):
            # NOTE(review): the file is opened with the platform default
            # encoding - confirm how user.txt is written.
            with open(user_file, 'r') as fp:
                # Drop line terminators so the last entry of each line can match.
                userfile = [line.rstrip("\r\n") for line in fp]
            # Tolerate a file with fewer than two lines.
            userfile += [""] * (2 - len(userfile))
            userOword = userfile[0].split(',')
            usergword = userfile[1].split(',')
            _Cached._g_userOword = userOword
            _Cached._g_usergword = usergword
        else:
            userOword = _Cached._g_userOword
            usergword = _Cached._g_usergword
    else:
        userOword = ""
        usergword = ""

    splitchar = ','
    # parinthu[i] = [count of suggestions, suggestions]; ottran[i][0] is the
    # "already verified" flag for token i.
    parinthu = [[0, "wrong"] for _ in mword]
    ottran = [[0, 1] for _ in mword]

    for i in range(len(mword)):
        sandi = ""
        punarchi = False

        # 1 - skip if it is verified already
        if ottran[i][0] == 1:
            continue

        # 2 - empty token (previously indexed with a tuple -> TypeError)
        if len(mword[i]) < 1:
            parinthu[i][0] = -1
            parinthu[i][1] = ""
            continue

        # 3 - ignore single consonant letters
        if len(mword[i]) == 2:
            rgx = "[ா-்]"
            if re.match(rgx, mword[i][-1]):
                ottran[i][0] = 1
                parinthu[i][1] = "correct"
                parinthu[i][0] = 0
                continue

        # 4 - ignore single vowel letters
        if len(mword[i]) == 1:
            ottran[i][0] = 1
            parinthu[i][1] = "correct"
            parinthu[i][0] = 0
            continue

        # 5 - typo correction: fold the two-code-point spellings of the
        # vowel signs O / OO into the single code points. Escapes are used
        # because both spellings render identically.
        # NOTE(review): the original literals were visually
        # indistinguishable; confirm this is the intended direction.
        mword[i] = mword[i].replace("\u0bc6\u0bbe", "\u0bca")
        mword[i] = mword[i].replace("\u0bc7\u0bbe", "\u0bcb")

        # 6 - translation
        if opt == True:
            if ottran[i][0] == 0:
                istrans = False
                for tname in tword.keys():
                    if tname in mword[i]:
                        if len(tword[tname]) > 0:
                            for k in tword[tname]:  # k is one suggestion record
                                a = str(k['t'])
                                b = str(k['w'])
                                for rule in tranrule[a]:
                                    parts = str(rule['t']).split(splitchar)
                                    if (tname + parts[0]) in mword[i]:
                                        nword = mword[i].replace(
                                            tname + parts[0], b + parts[1])
                                        if checkword(nword, 0):
                                            parinthu = addparinthu(parinthu, i, nword)
                                            istrans = True
                if istrans == True:
                    # previously ottran[i, 0] -> TypeError
                    ottran[i][0] = 1

        # 7 - sandhi remover and sandi/punarchi memory
        if (i + 2) < len(mword):
            if len(mword[i + 2]) > 0:
                muthal = mword[i + 2][0:1]
                rgx1 = "[கசதப]்"
                rgx2 = "[கசதப]"
                # if the second word starts with any uyirmei of கசதப let us
                # call the sandhi checker
                if re.match(rgx2, muthal):
                    sanlist = [mword[i], mword[i + 2]]
                    sanlist, result_stats = check_sandhi(sanlist)
                    if sanlist[0] != mword[i]:
                        mword[i] = sanlist[0]
                        parinthu[i][0] = 1
                        parinthu[i][1] = sanlist[0]
                ottru = mword[i][-2:]
                methi = mword[i][:-2]
                if re.match(rgx1, ottru):
                    if muthal + "்" == ottru:
                        if parinthu[i][0] == 0:
                            mword[i] = methi
                            sandi = ottru
                elif ottru == "ட்":
                    if re.match(rgx2, muthal):
                        mword[i] = methi + "ள்"
                        punarchi = True
                elif ottru == "ற்":
                    if re.match(rgx2, muthal):
                        mword[i] = methi + "ல்"
                        punarchi = True
                elif ottru == "ங்":
                    if muthal == "க":
                        mword[i] = methi + "ம்"
                        sandi = "ங்"
                        punarchi = True
                elif ottru == "ஞ்":
                    if muthal == "ச":
                        mword[i] = methi + "ம்"
                        sandi = "ஞ்"
                        punarchi = True
                elif ottru == "ந்":
                    if muthal == "த":
                        mword[i] = methi + "ம்"
                        sandi = "ந்"
                        punarchi = True

        # 8 - skip if it is a repeated word (answer comes from the cache)
        if (mword[i] + sandi) in cacheword:
            found = cacheword.index(mword[i] + sandi)
        else:
            found = -1
        if found > -1:
            a = cacheword[found]
            if a == mword[i] + sandi:
                b = cachesug[found]
                parinthu[i][1] = b
                if not istamil(b):
                    parinthu[i][0] = 0
                elif b.find(',') < 0:
                    parinthu[i][0] = 1
                else:
                    parinthu[i][0] = len(b.split(','))
                ottran[i][0] = 1

        # 9 - skip if it was a user preference
        if ottran[i][0] == 0:
            for a in userOword:
                if a == str(mword[i]):
                    ottran[i][0] = 1
                    parinthu[i][1] = "correct"
                    parinthu[i][0] = 0
        if ottran[i][0] == 0:
            for a in usergword:
                nword = a.split('|')
                if nword[0] == str(mword[i]):
                    parinthu = addparinthu(parinthu, i, nword[1])

        # 10 - word match
        if ottran[i][0] == 0:
            if checkword(mword[i], 0):
                ottran[i][0] = 1
                parinthu[i][1] = "correct"
                parinthu[i][0] = 0

        # 11 - gword suggestion
        if opt == True:
            if ottran[i][0] == 0:
                sample = getsuggestion(mword[i])
                sample2 = getsuggestion2(mword[i])
                sample2.extend(sample)
                usample = set(sample2)
                for nword in usample:
                    if checkword(nword, 7):
                        if punarchi:
                            # Two-character suffix and the stem before it
                            # (previously a one-character index and a
                            # tuple index, which raised TypeError).
                            ottru = nword[-2:]
                            methi = nword[:-2]
                            if ottru == "ள்":
                                parinthu = addparinthu(parinthu, i, methi + "ட்")
                            elif ottru == "ல்":
                                parinthu = addparinthu(parinthu, i, methi + "ற்")
                            elif ottru == "ம்":
                                parinthu = addparinthu(parinthu, i, methi + sandi)
                        else:
                            parinthu = addparinthu(parinthu, i, nword + sandi)

        # 12 - cache the search
        if len(mword[i]) > 0:
            if not (mword[i] + sandi) in cacheword:
                cacheword.append(mword[i] + sandi)
                cachesug.append(parinthu[i][1])

        # 13 - check whether sandhi is needed; this result is not cached.
        # The two flag tests below used to be written as
        # "(ottran[i], [0] == 1)", a non-empty tuple that is always true.
        if ottran[i][0] == 1:  # if this word is correct
            if len(mword) > i + 2:
                if len(mword[i + 2]) > 1:
                    chandi = mword[i + 2][0:1] + "்"  # sandhi the user could have given
                    rgx1 = "[கசதப]்"
                    if re.match(rgx1, chandi):  # if next word is kachathapa
                        if checkword(mword[i + 2], 0):
                            ottran[i + 2][0] = 1
                            parinthu[i + 2][1] = "correct"
                            parinthu[i + 2][0] = 0
                        if ottran[i + 2][0] == 1:  # if next word is correct
                            combo = checkword(mword[i] + mword[i + 2], 0)
                            thibo = checkword(mword[i] + chandi + mword[i + 2], 0)
                            # mode 5 is described in the original comment
                            # as "true if it is valid perfect noun"
                            derive = checkword(mword[i], 5)
                            if sandi != "":
                                if combo:
                                    if not thibo:
                                        parinthu = addparinthu(
                                            parinthu, i, mword[i])
                            else:
                                if thibo:
                                    if not combo:
                                        if not derive:
                                            parinthu = addparinthu(
                                                parinthu, i, mword[i] + chandi)

    if mode == "web":
        # NOTE(review): this joins str() of each [count, text] row with
        # alternating ':' and '|'. The alternation suggests a flattened
        # "count:text|count:text" output was intended - confirm with the
        # web caller before changing the format.
        z = ":"
        Arr = ""
        for row in parinthu:
            Arr = Arr + str(row) + z
            if z == ":":
                z = "|"
            else:
                z = ":"
        return Arr[0:len(Arr) - 1]

    if DEBUG:
        print(parinthu)
    return parinthu