Beispiel #1
0
def GetTestSents(request):
    """Return a random sample of test sentences as a JSON response.

    The sample size is read from ``request.POST['nbrTestSents']``.  That many
    ids are drawn at random and the matching ``sent`` values are fetched from
    the ``sents_test`` table.  The response holds the sentences as stored
    (``sents_diac``, one per line, without their first and last token) and
    the same text after ``DeleteDiacritic`` (``sents_whitout_diac``).

    The response body is JSON ``null`` when the field is empty or when a
    ``ValueError`` is raised while building the sample (for instance when the
    field is not an integer).
    """
    requested = request.POST['nbrTestSents'].strip()
    if requested == "":
        result = None
    else:
        try:
            tool = MyToolKit()
            hdb = DBHandler('data/model.db')
            result = {}
            hdb.connect()
            # One randomly drawn id per requested sentence, OR-ed together.
            cond = ' ' + " or ".join(
                "id=" + str(random.randint(1, 36111))
                for _ in range(int(requested)))

            rows = hdb.getFromTable('sents_test', attribute='sent',
                                    condition=cond)

            lines = []
            for row in rows:
                tokens = tool.words(row[0])
                # Drop the first and the last token of the stored sentence.
                lines.append(' '.join(tokens[1:-1]) + '\n')
            sents_diac = ''.join(lines)
            without_diac = tool.DeleteDiacritic(sents_diac)
            result['sents_diac'] = sents_diac
            result['sents_whitout_diac'] = without_diac
        except ValueError:
            result = None
    return HttpResponse(json.dumps(result), content_type="application/json")
 def sentProbaility(self,sent,smooth_const):
     """Return the bigram probability of ``sent``.

     The sentence is tokenised with ``MyToolKit.words`` and scored as the
     product of ``self.LaplaceSmoothing(word, previous, smooth_const, V)``
     over the pairs produced by ``bigrams``.

     The product is accumulated as a sum of logarithms and exponentiated
     once at the end.  Converting back and forth on every step made the
     running product underflow to ``0.0`` on long sentences, after which
     ``math.log(0.0)`` raised ``ValueError``.

     :param sent: sentence to score.
     :param smooth_const: constant forwarded to ``LaplaceSmoothing``.
     :return: the probability as a float; ``1.0`` when there is no bigram,
         ``0.0`` as soon as one bigram has a non-positive probability.
     """
     V = 217847  # presumably the vocabulary size -- TODO confirm
     tool = MyToolKit()
     log_p = 0.0
     for bigram in bigrams(tool.words(sent)):
         # LaplaceSmoothing could be swapped for
         # self.AbsoluteDiscountingSmoothing (same arguments) here.
         prob = self.LaplaceSmoothing(bigram[1], bigram[0], smooth_const, V)
         if prob <= 0:
             # A zero factor makes the whole product zero; log() of it
             # would raise instead.
             return 0.0
         log_p += math.log(prob)
     return math.exp(log_p)
Beispiel #3
0
def DeletDiac(request):
    """Strip the diacritics from the posted text and answer with JSON.

    ``request.POST['text']`` is split with ``MyToolKit.sents``; every item of
    the first element of that result is passed through ``DeleteDiacritic``.
    The response holds the pieces joined with ``<br>`` under ``results`` and
    the elapsed time in seconds, rounded to two decimals, under ``time``.
    """
    started = time.time()
    tool = MyToolKit()

    pieces = tool.sents(
        request.POST['text'].strip(),
        ["\n", "\r", ".", ":", ",", ';'],
        subsent=['"', "'", '-'])[0]
    stripped = [tool.DeleteDiacritic(piece) for piece in pieces]

    result = {'results': '<br>'.join(stripped)}
    result['time'] = round(time.time() - started, 2)
    return HttpResponse(json.dumps(result), content_type="application/json")
Beispiel #4
0
 def alignLetter(self, string1, string2):
     """Fill the ``_`` placeholders of ``string1`` with letters of ``string2``.

     ``string2`` first goes through ``DeleteDiacritic``.  Then, for each of
     its characters other than ``#``, the first remaining ``_`` of
     ``string1`` is replaced by that character.  The resulting pair is
     pretty-printed before the filled string is returned.
     """
     tool = MyToolKit()
     bare = tool.DeleteDiacritic(string2)
     for letter in bare:
         if letter == '#':
             continue
         string1 = string1.replace('_', letter, 1)
     pprint(string1 + " ==> " + bare)
     return string1
Beispiel #5
0
 def sentProbaility(self, sent, smooth_const):
     """Return the bigram probability of ``sent``.

     The sentence is tokenised with ``MyToolKit.words`` and scored as the
     product of ``self.LaplaceSmoothing(word, previous, smooth_const, V)``
     over the pairs produced by ``bigrams``.

     The product is accumulated as a sum of logarithms and exponentiated
     once at the end.  Converting back and forth on every step made the
     running product underflow to ``0.0`` on long sentences, after which
     ``math.log(0.0)`` raised ``ValueError``.

     :param sent: sentence to score.
     :param smooth_const: constant forwarded to ``LaplaceSmoothing``.
     :return: the probability as a float; ``1.0`` when there is no bigram,
         ``0.0`` as soon as one bigram has a non-positive probability.
     """
     V = 217847  # presumably the vocabulary size -- TODO confirm
     tool = MyToolKit()
     log_p = 0.0
     for bigram in bigrams(tool.words(sent)):
         # LaplaceSmoothing could be swapped for
         # self.AbsoluteDiscountingSmoothing (same arguments) here.
         prob = self.LaplaceSmoothing(bigram[1], bigram[0], smooth_const, V)
         if prob <= 0:
             # A zero factor makes the whole product zero; log() of it
             # would raise instead.
             return 0.0
         log_p += math.log(prob)
     return math.exp(log_p)
Beispiel #6
0
def Evaluate(request):
    """Score the posted text against the posted corpus text, as JSON.

    ``MyToolKit.Recall``, ``Precision`` and ``Fmeasure`` are each called with
    ``(corpus text, text)``, where the two values come from the
    ``sents_diac_corpus_value`` and ``text`` POST fields.  The scores are
    reported as strings with four decimals, together with the elapsed time
    in seconds (rounded to two decimals) under ``time``.
    """
    started = time.time()
    tool = MyToolKit()

    posted_text = request.POST['text'].strip()
    posted_corpus = request.POST['sents_diac_corpus_value'].strip()

    four_decimals = "{:.4f}".format
    result = {
        'recall': four_decimals(tool.Recall(posted_corpus, posted_text)),
        'precision': four_decimals(tool.Precision(posted_corpus, posted_text)),
        'fmeasure': four_decimals(tool.Fmeasure(posted_corpus, posted_text)),
    }

    result['time'] = round(time.time() - started, 2)
    return HttpResponse(json.dumps(result), content_type="application/json")
Beispiel #7
0
    def moushakeel_V2(self, text, smooth_const):
        """Vocalise ``text`` word by word, then letter by letter.

        Every sentence returned by ``MyToolKit.sents`` is stripped of its
        diacritics, alif-normalised, cleared of tatweel characters and wrapped
        in ``#`` / ``$`` markers.  The candidate forms of each word are read
        from the ``dictionary`` table, laid out as one column per word and
        decoded with ``self.Viterbi``.  The decoded sentences are finally
        passed to ``self.LettersVocaliser``.

        Returns a dict with ``result`` (the output of ``LettersVocaliser``),
        ``token`` and ``type`` (counts summed over the sentences) and
        ``time`` (elapsed seconds, rounded to two decimals).
        """
        started = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result = {'token': 0, 'type': 0}
        vocalised = []

        # ---------------------- Vocalisation by word ----------------------
        delimiters = ["\n", "\r", ".", ":", ",", ';']
        for sent in tool.sents(text.strip(), delimiters,
                               subsent=['"', "'", '-']):
            # TODO: revisit this replace, it slightly alters the words
            # (tatweel).
            bare = tool.normalizeArabicAlif(tool.DeleteDiacritic(sent))
            list_words = tool.words("# " + bare.replace('ـ', '') + " $")

            # Fetch the candidate forms of every word; a word missing from
            # the table stands for itself.
            candidates = {}
            for word in list_words:
                row = hdb.SelectOne('dictionary',
                                    'type="' + word + '"',
                                    attribute='vocabularies')
                candidates[word] = word if row is None else row[0]
            candidates['#'] = '#'
            candidates['$'] = '$'

            # HMM: one column per word, one [-1, form, flag] entry per
            # candidate form; the flag is 1 for "#" and 0 otherwise.
            matrice = []
            for word in list_words:
                matrice.append([[-1, form, 1 if form == "#" else 0]
                                for form in tool.words(candidates[word])])

            decoded = tool.words(self.Viterbi(matrice, smooth_const))
            # The first and last decoded tokens are left out.
            vocalised.append(' '.join(decoded[1:-1]))
            result['token'] += len(list_words) - 2
            result['type'] += len(candidates) - 2

        # --------------------- Vocalisation by letter ---------------------
        vocalised = self.LettersVocaliser(vocalised, smooth_const)

        result['result'] = vocalised
        result['time'] = round(time.time() - started, 2)
        return result
 def getPossibilities(self,list_words,dict):
     """Expand ``list_words`` into candidate sentences.

     A queue is seeded with the first word.  While the entry at the head of
     the queue has fewer tokens (as counted by ``MyToolKit.words``) than
     ``list_words``, it is removed and replaced, at the back of the queue,
     by one extended copy per form found in ``dict[list_words[i]]``.  The
     index ``i`` moves to the next word each time the token count of the
     head entry changes.

     :param list_words: words of the sentence.
     :param dict: maps each word to a string of candidate forms, split with
         ``MyToolKit.words``.
     :return: the list of space-joined candidate sentences.
     """
     tool = MyToolKit()
     target_len = len(list_words)
     queue = [list_words[0]]
     position = 1
     previous_len = 1
     while True:
         head = queue[0]
         head_len = len(tool.words(head))
         if head_len >= target_len:
             break
         if head_len != previous_len:
             # The head entry grew by one word: move on to the next word.
             position += 1
             previous_len = head_len
         for form in tool.words(dict[list_words[position]]):
             queue.append(head + " " + form)
         queue.pop(0)
     return queue
Beispiel #9
0
def getdict(request):
    """Return the dictionary entry of every word of the posted text, as JSON.

    ``request.POST['text']`` is alif-normalised, stripped of diacritics and
    tokenised.  Each word is looked up in the ``dictionary`` table; its
    ``vocabularies`` value is reported with every space replaced by
    ``'  -  '``, or as an empty string when the lookup returns ``None``.

    The response holds ``results`` (word -> entry), ``time`` (elapsed
    seconds, rounded to two decimals), ``type`` (number of distinct words)
    and ``token`` (number of words).
    """
    started = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()

    normalised = tool.normalizeArabicAlif(request.POST['text'].strip())
    list_words = tool.words(tool.DeleteDiacritic(normalised))

    entries = {}
    for word in list_words:
        row = hdb.SelectOne('dictionary', 'type="' + word + '"',
                            attribute='vocabularies')
        entries[word] = '' if row is None else re.sub(' ', '  -  ', row[0])

    result = {
        'results': entries,
        'time': round(time.time() - started, 2),
        'type': len(entries),
        'token': len(list_words),
    }
    return HttpResponse(json.dumps(result), content_type="application/json")
 def moushakeel_V1(self,text,smooth_const):
     """Vocalise ``text`` by scoring every candidate sentence.

     Every sentence returned by ``MyToolKit.sents`` is stripped of its
     diacritics, alif-normalised and wrapped in ``#`` / ``$`` markers.  The
     candidate forms of each word are read from the ``dictionary`` table,
     ``self.getPossibilities`` expands them into candidate sentences and the
     one with the highest ``self.sentProbaility`` is kept (an empty string
     when no candidate scores above 0).

     :param text: text to vocalise.
     :param smooth_const: constant forwarded to ``sentProbaility``.
     :return: dict with ``result`` (list of vocalised sentences), ``token``
         and ``type`` (counts summed over all sentences, as in
         ``moushakeel_V2``) and ``time`` (elapsed seconds, two decimals).
     """
     t1 = time.time()
     tool = MyToolKit()
     hdb = DBHandler('data/model.db')
     hdb.connect()
     result = {'token': 0, 'type': 0}
     result_teshkeel = []
     for sent in tool.sents(text.strip(),["\n","\r",".",":",",",';'],subsent=['"',"'",'-']):
         marked = "# "+tool.normalizeArabicAlif(tool.DeleteDiacritic(sent))+" $"
         list_words = tool.words(marked)
         # Fetch the candidate forms of every word; a word missing from the
         # table stands for itself.
         # NOTE(review): the condition is built by string concatenation; a
         # word containing a double quote would end the literal early.
         # Parameterise the lookup if DBHandler offers a way to do so.
         candidates = {}
         for word in list_words:
             res = hdb.SelectOne('dictionary','type="'+word+'"',attribute='vocabularies')
             candidates[word] = word if res is None else res[0]
         candidates['#'] = '#'
         candidates['$'] = '$'
         max_p = 0
         best_sent = ''
         for possib in self.getPossibilities(list_words,candidates):
             p = self.sentProbaility(possib,smooth_const)
             if p > max_p:
                 # Track the running maximum so that the highest-scoring
                 # candidate wins, not the last one scoring above 0.
                 max_p = p
                 best_sent = possib
         best_words = tool.words(best_sent)
         # The first and last tokens of the best sentence are left out.
         result_teshkeel.append(' '.join(best_words[1:-1]))
         # Accumulate over the sentences instead of keeping the last one only.
         result['token'] += len(list_words)-2
         result['type'] += len(candidates)-2
     result['result'] = result_teshkeel
     result['time'] = round(time.time() - t1,2)
     return result
Beispiel #11
0
    def getNotVocalised(self, sents):
        """Collect, per sentence, the runs of words that carry no diacritic.

        Consecutive words for which ``MyToolKit.HasDiac`` is false are joined
        with ``#`` into one string.  The string starts with ``###`` when the
        run opens the sentence; otherwise it starts with the last two items
        of ``LettersDiac(previous word)`` followed by ``#``.  It always ends
        with ``#``.

        :param sents: sequence of sentences.
        :return: one list of run strings per sentence, in sentence order.
        """
        tool = MyToolKit()
        not_vocalised_by_sents = []
        for sent in sents:
            words = tool.words(sent)
            not_vocalised = []
            j = 0
            while j < len(words):
                if tool.HasDiac(words[j]):
                    j += 1
                    continue

                if j == 0:
                    string = '###' + words[j]
                else:
                    char = tool.LettersDiac(words[j - 1])
                    string = (char[len(char) - 2] + char[len(char) - 1]
                              + '#' + words[j])
                # Extend the run over the following unvocalised words.
                k = j + 1
                while k < len(words) and not tool.HasDiac(words[k]):
                    string += "#" + words[k]
                    k += 1
                not_vocalised.append(string + "#")
                # Resume right after the run.  Adding k to j instead skipped
                # j extra words, so later runs could be missed.
                j = k
            not_vocalised_by_sents.append(not_vocalised)
        return not_vocalised_by_sents
    def moushakeel_V2(self,text,smooth_const):
        """Vocalise ``text`` word by word, then letter by letter.

        Every sentence returned by ``MyToolKit.sents`` is stripped of its
        diacritics, alif-normalised, cleared of tatweel characters and wrapped
        in ``#`` / ``$`` markers.  The candidate forms of each word are read
        from the ``dictionary`` table, laid out as one column per word and
        decoded with ``self.Viterbi``.  The decoded sentences are finally
        passed to ``self.LettersVocaliser``.

        Returns a dict with ``result`` (the output of ``LettersVocaliser``),
        ``token`` and ``type`` (counts summed over the sentences) and
        ``time`` (elapsed seconds, rounded to two decimals).
        """
        started = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result = {'token': 0, 'type': 0}
        vocalised = []

        # ---------------------- Vocalisation by word ----------------------
        delimiters = ["\n","\r",".",":",",",';']
        for sent in tool.sents(text.strip(),delimiters,subsent=['"',"'",'-']):
            # TODO: revisit this replace, it slightly alters the words
            # (tatweel).
            bare = tool.normalizeArabicAlif(tool.DeleteDiacritic(sent))
            list_words = tool.words("# "+bare.replace('ـ','')+" $")

            # Fetch the candidate forms of every word; a word missing from
            # the table stands for itself.
            candidates = {}
            for word in list_words:
                row = hdb.SelectOne('dictionary','type="'+word+'"',attribute='vocabularies')
                candidates[word] = word if row is None else row[0]
            candidates['#'] = '#'
            candidates['$'] = '$'

            # HMM: one column per word, one [-1, form, flag] entry per
            # candidate form; the flag is 1 for "#" and 0 otherwise.
            matrice = []
            for word in list_words:
                column = []
                for form in tool.words(candidates[word]):
                    flag = 1 if form == "#" else 0
                    column.append([-1,form,flag])
                matrice.append(column)

            decoded = tool.words(self.Viterbi(matrice,smooth_const))
            # The first and last decoded tokens are left out.
            vocalised.append(' '.join(decoded[1:-1]))
            result['token'] += len(list_words)-2
            result['type'] += len(candidates)-2

        # --------------------- Vocalisation by letter ---------------------
        vocalised = self.LettersVocaliser(vocalised,smooth_const)

        result['result'] = vocalised
        result['time'] = round(time.time() - started,2)
        return result
    def getNotVocalised(self,sents):
        """Collect, per sentence, the runs of words that carry no diacritic.

        Consecutive words for which ``MyToolKit.HasDiac`` is false are joined
        with ``#`` into one string.  The string starts with ``###`` when the
        run opens the sentence; otherwise it starts with the last two items
        of ``LettersDiac(previous word)`` followed by ``#``.  It always ends
        with ``#``.

        :param sents: sequence of sentences.
        :return: one list of run strings per sentence, in sentence order.
        """
        tool = MyToolKit()
        not_vocalised_by_sents = []
        for sent in sents:
            words = tool.words(sent)
            not_vocalised = []
            j = 0
            while j < len(words):
                if tool.HasDiac(words[j]):
                    j += 1
                    continue

                if j == 0:
                    string = '###'+words[j]
                else:
                    char = tool.LettersDiac(words[j-1])
                    string = char[len(char)-2]+char[len(char)-1]+'#'+words[j]
                # Extend the run over the following unvocalised words.
                k = j+1
                while k < len(words) and not tool.HasDiac(words[k]):
                    string += "#"+words[k]
                    k += 1
                not_vocalised.append(string+"#")
                # Resume right after the run.  Adding k to j instead skipped
                # j extra words, so later runs could be missed.
                j = k
            not_vocalised_by_sents.append(not_vocalised)
        return not_vocalised_by_sents
Beispiel #14
0
 def moushakeel_V1(self, text, smooth_const):
     """Vocalise ``text`` by scoring every candidate sentence.

     Every sentence returned by ``MyToolKit.sents`` is stripped of its
     diacritics, alif-normalised and wrapped in ``#`` / ``$`` markers.  The
     candidate forms of each word are read from the ``dictionary`` table,
     ``self.getPossibilities`` expands them into candidate sentences and the
     one with the highest ``self.sentProbaility`` is kept (an empty string
     when no candidate scores above 0).

     :param text: text to vocalise.
     :param smooth_const: constant forwarded to ``sentProbaility``.
     :return: dict with ``result`` (list of vocalised sentences), ``token``
         and ``type`` (counts summed over all sentences, as in
         ``moushakeel_V2``) and ``time`` (elapsed seconds, two decimals).
     """
     t1 = time.time()
     tool = MyToolKit()
     hdb = DBHandler('data/model.db')
     hdb.connect()
     result = {'token': 0, 'type': 0}
     result_teshkeel = []
     for sent in tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                            subsent=['"', "'", '-']):
         marked = "# " + tool.normalizeArabicAlif(
             tool.DeleteDiacritic(sent)) + " $"
         list_words = tool.words(marked)
         # Fetch the candidate forms of every word; a word missing from the
         # table stands for itself.
         # NOTE(review): the condition is built by string concatenation; a
         # word containing a double quote would end the literal early.
         # Parameterise the lookup if DBHandler offers a way to do so.
         candidates = {}
         for word in list_words:
             res = hdb.SelectOne('dictionary',
                                 'type="' + word + '"',
                                 attribute='vocabularies')
             candidates[word] = word if res is None else res[0]
         candidates['#'] = '#'
         candidates['$'] = '$'
         max_p = 0
         best_sent = ''
         for possib in self.getPossibilities(list_words, candidates):
             p = self.sentProbaility(possib, smooth_const)
             if p > max_p:
                 # Track the running maximum so that the highest-scoring
                 # candidate wins, not the last one scoring above 0.
                 max_p = p
                 best_sent = possib
         best_words = tool.words(best_sent)
         # The first and last tokens of the best sentence are left out.
         result_teshkeel.append(' '.join(best_words[1:-1]))
         # Accumulate over the sentences instead of keeping the last one only.
         result['token'] += len(list_words) - 2
         result['type'] += len(candidates) - 2
     result['result'] = result_teshkeel
     result['time'] = round(time.time() - t1, 2)
     return result
Beispiel #15
0
 def getPossibilities(self, list_words, dict):
     """Expand ``list_words`` into candidate sentences.

     A queue is seeded with the first word.  While the entry at the head of
     the queue has fewer tokens (as counted by ``MyToolKit.words``) than
     ``list_words``, it is removed and replaced, at the back of the queue,
     by one extended copy per form found in ``dict[list_words[i]]``.  The
     index ``i`` moves to the next word each time the token count of the
     head entry changes.

     :param list_words: words of the sentence.
     :param dict: maps each word to a string of candidate forms, split with
         ``MyToolKit.words``.
     :return: the list of space-joined candidate sentences.
     """
     tool = MyToolKit()
     wanted = len(list_words)
     pending = [list_words[0]]
     word_index = 1
     seen_len = 1
     while True:
         current = pending[0]
         current_len = len(tool.words(current))
         if not current_len < wanted:
             return pending
         if current_len != seen_len:
             # The head entry grew by one word: move on to the next word.
             word_index += 1
             seen_len = current_len
         pending.extend(
             current + " " + form
             for form in tool.words(dict[list_words[word_index]]))
         del pending[0]
Beispiel #16
0
    def LettersVocaliser(self, sents, smooth_const):
        """Vocalise, letter by letter, the words left without diacritics.

        The candidate forms of each letter are loaded from the
        ``letters_dictionary`` table.  For every run returned by
        ``self.getNotVocalised`` a list of columns is built: one for each of
        the first two items of ``LettersDiac(run)``, one for ``#``, then one
        per remaining character.  It is decoded with ``self.ViterbiLetter``
        and mapped back onto the run with ``self.alignLetter``; the run is
        then replaced inside its sentence.

        ``sents`` is updated in place and also returned.
        """
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        rows = hdb.getFromTable('letters_dictionary',
                                attribute='type,vocabularies')
        letter_forms = {row[0]: row[1] for row in rows}

        not_vocalised = self.getNotVocalised(sents)
        # Walk through the sentences.
        for i in range(len(sents)):
            # Walk through the unvocalised runs of this sentence.
            for run in not_vocalised[i]:
                char = tool.LettersDiac(run)
                # Fixed leading columns: the two leading items, then "#".
                matrice = [
                    [[-1, tool.HideChar(char[0], expect=['#', ' ']), 1]],
                    [[0, tool.HideChar(char[1], expect=['#', ' ']), 1]],
                    [[0, "#", 1]],
                ]

                string = tool.DeleteDiacritic(run)
                # Walk through the characters that follow the prefix.
                k = 3
                while k < len(string):
                    starts_word = string[k - 1] == "#"
                    if starts_word and string[k] == 'ا' and string[k + 1] == 'ل':
                        # "ال" right after a separator: two fixed columns,
                        # and both letters are consumed at once.
                        matrice.append([[0, '_', 1]])
                        matrice.append([[0, '_ْ', 1]])
                        k += 2
                        continue

                    # One entry per candidate form of this character.
                    column = []
                    for possib in tool.words(letter_forms[string[k]]):
                        if possib == "#":
                            column.append([-1, possib, 1])
                        else:
                            hidden = tool.HideChar(possib, expect=['#', ' '])
                            column.append([-1, hidden, 0])
                    matrice.append(column)
                    k += 1

                v = self.ViterbiLetter(matrice, smooth_const)
                aligned = self.alignLetter(v, run)
                # Swap the run for its vocalised form, with "#" turned back
                # into spaces on both sides.
                before = run.replace('#', ' ').strip()
                after = aligned.replace('#', ' ').strip()
                sents[i] = sents[i].replace(before, after)

        return sents
Beispiel #17
0
def TashkeelAndEvaluate(request):
    """Vocalise the posted text and score it against the posted corpus text.

    The ``text`` POST field is vocalised with ``Vocaliser.moushakeel_V2``,
    using the ``ConstLaplace`` field converted to float.  The resulting
    sentences, joined with newlines, are compared with the
    ``sents_diac_corpus_value`` field through the ``wer1``, ``wer2``,
    ``der1`` and ``der2`` recall / precision / F-measure methods of
    ``MyToolKit``.  Each score is stored as a string with four decimals.

    The JSON response is the dict returned by ``moushakeel_V2``, extended
    with the scores, with ``result`` joined by ``<br>``.
    """
    vocaliser = Vocaliser()
    tool = MyToolKit()
    text = request.POST['text'].strip()
    corpus_text = request.POST['sents_diac_corpus_value'].strip()
    result = vocaliser.moushakeel_V2(text, float(request.POST['ConstLaplace']))
    joined_result = '\n'.join(result['result'])

    # (response key, MyToolKit method name), in reporting order.
    scores = (
        ('wer1_recall', 'wer1_Recall'),
        ('wer1_precision', 'wer1_Precision'),
        ('wer1_fmeasure', 'wer1_Fmeasure'),
        ('wer2_recall', 'wer2_Recall'),
        ('wer2_precision', 'wer2_Precision'),
        ('wer2_fmeasure', 'wer2_Fmeasure'),
        ('der1_recall', 'der1_Recall'),
        ('der1_precision', 'der1_Precision'),
        ('der1_fmeasure', 'der1_Fmeasure'),
        ('der2_recall', 'der2_Recall'),
        ('der2_precision', 'der2_Precision'),
        ('der2_fmeasure', 'der2_Fmeasure'),
    )
    for key, method_name in scores:
        score = getattr(tool, method_name)(corpus_text, joined_result)
        result[key] = "{:.4f}".format(score)

    result['result'] = '<br>'.join(result['result'])
    return HttpResponse(json.dumps(result), content_type="application/json")
    def LettersVocaliser(self,sents,smooth_const):
        """Vocalise, letter by letter, the words left without diacritics.

        The candidate forms of each letter are loaded from the
        ``letters_dictionary`` table.  For every run returned by
        ``self.getNotVocalised`` a list of columns is built: one for each of
        the first two items of ``LettersDiac(run)``, one for ``#``, then one
        per remaining character.  It is decoded with ``self.ViterbiLetter``
        and mapped back onto the run with ``self.alignLetter``; the run is
        then replaced inside its sentence.

        ``sents`` is updated in place and also returned.
        """
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        letter_forms = {}
        for row in hdb.getFromTable('letters_dictionary',attribute='type,vocabularies'):
            letter_forms[row[0]] = row[1]

        not_vocalised = self.getNotVocalised(sents)
        # Walk through the sentences.
        for i in range(len(sents)):
            # Walk through the unvocalised runs of this sentence.
            for run in not_vocalised[i]:
                char = tool.LettersDiac(run)
                # Fixed leading columns: the two leading items, then "#".
                matrice = [
                    [[-1,tool.HideChar(char[0],expect=['#',' ']),1]],
                    [[0,tool.HideChar(char[1],expect=['#',' ']),1]],
                    [[0,"#",1]],
                ]

                string = tool.DeleteDiacritic(run)
                # Walk through the characters that follow the prefix.
                k = 3
                while k < len(string):
                    if string[k-1] == "#" and string[k] == 'ا' and string[k+1] == 'ل':
                        # "ال" right after a separator: two fixed columns,
                        # and both letters are consumed at once.
                        matrice.append([[0,'_',1]])
                        matrice.append([[0,'_ْ',1]])
                        k += 2
                    else:
                        # One entry per candidate form of this character.
                        column = []
                        for possib in tool.words(letter_forms[string[k]]):
                            if possib == "#":
                                column.append([-1,possib,1])
                            else:
                                column.append([-1,tool.HideChar(possib,expect=['#',' ']),0])
                        matrice.append(column)
                        k += 1

                v = self.ViterbiLetter(matrice,smooth_const)
                aligned = self.alignLetter(v,run)
                # Swap the run for its vocalised form, with "#" turned back
                # into spaces on both sides.
                sents[i] = sents[i].replace(run.replace('#',' ').strip(),aligned.replace('#',' ').strip())

        return sents