Ejemplo n.º 1
0
 def moushakeel_V1(self, text, smooth_const):
     """Vocalise *text* by scoring every candidate vocalisation of each sentence.

     The text is split into sentences with ``MyToolKit.sents``. Each sentence
     is stripped of diacritics, alif-normalised and wrapped in the ``#`` /
     ``$`` markers. For every word the ``dictionary`` table is queried for its
     stored vocalisations (the word itself is used when no row is found).
     ``self.getPossibilities`` then builds the candidate sentences and the
     one with the highest ``self.sentProbaility`` score is kept.

     Args:
         text: raw input text.
         smooth_const: smoothing constant forwarded to ``sentProbaility``.

     Returns:
         dict with the keys ``'token'`` (word count summed over sentences,
         markers excluded), ``'type'`` (per-sentence distinct-word count,
         summed over sentences, markers excluded), ``'result'`` (list of
         vocalised sentences) and ``'time'`` (elapsed seconds, rounded to
         two decimals).
     """
     t1 = time.time()
     tool = MyToolKit()
     hdb = DBHandler('data/model.db')
     hdb.connect()
     result = {'token': 0, 'type': 0}
     result_teshkeel = []
     separators = ["\n", "\r", ".", ":", ",", ';']
     for sent in tool.sents(text.strip(), separators, subsent=['"', "'", '-']):
         marked = "# " + tool.normalizeArabicAlif(tool.DeleteDiacritic(sent)) + " $"
         list_words = tool.words(marked)
         # Fetch the stored vocalisations of every word.
         # NOTE(review): the condition is assembled by string concatenation,
         # so a word containing '"' would break the query - confirm whether
         # DBHandler offers a parameterised form.
         candidates = {}
         for word in list_words:
             res = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                 attribute='vocabularies')
             candidates[word] = word if res is None else res[0]
         candidates['#'] = '#'
         candidates['$'] = '$'
         max_p = 0
         best_sent = ''
         for possib in self.getPossibilities(list_words, candidates):
             p = self.sentProbaility(possib, smooth_const)
             if p > max_p:
                 # Remember the best score seen so far (the assignment used
                 # to be reversed, so the maximum was never updated).
                 max_p = p
                 best_sent = possib
         best_words = tool.words(best_sent)
         # Drop the first and last token (the '#' and '$' markers).
         result_teshkeel.append(' '.join(best_words[1:len(best_words) - 1]))
         # Accumulate over all sentences, as moushakeel_V2 does.
         result['token'] += len(list_words) - 2
         result['type'] += len(candidates) - 2
     result['result'] = result_teshkeel
     result['time'] = round(time.time() - t1, 2)
     return result
Ejemplo n.º 2
0
def GetTestSents(request):
    """Return randomly picked test sentences as a JSON response.

    Reads ``nbrTestSents`` from ``request.POST``. When it is empty, or is not
    a valid integer, the JSON body is ``null``. Otherwise that many ids are
    drawn with ``random.randint(1, 36111)`` (ids may repeat) and the matching
    rows of the ``sents_test`` table are fetched.

    The JSON object holds ``sents_diac`` (one sentence per line, first and
    last token of each stored sentence removed) and ``sents_whitout_diac``
    (the same text passed through ``DeleteDiacritic``).
    """
    nbr = request.POST['nbrTestSents'].strip()
    if nbr == "":
        return HttpResponse(json.dumps(None), content_type="application/json")

    try:
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        count = int(nbr)  # a non-numeric value raises ValueError, handled below
        # Build " id=A or id=B or ..." with one random id per requested sentence.
        cond = ' ' + ' or '.join(
            "id=" + str(random.randint(1, 36111)) for _ in range(count))

        rows = hdb.getFromTable('sents_test', attribute='sent', condition=cond)

        lines = []
        for row in rows:
            words = tool.words(row[0])
            # presumably the dropped first/last tokens are sentence markers
            lines.append(' '.join(words[1:len(words) - 1]) + '\n')
        sents_diac = ''.join(lines)

        result = {
            'sents_diac': sents_diac,
            'sents_whitout_diac': tool.DeleteDiacritic(sents_diac),
        }
    except ValueError:
        result = None
    return HttpResponse(json.dumps(result), content_type="application/json")
Ejemplo n.º 3
0
    def LaplaceSmoothing(self, w2, w1, u, v):
        """Return the add-``u`` smoothed ratio for the bigram ``"w1 w2"``.

        Looks up the frequency of ``w2`` in the ``words`` table and the
        frequency of ``"w1 w2"`` in the ``grams2`` table; a missing row counts
        as 0. The value returned is
        ``(count(w1 w2) + u) / (count(w2) + v * u)``.

        NOTE(review): the denominator uses the count of ``w2``, the second
        word of the looked-up bigram - confirm this matches the argument
        order used by the callers.
        """
        hdb = DBHandler('data/model.db')
        hdb.connect()

        word_row = hdb.SelectOne('words', 'word="' + w2 + '"', attribute='freq')
        word_count = 0 if word_row is None else word_row[0]

        bigram_key = w1 + ' ' + w2
        bigram_row = hdb.SelectOne('grams2', 'grams="' + bigram_key + '"',
                                   attribute='freq')
        bigram_count = 0 if bigram_row is None else bigram_row[0]

        numerator = bigram_count + u
        denominator = word_count + v * u
        return numerator / denominator
Ejemplo n.º 4
0
    def moushakeel_V2(self, text, smooth_const):
        """Vocalise *text* with a word-level Viterbi pass, then a letter pass.

        Each sentence from ``MyToolKit.sents`` is stripped of diacritics,
        alif-normalised, has the tatweel character removed and is wrapped in
        the ``#`` / ``$`` markers. The stored vocalisations of every word are
        read from the ``dictionary`` table (the word itself when no row is
        found) and arranged as one lattice column per word for
        ``self.Viterbi``. The vocalised sentences are finally passed through
        ``self.LettersVocaliser``.

        Returns:
            dict with the keys ``'token'``, ``'type'`` (both accumulated over
            sentences, markers excluded), ``'result'`` (list of sentences)
            and ``'time'`` (elapsed seconds, rounded to two decimals).
        """
        started = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result = {'token': 0, 'type': 0}
        vocalised = []

        # ------------------- vocalisation by word -------------------
        sentences = tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                               subsent=['"', "'", '-'])
        for raw_sent in sentences:
            # TODO: revisit this replace, removing the tatweel slightly
            # alters the words.
            cleaned = tool.normalizeArabicAlif(tool.DeleteDiacritic(raw_sent))
            marked = "# " + cleaned.replace('ـ', '') + " $"
            list_words = tool.words(marked)

            # Candidate vocalisations for each word of the sentence.
            candidates = {}
            for word in list_words:
                row = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                    attribute='vocabularies')
                candidates[word] = word if row is None else row[0]
            candidates['#'] = '#'
            candidates['$'] = '$'

            # One column per word; each cell is [-1, candidate, flag] where
            # the flag is 1 only for the '#' marker.
            matrice = [
                [[-1, possib, 1 if possib == "#" else 0]
                 for possib in tool.words(candidates[word])]
                for word in list_words
            ]

            best_words = tool.words(self.Viterbi(matrice, smooth_const))
            # Drop the first and last token (the '#' and '$' markers).
            vocalised.append(' '.join(best_words[1:len(best_words) - 1]))
            result['token'] += len(list_words) - 2
            result['type'] += len(candidates) - 2

        # ------------------- vocalisation by letter -------------------
        vocalised = self.LettersVocaliser(vocalised, smooth_const)

        result['result'] = vocalised
        result['time'] = round(time.time() - started, 2)
        return result
Ejemplo n.º 5
0
    def moushakeel_V2(self, text, smooth_const):
        """Diacritise *text*: word-level Viterbi first, letter-level pass after.

        For every sentence produced by ``MyToolKit.sents`` the diacritics and
        the tatweel character are removed, alifs are normalised and the
        ``#`` / ``$`` markers are added. Each word's vocalisations come from
        the ``dictionary`` table, falling back to the bare word when the
        lookup returns ``None``. ``self.Viterbi`` picks one vocalisation per
        word and ``self.LettersVocaliser`` post-processes the sentences.

        Returns:
            dict with ``'token'`` and ``'type'`` counters (summed over the
            sentences, markers excluded), ``'result'`` (list of sentences)
            and ``'time'`` (seconds elapsed, two decimals).
        """
        t_start = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result_teshkeel = []
        result = {'token': 0, 'type': 0}
        delimiters = ["\n", "\r", ".", ":", ",", ';']

        # ---------------- word-level vocalisation ----------------
        for sentence in tool.sents(text.strip(), delimiters,
                                   subsent=['"', "'", '-']):
            # TODO: revisit the replace below, it slightly changes the words
            # (tatweel removal).
            bare = tool.normalizeArabicAlif(tool.DeleteDiacritic(sentence))
            list_words = tool.words("# " + bare.replace('ـ', '') + " $")

            # Look up the possible vocalisations of each word.
            vocab = {}
            for word in list_words:
                found = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                      attribute='vocabularies')
                if found is None:
                    vocab[word] = word
                else:
                    vocab[word] = found[0]
            vocab['#'] = '#'
            vocab['$'] = '$'

            # Lattice for the HMM: one list of [-1, candidate, flag] per word.
            matrice = []
            for word in list_words:
                column = []
                for possib in tool.words(vocab[word]):
                    flag = 1 if possib == "#" else 0
                    column.append([-1, possib, flag])
                matrice.append(column)

            decoded = tool.words(self.Viterbi(matrice, smooth_const))
            # Strip the '#' and '$' markers from both ends.
            result_teshkeel.append(' '.join(decoded[1:len(decoded) - 1]))
            result['token'] += len(list_words) - 2
            result['type'] += len(vocab) - 2

        # ---------------- letter-level vocalisation ----------------
        result_teshkeel = self.LettersVocaliser(result_teshkeel, smooth_const)

        result['result'] = result_teshkeel
        result['time'] = round(time.time() - t_start, 2)
        return result
Ejemplo n.º 6
0
    def LaplaceSmoothing_letters(self, w1, w2, u, v):
        """Return the add-``u`` smoothed ratio of ``w2 + w1`` given ``w2``.

        ``w2`` is looked up in the ``letters_grams3`` table and the
        concatenation ``w2 + w1`` in the ``letters_grams4`` table; a missing
        row counts as 0. The value returned is
        ``(count(w2 + w1) + u) / (count(w2) + v * u)``.

        The frequency of ``w1`` in the ``letters`` table used to be fetched
        as well but never entered the result, so that lookup was removed to
        save one database round-trip per call.

        Args:
            w1: string appended to ``w2`` to form the 4-gram key.
            w2: key looked up in ``letters_grams3``.
            u: smoothing constant added to the numerator.
            v: multiplier of ``u`` in the denominator.
        """
        hdb = DBHandler('data/model.db')
        hdb.connect()

        context_row = hdb.SelectOne('letters_grams3',
                                    'grams="' + w2 + '"',
                                    attribute='freq')
        count_w2 = 0 if context_row is None else context_row[0]

        joint_row = hdb.SelectOne('letters_grams4',
                                  'grams="' + w2 + w1 + '"',
                                  attribute='freq')
        count_w1_w2 = 0 if joint_row is None else joint_row[0]

        return (count_w1_w2 + u) / (count_w2 + v * u)
Ejemplo n.º 7
0
    def AbsoluteDiscountingSmoothing_letters(self, w2, w1, d, v):
        """Return an absolute-discounting estimate built from three lookups.

        Frequencies are read for ``w1`` in ``letters``, for ``w2 + w1`` in
        ``letters_grams4`` and for ``w1`` in ``abs_letters``; a missing row
        counts as 0. When the ``letters`` count is 0 the result is 0,
        otherwise it is
        ``max(c(w2 + w1) - d, 0) / c(w1) + (d * c_abs(w1) * (1 / v)) / c(w1)``.
        """
        hdb = DBHandler('data/model.db')
        hdb.connect()

        def freq_of(row):
            # SelectOne yields None when nothing matches.
            return 0 if row is None else row[0]

        letter_row = hdb.SelectOne('letters',
                                   'letter="' + w1 + '"',
                                   attribute='freq')
        gram_row = hdb.SelectOne('letters_grams4',
                                 'grams="' + w2 + w1 + '"',
                                 attribute='freq')
        abs_row = hdb.SelectOne('abs_letters',
                                'letter="' + w1 + '"',
                                attribute='freq')

        count_w1 = freq_of(letter_row)
        count_w1_w2 = freq_of(gram_row)
        count_w_w1 = freq_of(abs_row)

        # Nothing to divide by when the letter was never seen.
        if count_w1 == 0:
            return 0

        discounted = max(count_w1_w2 - d, 0) / count_w1
        redistributed = (d * count_w_w1 * (1 / v)) / count_w1
        return discounted + redistributed
Ejemplo n.º 8
0
    def AbsoluteDiscountingSmoothing_letters(self, w2, w1, d, v):
        """Compute the absolute-discounting score of ``w1`` following ``w2``.

        Three frequencies are fetched (0 when the row is missing): ``w1``
        from ``letters``, ``w2 + w1`` from ``letters_grams4`` and ``w1`` from
        ``abs_letters``. Returns 0 when the ``letters`` frequency is 0;
        otherwise the discounted term ``max(c(w2 + w1) - d, 0) / c(w1)`` plus
        ``(d * c_abs(w1) * (1 / v)) / c(w1)``.
        """
        hdb = DBHandler('data/model.db')
        hdb.connect()

        # Run the three lookups first, in the same order as before.
        rows = [
            hdb.SelectOne('letters', 'letter="' + w1 + '"', attribute='freq'),
            hdb.SelectOne('letters_grams4', 'grams="' + w2 + w1 + '"',
                          attribute='freq'),
            hdb.SelectOne('abs_letters', 'letter="' + w1 + '"',
                          attribute='freq'),
        ]
        # A missing row means the item was never observed.
        count_w1, count_w1_w2, count_w_w1 = [
            0 if row is None else row[0] for row in rows
        ]

        if count_w1 == 0:
            return 0
        return (max(count_w1_w2 - d, 0) / count_w1) + (d * count_w_w1 * (1 / v)) / count_w1
Ejemplo n.º 9
0
 def LaplaceSmoothing_letters(self, w1, w2, u, v):
     """Return the add-``u`` smoothed ratio of ``w2 + w1`` given ``w2``.

     ``w2`` is looked up in the ``letters_grams3`` table and the
     concatenation ``w2 + w1`` in the ``letters_grams4`` table; a missing
     row counts as 0. The value returned is
     ``(count(w2 + w1) + u) / (count(w2) + v * u)``.

     The frequency of ``w1`` in the ``letters`` table used to be fetched as
     well but never entered the result, so that lookup was removed to save
     one database round-trip per call.

     Args:
         w1: string appended to ``w2`` to form the 4-gram key.
         w2: key looked up in ``letters_grams3``.
         u: smoothing constant added to the numerator.
         v: multiplier of ``u`` in the denominator.
     """
     hdb = DBHandler('data/model.db')
     hdb.connect()

     context_row = hdb.SelectOne('letters_grams3', 'grams="' + w2 + '"',
                                 attribute='freq')
     count_w2 = 0 if context_row is None else context_row[0]

     joint_row = hdb.SelectOne('letters_grams4', 'grams="' + w2 + w1 + '"',
                               attribute='freq')
     count_w1_w2 = 0 if joint_row is None else joint_row[0]

     return (count_w1_w2 + u) / (count_w2 + v * u)
Ejemplo n.º 10
0
def getdict(request):
    """Return the stored vocalisations of every word of the posted text.

    The ``text`` POST field is stripped, alif-normalised and stripped of
    diacritics, then tokenised. Each word is looked up in the ``dictionary``
    table; the vocalisations found are returned with every space replaced by
    ``'  -  '``, and words without a row map to an empty string.

    The JSON object holds ``results`` (word -> vocalisations), ``time``
    (elapsed seconds, two decimals), ``type`` (number of distinct words) and
    ``token`` (number of words).
    """
    started = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()

    posted = request.POST['text'].strip()
    text = tool.DeleteDiacritic(tool.normalizeArabicAlif(posted))
    list_words = tool.words(text)

    vocabularies = {}
    for word in list_words:
        row = hdb.SelectOne('dictionary', 'type="' + word + '"',
                            attribute='vocabularies')
        if row is None:
            vocabularies[word] = ''
        else:
            # Separate the alternatives visibly for display.
            vocabularies[word] = re.sub(' ', '  -  ', row[0])

    result = {
        'results': vocabularies,
        'time': round(time.time() - started, 2),
        'type': len(vocabularies),
        'token': len(list_words),
    }
    return HttpResponse(json.dumps(result), content_type="application/json")
Ejemplo n.º 11
0
 def moushakeel_V1(self, text, smooth_const):
     """Vocalise *text* by scoring every candidate vocalisation of each sentence.

     The text is split into sentences with ``MyToolKit.sents``. Each sentence
     is stripped of diacritics, alif-normalised and wrapped in the ``#`` /
     ``$`` markers. For every word the ``dictionary`` table is queried for its
     stored vocalisations (the word itself is used when no row is found).
     ``self.getPossibilities`` then builds the candidate sentences and the
     one with the highest ``self.sentProbaility`` score is kept.

     Args:
         text: raw input text.
         smooth_const: smoothing constant forwarded to ``sentProbaility``.

     Returns:
         dict with the keys ``'token'`` (word count summed over sentences,
         markers excluded), ``'type'`` (per-sentence distinct-word count,
         summed over sentences, markers excluded), ``'result'`` (list of
         vocalised sentences) and ``'time'`` (elapsed seconds, rounded to
         two decimals).
     """
     t1 = time.time()
     tool = MyToolKit()
     hdb = DBHandler('data/model.db')
     hdb.connect()
     result = {'token': 0, 'type': 0}
     result_teshkeel = []
     separators = ["\n", "\r", ".", ":", ",", ';']
     for sent in tool.sents(text.strip(), separators, subsent=['"', "'", '-']):
         marked = "# " + tool.normalizeArabicAlif(
             tool.DeleteDiacritic(sent)) + " $"
         list_words = tool.words(marked)
         # Fetch the stored vocalisations of every word.
         # NOTE(review): the condition is assembled by string concatenation,
         # so a word containing '"' would break the query - confirm whether
         # DBHandler offers a parameterised form.
         candidates = {}
         for word in list_words:
             res = hdb.SelectOne('dictionary',
                                 'type="' + word + '"',
                                 attribute='vocabularies')
             candidates[word] = word if res is None else res[0]
         candidates['#'] = '#'
         candidates['$'] = '$'
         max_p = 0
         best_sent = ''
         for possib in self.getPossibilities(list_words, candidates):
             p = self.sentProbaility(possib, smooth_const)
             if p > max_p:
                 # Remember the best score seen so far (the assignment used
                 # to be reversed, so the maximum was never updated).
                 max_p = p
                 best_sent = possib
         best_words = tool.words(best_sent)
         # Drop the first and last token (the '#' and '$' markers).
         result_teshkeel.append(' '.join(best_words[1:len(best_words) - 1]))
         # Accumulate over all sentences, as moushakeel_V2 does.
         result['token'] += len(list_words) - 2
         result['type'] += len(candidates) - 2
     result['result'] = result_teshkeel
     result['time'] = round(time.time() - t1, 2)
     return result
Ejemplo n.º 12
0
    def LaplaceSmoothing(self, w2, w1, u, v):
        """Compute the add-``u`` smoothed score of the bigram ``"w1 w2"``.

        Reads the frequency of ``w2`` from the ``words`` table and that of
        ``"w1 w2"`` from the ``grams2`` table, treating a missing row as 0,
        and returns ``(count(w1 w2) + u) / (count(w2) + v * u)``.

        NOTE(review): the unigram count in the denominator is that of ``w2``
        (the second word of the bigram key) - verify against the callers.
        """
        hdb = DBHandler('data/model.db')
        hdb.connect()

        def frequency(table, condition):
            # A lookup that matches nothing returns None.
            row = hdb.SelectOne(table, condition, attribute='freq')
            return 0 if row is None else row[0]

        count_w2 = frequency('words', 'word="' + w2 + '"')
        count_w1_w2 = frequency('grams2', 'grams="' + w1 + ' ' + w2 + '"')
        return (count_w1_w2 + u) / (count_w2 + v * u)
Ejemplo n.º 13
0
    def LettersVocaliser(self, sents, smooth_const):
        """Vocalise, letter by letter, the segments of *sents* left bare.

        ``self.getNotVocalised(sents)`` supplies, for every sentence, the
        list of segments to process. For each segment a lattice is built
        (one column per character, each cell being
        ``[back_pointer, symbol, flag]``), decoded with
        ``self.ViterbiLetter`` and re-aligned with ``self.alignLetter``. The
        resulting string replaces the original segment in the sentence.

        Args:
            sents: list of sentence strings; it is modified in place.
            smooth_const: smoothing constant forwarded to ``ViterbiLetter``.

        Returns:
            The same ``sents`` list, with the processed segments substituted.

        Raises:
            KeyError: when a character of a segment has no entry in the
                ``letters_dictionary`` table.
        """
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        rows = hdb.getFromTable('letters_dictionary',
                                attribute='type,vocabularies')
        # Maps a bare letter to its space-separated vocalised forms.
        letter_vocab = {row[0]: row[1] for row in rows}

        not_vocalised = self.getNotVocalised(sents)
        # Walk through the sentences.
        for i in range(len(sents)):
            # Walk through the unvocalised segments of this sentence.
            for segment in not_vocalised[i]:
                char = tool.LettersDiac(segment)
                # The first three columns are fixed: the two leading
                # (already vocalised) symbols and a '#'.
                # NOTE(review): presumably these are left context taken from
                # the neighbouring word - confirm against getNotVocalised.
                matrice = [
                    [[-1, tool.HideChar(char[0], expect=['#', ' ']), 1]],
                    [[0, tool.HideChar(char[1], expect=['#', ' ']), 1]],
                    [[0, "#", 1]],
                ]

                string = tool.DeleteDiacritic(segment)
                length = len(string)
                k = 3
                while k < length:
                    # An alif followed by a lam right after '#' gets two
                    # fixed columns instead of a dictionary lookup. The
                    # bound check avoids an IndexError when the segment
                    # ends on that alif.
                    alif_lam = (string[k - 1] == "#"
                                and string[k] == 'ا'
                                and k + 1 < length
                                and string[k + 1] == 'ل')
                    if alif_lam:
                        matrice.append([[0, '_', 1]])
                        matrice.append([[0, '_ْ', 1]])
                        k += 2
                    else:
                        # One cell per possible vocalisation of the letter.
                        list_dict = []
                        for possib in tool.words(letter_vocab[string[k]]):
                            if possib == "#":
                                list_dict.append([-1, possib, 1])
                            else:
                                hidden = tool.HideChar(possib,
                                                       expect=['#', ' '])
                                list_dict.append([-1, hidden, 0])
                        matrice.append(list_dict)
                        k += 1

                v = self.ViterbiLetter(matrice, smooth_const)
                aligned = self.alignLetter(v, segment)
                # '#' stands for a space inside segments; convert it back
                # before substituting into the sentence.
                sents[i] = sents[i].replace(
                    segment.replace('#', ' ').strip(),
                    aligned.replace('#', ' ').strip())

        return sents
Ejemplo n.º 14
0
    def LettersVocaliser(self, sents, smooth_const):
        """Vocalise, letter by letter, the segments of *sents* left bare.

        ``self.getNotVocalised(sents)`` supplies, for every sentence, the
        list of segments to process. For each segment a lattice is built
        (one column per character, each cell being
        ``[back_pointer, symbol, flag]``), decoded with
        ``self.ViterbiLetter`` and re-aligned with ``self.alignLetter``. The
        resulting string replaces the original segment in the sentence.

        Args:
            sents: list of sentence strings; it is modified in place.
            smooth_const: smoothing constant forwarded to ``ViterbiLetter``.

        Returns:
            The same ``sents`` list, with the processed segments substituted.

        Raises:
            KeyError: when a character of a segment has no entry in the
                ``letters_dictionary`` table.
        """
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        res = hdb.getFromTable('letters_dictionary',
                               attribute='type,vocabularies')
        # Maps a bare letter to its space-separated vocalised forms.
        letter_vocab = {}
        for r in res:
            letter_vocab[r[0]] = r[1]

        not_vocalised = self.getNotVocalised(sents)
        # Walk through the sentences.
        for i in range(len(sents)):
            # Walk through the unvocalised segments of this sentence.
            for segment in not_vocalised[i]:
                char = tool.LettersDiac(segment)
                # The first three columns are fixed: the two leading
                # (already vocalised) symbols and a '#'.
                # NOTE(review): presumably these are left context taken from
                # the neighbouring word - confirm against getNotVocalised.
                matrice = []
                matrice.append(
                    [[-1, tool.HideChar(char[0], expect=['#', ' ']), 1]])
                matrice.append(
                    [[0, tool.HideChar(char[1], expect=['#', ' ']), 1]])
                matrice.append([[0, "#", 1]])

                string = tool.DeleteDiacritic(segment)
                last = len(string) - 1
                k = 3
                while k <= last:
                    # An alif followed by a lam right after '#' gets two
                    # fixed columns instead of a dictionary lookup. The
                    # ``k < last`` check avoids an IndexError when the
                    # segment ends on that alif.
                    if (string[k - 1] == "#" and string[k] == 'ا'
                            and k < last and string[k + 1] == 'ل'):
                        matrice.append([[0, '_', 1]])
                        matrice.append([[0, '_ْ', 1]])
                        k += 2
                        continue

                    # One cell per possible vocalisation of the letter.
                    list_dict = []
                    for possib in tool.words(letter_vocab[string[k]]):
                        if possib == "#":
                            list_dict.append([-1, possib, 1])
                        else:
                            list_dict.append([
                                -1,
                                tool.HideChar(possib, expect=['#', ' ']),
                                0,
                            ])
                    matrice.append(list_dict)
                    k += 1

                v = self.ViterbiLetter(matrice, smooth_const)
                aligned = self.alignLetter(v, segment)
                # '#' stands for a space inside segments; convert it back
                # before substituting into the sentence.
                sents[i] = sents[i].replace(
                    segment.replace('#', ' ').strip(),
                    aligned.replace('#', ' ').strip())

        return sents