def moushakeel_V1(self, text, smooth_const):
    """Vocalise *text* by scoring every candidate sentence and keeping the best.

    The text is cut into pieces by ``tool.sents``. Each piece is stripped of
    diacritics, alif-normalised and wrapped in the ``#`` / ``$`` boundary
    markers. Every word is then looked up in the ``dictionary`` table of
    ``data/model.db``; the candidate sentences built by
    ``self.getPossibilities`` are scored with ``self.sentProbaility`` and the
    highest-scoring one is kept.

    Args:
        text: Raw input text.
        smooth_const: Smoothing constant forwarded to ``self.sentProbaility``.

    Returns:
        A dict with the keys ``'token'`` (number of words, summed over all
        pieces), ``'type'`` (number of distinct words per piece, summed over
        all pieces), ``'result'`` (list with one vocalised string per piece)
        and ``'time'`` (elapsed seconds, rounded to 2 decimals).
    """
    t1 = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()
    result = {'token': 0, 'type': 0}
    result_teshkeel = []

    for sent in tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                           subsent=['"', "'", '-']):
        marked = "# " + tool.normalizeArabicAlif(tool.DeleteDiacritic(sent)) + " $"
        list_words = tool.words(marked)

        # Collect the candidate forms for each word; a word missing from the
        # dictionary is kept as its own single candidate.
        # NOTE(review): the condition is built by string concatenation, so a
        # word containing a double quote would break or alter the query.
        # Parameterise it if DBHandler offers a way to do so.
        candidates = {}
        for word in list_words:
            res = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                attribute='vocabularies')
            candidates[word] = word if res is None else res[0]
        candidates['#'] = '#'
        candidates['$'] = '$'

        # Exhaustive search for the most probable candidate sentence.
        # If no candidate scores above zero, best_sent stays ''.
        max_p = 0
        best_sent = ''
        for possib in self.getPossibilities(list_words, candidates):
            p = self.sentProbaility(possib, smooth_const)
            if p > max_p:
                # Remember the best score so far (the original assigned the
                # other way round, so max_p never moved from 0).
                max_p = p
                best_sent = possib

        # Drop the leading "#" and trailing "$" boundary markers.
        result_teshkeel.append(' '.join(tool.words(best_sent)[1:-1]))

        # Accumulate over all pieces; the "- 2" discounts the two markers.
        result['token'] += len(list_words) - 2
        result['type'] += len(candidates) - 2

    result['result'] = result_teshkeel
    result['time'] = round(time.time() - t1, 2)
    return result
Beispiel #2
0
    def moushakeel_V2(self, text, smooth_const):
        """Vocalise *text* with a word-level Viterbi pass, then a letter pass.

        The text is cut into pieces by ``tool.sents``. Each piece is stripped
        of diacritics, alif-normalised, cleared of tatweel characters and
        wrapped in the ``#`` / ``$`` boundary markers. Every word is looked up
        in the ``dictionary`` table of ``data/model.db`` and the candidate
        lattice is decoded by ``self.Viterbi``. The decoded pieces are finally
        handed to ``self.LettersVocaliser``.

        Args:
            text: Raw input text.
            smooth_const: Smoothing constant forwarded to ``self.Viterbi`` and
                ``self.LettersVocaliser``.

        Returns:
            A dict with the keys ``'token'`` (number of words, summed over all
            pieces), ``'type'`` (number of distinct words per piece, summed
            over all pieces), ``'result'`` (the value returned by
            ``self.LettersVocaliser``) and ``'time'`` (elapsed seconds,
            rounded to 2 decimals).
        """
        t1 = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result = {'token': 0, 'type': 0}
        result_teshkeel = []

        # ---------------------- Vocalisation by word ----------------------
        for raw_sent in tool.sents(text.strip(),
                                   ["\n", "\r", ".", ":", ",", ';'],
                                   subsent=['"', "'", '-']):
            # TODO: the tatweel removal should be revisited, since it
            # slightly alters the words.
            stripped = tool.normalizeArabicAlif(tool.DeleteDiacritic(raw_sent))
            marked = "# " + stripped.replace('ـ', '') + " $"
            list_words = tool.words(marked)

            # Collect the candidate forms for each word; a word missing from
            # the dictionary is kept as its own single candidate.
            # NOTE(review): the condition is built by string concatenation,
            # so a word containing a double quote would break or alter the
            # query. Parameterise it if DBHandler offers a way to do so.
            candidates = {}
            for word in list_words:
                res = hdb.SelectOne('dictionary',
                                    'type="' + word + '"',
                                    attribute='vocabularies')
                candidates[word] = word if res is None else res[0]
            candidates['#'] = '#'
            candidates['$'] = '$'

            # One lattice column per word, one [-1, form, score] cell per
            # candidate form. Only the "#" start marker is seeded with 1.
            # NOTE(review): presumably -1 is an unset back-pointer; confirm
            # against self.Viterbi.
            matrice = [
                [[-1, possib, 1 if possib == "#" else 0]
                 for possib in tool.words(candidates[word])]
                for word in list_words
            ]

            decoded = self.Viterbi(matrice, smooth_const)

            # Drop the leading "#" and trailing "$" boundary markers.
            result_teshkeel.append(' '.join(tool.words(decoded)[1:-1]))

            # The "- 2" discounts the two boundary markers.
            result['token'] += len(list_words) - 2
            result['type'] += len(candidates) - 2

        # --------------------- Vocalisation by letter ---------------------
        result_teshkeel = self.LettersVocaliser(result_teshkeel, smooth_const)

        result['result'] = result_teshkeel
        result['time'] = round(time.time() - t1, 2)
        return result
    def moushakeel_V2(self, text, smooth_const):
        """Vocalise *text* with a word-level Viterbi pass, then a letter pass.

        For every piece yielded by ``tool.sents`` the diacritics and tatweel
        characters are removed, alifs are normalised and the ``#`` / ``$``
        boundary markers are added. Each word is looked up in the
        ``dictionary`` table of ``data/model.db``, the resulting candidate
        lattice is decoded by ``self.Viterbi``, and all decoded pieces are
        then passed through ``self.LettersVocaliser``.

        Args:
            text: Raw input text.
            smooth_const: Smoothing constant forwarded to ``self.Viterbi`` and
                ``self.LettersVocaliser``.

        Returns:
            A dict with the keys ``'token'`` (number of words, summed over all
            pieces), ``'type'`` (number of distinct words per piece, summed
            over all pieces), ``'result'`` (the value returned by
            ``self.LettersVocaliser``) and ``'time'`` (elapsed seconds,
            rounded to 2 decimals).
        """
        t1 = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result = {'token': 0, 'type': 0}
        result_teshkeel = []

        # ---------------------- Vocalisation by word ----------------------
        separators = ["\n", "\r", ".", ":", ",", ';']
        for raw_sent in tool.sents(text.strip(), separators, subsent=['"', "'", '-']):
            # TODO: the tatweel removal should be revisited, since it
            # slightly alters the words.
            bare = tool.normalizeArabicAlif(tool.DeleteDiacritic(raw_sent)).replace('ـ', '')
            list_words = tool.words("# " + bare + " $")

            # Collect the candidate forms for each word; a word missing from
            # the dictionary is kept as its own single candidate.
            # NOTE(review): the condition is built by string concatenation,
            # so a word containing a double quote would break or alter the
            # query. Parameterise it if DBHandler offers a way to do so.
            candidates = {}
            for word in list_words:
                res = hdb.SelectOne('dictionary', 'type="' + word + '"', attribute='vocabularies')
                candidates[word] = word if res is None else res[0]
            candidates['#'] = '#'
            candidates['$'] = '$'

            # Build the lattice: one column per word, one [-1, form, score]
            # cell per candidate form. Only the "#" start marker is seeded
            # with 1.
            # NOTE(review): presumably -1 is an unset back-pointer; confirm
            # against self.Viterbi.
            matrice = []
            for word in list_words:
                column = [[-1, possib, 1 if possib == "#" else 0]
                          for possib in tool.words(candidates[word])]
                matrice.append(column)

            decoded = self.Viterbi(matrice, smooth_const)

            # Drop the leading "#" and trailing "$" boundary markers.
            result_teshkeel.append(' '.join(tool.words(decoded)[1:-1]))

            # The "- 2" discounts the two boundary markers.
            result['token'] += len(list_words) - 2
            result['type'] += len(candidates) - 2

        # --------------------- Vocalisation by letter ---------------------
        result_teshkeel = self.LettersVocaliser(result_teshkeel, smooth_const)

        result['result'] = result_teshkeel
        result['time'] = round(time.time() - t1, 2)
        return result
Beispiel #4
0
def getdict(request):
    """Return, as a JSON response, the dictionary entry of each posted word.

    Reads ``request.POST['text']``, normalises alifs, removes diacritics and
    splits the text into words. Each word is looked up in the ``dictionary``
    table of ``data/model.db``.

    The JSON payload has the keys ``'results'`` (word -> entry, with every
    single space of the stored entry replaced by ``'  -  '``, or ``''`` when
    the word is not found), ``'time'`` (elapsed seconds, rounded to 2
    decimals), ``'type'`` (number of distinct words) and ``'token'`` (number
    of words).
    """
    started = time.time()
    toolkit = MyToolKit()
    db = DBHandler('data/model.db')
    db.connect()

    posted = request.POST['text'].strip()
    cleaned = toolkit.DeleteDiacritic(toolkit.normalizeArabicAlif(posted))
    tokens = toolkit.words(cleaned)

    # NOTE(review): the condition is built by string concatenation from
    # request data; a word containing a double quote would break or alter
    # the query. Parameterise it if DBHandler offers a way to do so.
    entries = {}
    for token in tokens:
        row = db.SelectOne('dictionary', 'type="' + token + '"',
                           attribute='vocabularies')
        if row is None:
            entries[token] = ''
            continue
        entries[token] = re.sub(' ', '  -  ', row[0])

    # Key order is kept as-is because it is visible in the serialised JSON.
    payload = {
        'results': entries,
        'time': round(time.time() - started, 2),
        'type': len(entries),
        'token': len(tokens),
    }
    return HttpResponse(json.dumps(payload), content_type="application/json")
Beispiel #5
0
 def moushakeel_V1(self, text, smooth_const):
     """Vocalise *text* by scoring every candidate sentence and keeping the best.

     For every piece yielded by ``tool.sents`` the diacritics are removed,
     alifs are normalised and the ``#`` / ``$`` boundary markers are added.
     Each word is looked up in the ``dictionary`` table of ``data/model.db``;
     the candidate sentences built by ``self.getPossibilities`` are scored
     with ``self.sentProbaility`` and the highest-scoring one is kept.

     Args:
         text: Raw input text.
         smooth_const: Smoothing constant forwarded to ``self.sentProbaility``.

     Returns:
         A dict with the keys ``'token'`` (number of words, summed over all
         pieces), ``'type'`` (number of distinct words per piece, summed over
         all pieces), ``'result'`` (list with one vocalised string per piece)
         and ``'time'`` (elapsed seconds, rounded to 2 decimals).
     """
     t1 = time.time()
     tool = MyToolKit()
     hdb = DBHandler('data/model.db')
     hdb.connect()
     result = {'token': 0, 'type': 0}
     result_teshkeel = []

     for sent in tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                            subsent=['"', "'", '-']):
         marked = "# " + tool.normalizeArabicAlif(
             tool.DeleteDiacritic(sent)) + " $"
         list_words = tool.words(marked)

         # Collect the candidate forms for each word; a word missing from the
         # dictionary is kept as its own single candidate.
         # NOTE(review): the condition is built by string concatenation, so a
         # word containing a double quote would break or alter the query.
         # Parameterise it if DBHandler offers a way to do so.
         candidates = {}
         for word in list_words:
             res = hdb.SelectOne('dictionary',
                                 'type="' + word + '"',
                                 attribute='vocabularies')
             candidates[word] = word if res is None else res[0]
         candidates['#'] = '#'
         candidates['$'] = '$'

         # Exhaustive search for the most probable candidate sentence.
         # If no candidate scores above zero, best_sent stays ''.
         max_p = 0
         best_sent = ''
         for possib in self.getPossibilities(list_words, candidates):
             p = self.sentProbaility(possib, smooth_const)
             if p > max_p:
                 # Remember the best score so far (the original assigned the
                 # other way round, so max_p never moved from 0).
                 max_p = p
                 best_sent = possib

         # Drop the leading "#" and trailing "$" boundary markers.
         result_teshkeel.append(' '.join(tool.words(best_sent)[1:-1]))

         # Accumulate over all pieces; the "- 2" discounts the two markers.
         result['token'] += len(list_words) - 2
         result['type'] += len(candidates) - 2

     result['result'] = result_teshkeel
     result['time'] = round(time.time() - t1, 2)
     return result