コード例 #1
0
    def LaplaceSmoothing_letters(self, w1, w2, u, v):
        """Return the additively smoothed ratio for ``w1`` following ``w2``.

        The value is ``(count(w2 + w1) + u) / (count(w2) + v * u)``, where
        ``count(w2)`` is the ``freq`` stored for ``w2`` in ``letters_grams3``
        and ``count(w2 + w1)`` the ``freq`` stored for the concatenation in
        ``letters_grams4``.  A missing row counts as 0.

        w1 -- string appended to ``w2`` to build the ``letters_grams4`` key.
        w2 -- string used as the ``letters_grams3`` key.
        u  -- smoothing constant added to the numerator.
        v  -- multiplier of ``u`` in the denominator (presumably the number
              of possible outcomes -- confirm with the caller).

        Raises ZeroDivisionError when ``count(w2) + v * u`` equals 0.
        """
        # NOTE(review): a new handler is connected on every call and is not
        # released in this method; DBHandler's API is not visible from here,
        # so confirm whether an explicit close is required.
        hdb = DBHandler('data/model.db')
        hdb.connect()

        def stored_freq(table, grams):
            # NOTE(review): the condition is assembled by string
            # concatenation; a '"' inside ``grams`` would break the query.
            row = hdb.SelectOne(table,
                                'grams="' + grams + '"',
                                attribute='freq')
            return row[0] if row is not None else 0

        count_w2 = stored_freq('letters_grams3', w2)
        count_w1_w2 = stored_freq('letters_grams4', w2 + w1)

        # The frequency of ``w1`` alone is intentionally not part of the
        # result (original note: "zero problem with the emission matrix
        # (* count_w1)"), so the 'letters' table is no longer queried.
        return (count_w1_w2 + u) / (count_w2 + v * u)
コード例 #2
0
    def AbsoluteDiscountingSmoothing_letters(self, w2, w1, d, v):
        """Return the absolute-discounting estimate for ``w1`` after ``w2``.

        With ``c1`` the ``freq`` of ``w1`` in ``letters``, ``c12`` the
        ``freq`` of ``w2 + w1`` in ``letters_grams4`` and ``cw`` the ``freq``
        of ``w1`` in ``abs_letters`` (missing rows count as 0), the result is
        ``max(c12 - d, 0) / c1 + (d * cw * (1 / v)) / c1``.

        w2 -- string placed first in the ``letters_grams4`` key.
        w1 -- string placed second in that key; also the key used in
              ``letters`` and ``abs_letters``.
        d  -- discount subtracted from the joint count.
        v  -- divisor applied to the second term (presumably the number of
              possible outcomes -- confirm with the caller).

        Returns 0 when ``w1`` has no (or a zero) frequency in ``letters``.
        """
        # NOTE(review): a new handler is connected on every call and is not
        # released in this method; confirm whether DBHandler needs a close.
        hdb = DBHandler('data/model.db')
        hdb.connect()

        def stored_freq(table, condition):
            # NOTE(review): conditions are assembled by string
            # concatenation; a '"' inside w1/w2 would break the query.
            row = hdb.SelectOne(table, condition, attribute='freq')
            return row[0] if row is not None else 0

        letter_condition = 'letter="' + w1 + '"'

        count_w1 = stored_freq('letters', letter_condition)
        if count_w1 == 0:
            # Nothing to normalise by: skip the two remaining lookups.
            return 0

        count_w1_w2 = stored_freq('letters_grams4',
                                  'grams="' + w2 + w1 + '"')
        count_w_w1 = stored_freq('abs_letters', letter_condition)

        # NOTE(review): both terms are normalised by the frequency of w1,
        # not by the frequency of the context w2 -- confirm this is intended.
        discounted = max(count_w1_w2 - d, 0) / count_w1
        redistributed = (d * count_w_w1 * (1 / v)) / count_w1
        return discounted + redistributed
コード例 #3
0
    def LaplaceSmoothing(self, w2, w1, u, v):
        """Return the additively smoothed ratio for the word pair ``w1 w2``.

        The value is ``(count(w1 + ' ' + w2) + u) / (count(w2) + v * u)``,
        where ``count(w2)`` is the ``freq`` stored for ``w2`` in ``words``
        and the pair count is the ``freq`` stored for ``"w1 w2"`` in
        ``grams2``.  A missing row counts as 0.

        Note the parameter order: ``w2`` comes first in the signature, while
        the ``grams2`` key is built as ``w1 + ' ' + w2``.

        w2 -- word looked up in ``words``; second word of the ``grams2`` key.
        w1 -- first word of the ``grams2`` key.
        u  -- smoothing constant added to the numerator.
        v  -- multiplier of ``u`` in the denominator (presumably the
              vocabulary size -- confirm with the caller).

        Raises ZeroDivisionError when ``count(w2) + v * u`` equals 0.
        """
        # NOTE(review): a new handler is connected on every call and is not
        # released in this method; confirm whether DBHandler needs a close.
        hdb = DBHandler('data/model.db')
        hdb.connect()

        def stored_freq(table, condition):
            # NOTE(review): conditions are assembled by string
            # concatenation; a '"' inside w1/w2 would break the query.
            row = hdb.SelectOne(table, condition, attribute='freq')
            return row[0] if row is not None else 0

        count_w2 = stored_freq('words', 'word="' + w2 + '"')
        count_w1_w2 = stored_freq('grams2', 'grams="' + w1 + ' ' + w2 + '"')

        return (count_w1_w2 + u) / (count_w2 + v * u)
コード例 #4
0
    def moushakeel_V2(self, text, smooth_const):
        """Vocalise ``text`` sentence by sentence, then refine it by letters.

        For every sentence produced by ``MyToolKit.sents`` the diacritics are
        removed, the alif is normalised, the tatweel character is stripped
        and the sentence is wrapped in the ``#`` / ``$`` boundary markers.
        Each word's candidate forms are read from the ``dictionary`` table
        (a word without an entry is its own single candidate), the
        candidates are arranged in a lattice and ``self.Viterbi`` picks the
        output sentence.  The per-sentence outputs are finally passed
        through ``self.LettersVocaliser``.

        text         -- raw input text.
        smooth_const -- value forwarded unchanged to ``self.Viterbi`` and
                        ``self.LettersVocaliser``.

        Returns a dict with the keys ``token`` (words processed, boundary
        markers excluded, summed over all sentences), ``type`` (distinct
        words per sentence, boundary markers excluded, summed over all
        sentences), ``result`` (value returned by ``LettersVocaliser``) and
        ``time`` (elapsed seconds, rounded to two decimals).
        """
        t1 = time.time()
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        result = {'token': 0, 'type': 0}
        result_teshkeel = []

        # ---------------------- Vocalisation by word ----------------------
        for sent in tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                               subsent=['"', "'", '-']):
            # TODO: revisit this replace -- stripping the tatweel slightly
            # alters the words.
            sent = "# " + tool.normalizeArabicAlif(
                tool.DeleteDiacritic(sent)).replace('ـ', '') + " $"
            list_words = tool.words(sent)

            # Fetch the candidate forms of every word.
            # NOTE(review): the condition is assembled by string
            # concatenation; a '"' inside a word would break the query.
            candidates = {}
            for word in list_words:
                res = hdb.SelectOne('dictionary',
                                    'type="' + word + '"',
                                    attribute='vocabularies')
                candidates[word] = word if res is None else res[0]
            # The boundary markers always map to themselves.
            candidates['#'] = '#'
            candidates['$'] = '$'

            # Lattice handed to Viterbi: one column per word, one
            # [-1, candidate, score] cell per candidate; only the '#' start
            # marker gets a score of 1 (presumably [back-pointer, state,
            # probability] -- confirm against Viterbi).
            matrice = []
            for word in list_words:
                matrice.append([[-1, possib, 1 if possib == "#" else 0]
                                for possib in tool.words(candidates[word])])

            best_words = tool.words(self.Viterbi(matrice, smooth_const))

            # Drop the first and last token (the '#' and '$' markers).
            result_teshkeel.append(' '.join(best_words[1:-1]))
            result['token'] += len(list_words) - 2
            result['type'] += len(candidates) - 2

        # --------------------- Vocalisation by letter ---------------------
        result_teshkeel = self.LettersVocaliser(result_teshkeel, smooth_const)

        result['result'] = result_teshkeel
        result['time'] = round(time.time() - t1, 2)
        return result
コード例 #5
0
ファイル: views.py プロジェクト: djidan10/Arabic-Diacritizer
def getdict(request):
    """Return, as JSON, the dictionary entry of every word in the POSTed text.

    The ``text`` field of ``request.POST`` is stripped, alif-normalised and
    cleared of diacritics, then split into words.  Each word is looked up in
    the ``dictionary`` table; the stored ``vocabularies`` string is returned
    with every single space replaced by ``'  -  '``, or ``''`` when the word
    has no entry.

    The JSON body has the keys ``results`` (word -> formatted entry),
    ``time`` (elapsed seconds, rounded to two decimals), ``type`` (number of
    distinct words) and ``token`` (total number of words).

    Raises whatever ``request.POST['text']`` raises when the field is absent.
    """
    t1 = time.time()
    tool = MyToolKit()
    # NOTE(review): the handler is connected per request and not released
    # here; confirm whether DBHandler needs an explicit close.
    hdb = DBHandler('data/model.db')
    hdb.connect()

    text = tool.DeleteDiacritic(
        tool.normalizeArabicAlif(request.POST['text'].strip()))
    list_words = tool.words(text)

    entries = {}
    for word in list_words:
        # The word comes from the request body and is embedded in a
        # double-quoted SQL literal, so embedded double quotes are doubled
        # (standard SQL escaping) to keep them from terminating the literal.
        # NOTE(review): a parameterised query would be safer if DBHandler
        # offers one.
        quoted = word.replace('"', '""')
        res = hdb.SelectOne('dictionary',
                            'type="' + quoted + '"',
                            attribute='vocabularies')
        if res is None:
            entries[word] = ''
        else:
            entries[word] = res[0].replace(' ', '  -  ')

    result = {
        'results': entries,
        'time': round(time.time() - t1, 2),
        'type': len(entries),
        'token': len(list_words),
    }
    return HttpResponse(json.dumps(result), content_type="application/json")
コード例 #6
0
 def moushakeel_V1(self, text, smooth_const):
     """Vocalise ``text`` by scoring every candidate sentence.

     For every sentence produced by ``MyToolKit.sents`` the diacritics are
     removed, the alif is normalised and the sentence is wrapped in the
     ``#`` / ``$`` boundary markers.  Each word's candidate forms are read
     from the ``dictionary`` table (a word without an entry is its own
     single candidate).  ``self.getPossibilities`` expands them into
     candidate sentences, each is scored with ``self.sentProbaility`` and
     the highest-scoring one is kept.

     text         -- raw input text.
     smooth_const -- value forwarded unchanged to ``self.sentProbaility``.

     Returns a dict with the keys ``token`` (words processed, boundary
     markers excluded, summed over all sentences), ``type`` (distinct words
     per sentence, boundary markers excluded, summed over all sentences),
     ``result`` (list with one vocalised string per sentence) and ``time``
     (elapsed seconds, rounded to two decimals).  A sentence for which no
     candidate scores above 0 contributes an empty string.
     """
     t1 = time.time()
     tool = MyToolKit()
     hdb = DBHandler('data/model.db')
     hdb.connect()
     result = {'token': 0, 'type': 0}
     result_teshkeel = []

     for sent in tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                            subsent=['"', "'", '-']):
         marked = "# " + tool.normalizeArabicAlif(
             tool.DeleteDiacritic(sent)) + " $"
         list_words = tool.words(marked)

         # Fetch the candidate forms of every word.
         # NOTE(review): the condition is assembled by string
         # concatenation; a '"' inside a word would break the query.
         candidates = {}
         for word in list_words:
             res = hdb.SelectOne('dictionary',
                                 'type="' + word + '"',
                                 attribute='vocabularies')
             candidates[word] = word if res is None else res[0]
         # The boundary markers always map to themselves.
         candidates['#'] = '#'
         candidates['$'] = '$'

         max_p = 0
         best_sent = ''
         for possib in self.getPossibilities(list_words, candidates):
             p = self.sentProbaility(possib, smooth_const)
             if p > max_p:
                 # Remember the best score seen so far; without this update
                 # any later candidate with p > 0 would replace a better one.
                 max_p = p
                 best_sent = possib

         # Drop the first and last token (the '#' and '$' markers).
         best_words = tool.words(best_sent)
         result_teshkeel.append(' '.join(best_words[1:-1]))

         # Accumulate over sentences instead of keeping only the last one.
         result['token'] += len(list_words) - 2
         result['type'] += len(candidates) - 2

     result['result'] = result_teshkeel
     result['time'] = round(time.time() - t1, 2)
     return result