def getNotVocalised(self, sents):
    """Collect, per sentence, the runs of consecutive words lacking diacritics.

    A word belongs to a run when ``tool.HasDiac(word)`` is false. Each run
    is encoded as one string::

        <context> + word_1 + '#' + word_2 + ... + word_n + '#'

    where ``<context>`` is ``'###'`` when the run starts the sentence, and
    otherwise the last two items of ``tool.LettersDiac(previous_word)``
    followed by ``'#'``.

    Args:
        sents: sequence of sentence strings.

    Returns:
        A list with one entry per sentence; each entry is the list of
        encoded runs found in that sentence, in order of appearance.
    """
    tool = MyToolKit()
    not_vocalised_by_sents = []
    for sent in sents:
        words = tool.words(sent)
        not_vocalised = []
        j = 0
        while j < len(words):
            if tool.HasDiac(words[j]):
                j += 1
                continue

            if j == 0:
                # No preceding word: pad the two context slots with '#'.
                prefix = '###'
            else:
                char = tool.LettersDiac(words[j - 1])
                # NOTE(review): if the previous word yields a single item,
                # len(char) - 2 is -1 and the same item is used twice;
                # confirm whether '#' padding is wanted in that case.
                prefix = char[len(char) - 2] + char[len(char) - 1] + '#'

            # Extend the run over every following word without diacritics.
            run = [words[j]]
            k = j + 1
            while k < len(words) and not tool.HasDiac(words[k]):
                run.append(words[k])
                k += 1
            not_vocalised.append(prefix + '#'.join(run) + '#')

            # Resume right after the run. The previous `j += k` jumped to
            # j + k and skipped words whenever the run did not start at 0.
            j = k
        not_vocalised_by_sents.append(not_vocalised)
    return not_vocalised_by_sents
def LettersVocaliser(self, sents, smooth_const):
    """Replace the un-vocalised runs of each sentence using a letter lattice.

    For every run returned by ``getNotVocalised`` a lattice is built with one
    column per character, decoded by ``ViterbiLetter``, mapped back onto the
    run by ``alignLetter`` and substituted into the sentence text.

    Args:
        sents: list of sentence strings; its items are overwritten in place.
        smooth_const: passed through unchanged to ``ViterbiLetter``.

    Returns:
        The same ``sents`` list after substitution.
    """
    toolkit = MyToolKit()
    handler = DBHandler('data/model.db')
    handler.connect()
    rows = handler.getFromTable(
        'letters_dictionary', attribute='type,vocabularies')
    # Map the first column of each row ('type') to the second
    # ('vocabularies').
    forms_by_letter = {row[0]: row[1] for row in rows}

    runs_by_sentence = self.getNotVocalised(sents)

    # Walk the sentences.
    for sent_idx, runs in enumerate(runs_by_sentence):
        # Walk the un-vocalised runs of this sentence.
        for run in runs:
            context = toolkit.LettersDiac(run)
            # The first three columns are fixed: the two context items and
            # the '#' separator. Each cell is a 3-item list whose numeric
            # fields are interpreted by ViterbiLetter (presumably a
            # back-pointer and a score; confirm against ViterbiLetter).
            lattice = [
                [[-1, toolkit.HideChar(context[0], expect=['#', ' ']), 1]],
                [[0, toolkit.HideChar(context[1], expect=['#', ' ']), 1]],
                [[0, "#", 1]],
            ]
            bare = toolkit.DeleteDiacritic(run)

            # Walk the characters that follow the fixed prefix.
            pos = 3
            while pos < len(bare):
                starts_word = bare[pos - 1] == "#"
                if starts_word and bare[pos] == 'ا' and bare[pos + 1] == 'ل':
                    # Word-initial two-letter sequence: two fixed columns,
                    # then skip both letters.
                    lattice.append([[0, '_', 1]])
                    lattice.append([[0, '_ْ', 1]])
                    pos += 2
                else:
                    # One cell per possible form listed for this character.
                    column = [
                        [-1, form, 1] if form == "#"
                        else [-1, toolkit.HideChar(form, expect=['#', ' ']), 0]
                        for form in toolkit.words(forms_by_letter[bare[pos]])
                    ]
                    lattice.append(column)
                    pos += 1

            decoded = self.ViterbiLetter(lattice, smooth_const)
            vocalised = self.alignLetter(decoded, run)

            # Swap the plain-text form of the run for its vocalised form.
            plain_old = run.replace('#', ' ').strip()
            plain_new = vocalised.replace('#', ' ').strip()
            sents[sent_idx] = sents[sent_idx].replace(plain_old, plain_new)

    return sents