def test_word_text(self):
    """Check the word/text level predicates exposed by ``Araby``.

    Covers ``is_vocalized``, ``is_vocalizedtext``, ``is_arabicstring`` and
    ``is_arabicword``.
    """
    bare_word = u'العربية'

    # is_vocalized(word): a bare word is rejected, a diacritized one accepted.
    self.assertFalse(Araby.is_vocalized(bare_word))
    self.assertTrue(Araby.is_vocalized(u'الْعَرَبِيّةُ'))

    # is_vocalizedtext(text): same check applied to a multi-word text.
    self.assertFalse(Araby.is_vocalizedtext(u"العربية لغة جميلة"))
    self.assertTrue(Araby.is_vocalizedtext(u'الْعَرَبيَّة لُغَةٌ جَمِيلَةٌ'))

    # is_arabicstring  TODO: add more examples
    self.assertTrue(Araby.is_arabicstring(bare_word))

    # is_arabicrange  TODO: add test

    # is_arabicword  TODO: test other cases
    rejected_words = (
        u"",         # empty string
        u"ْلاندخل",  # starts with sukun
        u'ؤكل',      # starts with waw with hamza above
        u'ئكل',      # starts with yeh with hamza above
        u'ةدخل',     # starts with teh marbuta
    )
    for candidate in rejected_words:
        self.assertFalse(Araby.is_arabicword(candidate))
    self.assertTrue(Araby.is_arabicword(bare_word))
def search_arabic(self, q, fetch_subgraph=True, limit=DEFAULT_LIMIT,
                  fetchplan=DEFAULT_FETCHPLAN):
    """
    Search for a label, taking the vocalization of the query into account.

    (Without a fetchplan the result is of limited use, since only index
    nodes are returned.)
    """
    unvocalized_index = "ArabicNode.unvocalized_label"

    if araby.is_vocalized(q):
        # Vocalized query: look up its stripped form in the unvocalized
        # index, then keep the hits whose label is vocalization-compatible.
        matches = self.search_index(araby.strip_tashkeel(q), False,
                                    unvocalized_index, limit)
        compatible = [
            node.rid for node in matches.primary_results
            if Tools.is_vocalized_like(q, node.data["label"])
        ]
        # No compatible vocalization at all: ignore vocalization entirely.
        rids = compatible or [node.rid for node in matches.primary_results]
        return self.get_nodes(rids, fetch_subgraph, limit, fetchplan)

    # Unvocalized query: the unvocalized index answers it directly
    # (optionally with the subtree).
    return self.search_index(q, fetch_subgraph, unvocalized_index, limit,
                             fetchplan)
def search_arabic(self,
                  q,
                  fetch_subgraph=True,
                  limit=DEFAULT_LIMIT,
                  fetchplan=DEFAULT_FETCHPLAN):
    """
    Look up a label while handling vocalized and unvocalized queries.

    (A fetchplan is practically required: without one only index nodes
    come back.)
    """
    index_name = "ArabicNode.unvocalized_label"

    # Plain query: a direct lookup in the unvocalized index is enough.
    if not araby.is_vocalized(q):
        return self.search_index(q, fetch_subgraph, index_name, limit,
                                 fetchplan)

    # Vocalized query: search on the stripped form, then filter the hits
    # by vocalization compatibility with the query.
    found = self.search_index(araby.strip_tashkeel(q), False, index_name,
                              limit)
    rids = []
    for node in found.primary_results:
        if Tools.is_vocalized_like(q, node.data["label"]):
            rids.append(node.rid)

    # Nothing compatible: fall back to every hit, ignoring vocalization.
    if not rids:
        rids = [node.rid for node in found.primary_results]

    return self.get_nodes(rids, fetch_subgraph, limit, fetchplan)
def check_partial_vocalized(word_vocalised, resulted_data):
    """
    Filter analysis results against a fully or partially vocalized input.

    When the input word carries no vocalization, the results are returned
    untouched. Otherwise only the entries whose 'vocalized' form is
    vocalized-like the input are kept, and each kept entry gets the
    partial-vocalization tag appended to its 'tags' (the entry is modified
    in place).

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # Nothing to compare against when the input has no vocalization.
    if not araby.is_vocalized(word_vocalised):
        return resulted_data

    kept = []
    for candidate in resulted_data:
        if 'vocalized' not in candidate:
            continue
        # Compare the vocalized output with the vocalized input.
        if not araby.vocalizedlike(word_vocalised, candidate['vocalized']):
            continue
        candidate['tags'] += ':' + analex_const.partialVocalizedTag
        kept.append(candidate)
    return kept
def check_fields(self, fields):
    """
    Validate the 'vocalized' field of an entry.

    Returns "ok" when every check passes, otherwise the message of the
    first failing check. An entry that is a valid, not yet seen Arabic word
    is recorded in ``self.index`` before the vocalization checks run.
    """
    entry = fields.get('vocalized', '')

    if not entry:
        return "Error: Empty vocalized"
    if not ar.is_arabicword(entry):
        return "Error: Invalid Arabic word "

    # Reject entries already seen, remember new ones.
    if entry in self.index:
        return "Error: Duplicated Entry "
    self.index.append(entry)

    if not ar.is_vocalized(entry):
        # the word must carry vocalization
        verdict = "Error: Not Vocalized"
    elif not verify_tashkeel(entry):
        # the vocalization itself must be valid
        verdict = "Error: Error in Vocalization "
    else:
        verdict = "ok"
    return verdict
def check_fields(self, fields):
    """
    Validate the 'vocalized' field of a verb entry.

    Returns "ok" when every check passes, otherwise the message of the
    first failing check. A duplicated entry is reported as a warning when
    its 'unvocalized' form has at most three characters, and as an error
    otherwise. A new entry is recorded in ``self.index`` before the
    vocalization checks run.
    """
    entry = fields.get('vocalized', '')
    bare = fields.get('unvocalized', '')

    if not entry:
        return "Error: Empty vocalized"
    if not ar.is_arabicword(entry):
        return "Error: Invalid Arabic word "
    if not is_valid_infinitive_verb(entry):
        return "Error: Invalid Arabic infinitive verb "

    # Reject entries already seen, remember new ones.
    if entry in self.index:
        return ("Warning: Duplicated Entry " if len(bare) <= 3
                else "Error: Duplicated Entry ")
    self.index.append(entry)

    if not ar.is_vocalized(entry):
        # the word must carry vocalization
        verdict = "Error: Not Vocalized"
    elif not verify_tashkeel(entry):
        # the vocalization itself must be valid
        verdict = "Error: Error in Vocalization "
    else:
        verdict = "ok"
    return verdict
def check_partial_vocalized(word_vocalised, resulted_data):
    """
    Filter analysis results against a fully or partially vocalized input.

    When the input word carries no vocalization, the results are returned
    untouched. Otherwise an entry is kept when its 'vocalized' form is
    vocalized-like the input, or - for entries whose 'type' contains
    "Verb" - when the input ends with a kasra where the entry ends with a
    sukun and the two forms are vocalized-like once that last mark is
    dropped. Each kept entry gets the partial-vocalization tag appended to
    its 'tags' (the entry is modified in place).

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the results found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # Nothing to compare against when the input has no vocalization.
    if not araby.is_vocalized(word_vocalised):
        return resulted_data

    kept = []
    for candidate in resulted_data:
        if 'vocalized' not in candidate:
            continue
        analysed = candidate['vocalized']
        is_verb = "Verb" in candidate['type']

        # Compare the vocalized output with the vocalized input.
        accepted = araby.vocalizedlike(word_vocalised, analysed)
        if not accepted:
            # Meeting of two sukuns in already-vocalized text: the verb
            # ends with a kasra in the input instead of the sukun.
            accepted = (is_verb
                        and word_vocalised.endswith(araby.KASRA)
                        and analysed.endswith(araby.SUKUN)
                        and araby.vocalizedlike(word_vocalised[:-1],
                                                analysed[:-1]))
        if accepted:
            candidate['tags'] += ':' + analex_const.PARTIAL_VOCALIZED_TAG
            kept.append(candidate)
    return kept
def contextMenuEvent(self, event):
    """
    Show the standard context menu extended with vocalization suggestions.

    The word under the cursor is selected, stripped of surrounding Arabic
    punctuation and checked with ``self.dict``. For a known word the first
    ten suggestions are added directly to the popup menu and the remaining
    ones to a submenu; a word with a single unvocalized suggestion also
    gets an "add to dictionary" entry plus the custom-dictionary entries.
    For a word that fails the check, only the custom-dictionary entries
    are offered.

    @param event: the context-menu event; only its global position is used.
    """
    popup_menu = self.createStandardContextMenu()
    # Value handed to setLayoutDirection (presumably Qt.RightToLeft).
    RightToLeft = 1

    # Select the word under the cursor.
    cursor = self.textCursor()
    cursor.select(QTextCursor.WordUnderCursor)
    self.setTextCursor(cursor)

    if self.textCursor().hasSelection():
        # Workaround for a Qt bug: selecting a word also selects adjacent
        # Arabic punctuation marks, see
        # https://bugreports.qt-project.org/browse/QTBUG-42397
        originaltext = unicode(self.textCursor().selectedText())
        arabicmarks = [u'؟', u'،', u'؛', u'“', u'”', u'‘', u'’']
        holder = originaltext[-1]
        # Remember a trailing punctuation mark, if any.
        self.pretxt = holder if holder in arabicmarks else ''
        text = originaltext.strip(u'؟،؛“”‘’')

        # Created before branching: every action below uses this menu as
        # its parent, including the ones built in the else branch, which
        # previously hit an undefined name.
        spell_menu = QMenu(u'المزيد...')
        spell_menu.setLayoutDirection(RightToLeft)

        # The word is already analyzed.
        if self.dict.check(text):
            suggests = self.dict.suggest(text)
            for word in suggests[:10]:
                action = SpellAction(word, spell_menu)
                action.correct.connect(self.correctWord)
                popup_menu.addAction(action)
            popup_menu.setStyleSheet("QMenu {font: 24px;}")

            # Suggestions beyond the tenth go into the "more" submenu.
            if len(suggests) > 10:
                for word in suggests[10:]:
                    action = SpellAction(word, spell_menu)
                    action.correct.connect(self.correctWord)
                    spell_menu.addAction(action)
                spell_menu.setStyleSheet("QMenu {font: 24px;}")
                popup_menu.addSeparator()
                popup_menu.addMenu(spell_menu)

            if len(suggests) == 1 and not araby.is_vocalized(suggests[0]):
                addtodict_action = popup_menu.addAction(u'أضف للقاموس')
                addtodict_action.triggered.connect(
                    lambda: self.add_to_dict(originaltext))
                # The word has no vocalized suggestion: look up customized
                # vocalizations. The key is the only suggestion, which is
                # what the leaked loop variable used to hold here.
                custom_suggests = self.dict.custom_dict.lookup(suggests[0])
                for word in custom_suggests:
                    action = SpellAction(word, spell_menu)
                    action.correct.connect(self.correctWord)
                    popup_menu.addAction(action)
        else:
            # Word not accepted by the dictionary: only customized
            # vocalizations can be offered.
            custom_suggests = self.dict.custom_dict.lookup(text)
            for word in custom_suggests:
                action = SpellAction(word, spell_menu)
                action.correct.connect(self.correctWord)
                popup_menu.addAction(action)

    popup_menu.exec_(event.globalPos())
if araby.is_moon(c): print('moon', end=" ") if araby.is_sun(c): print('sun', end=" ") print(araby.order(c), end=" ") print() word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ") if araby.is_vocalized(word): print(' is vocalized', end=" ") if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ") if araby.is_arabicword(word): print(' is valid word', end=" ") else: print("invalid arabic word", end=" ") print(' strip harakat', araby.strip_harakat(word), end=" ") print(' strip tashkeel', araby.strip_tashkeel(word), end=" ") print(' strip tatweel', araby.strip_tatweel(word), end=" ") print(' normalize ligature ', araby.normalize_ligature(word), end=" ") if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ") print() word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ") word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ",
# Demo fragment: prints character classes, then runs the word-level araby
# helpers over a few sample words.
# NOTE(review): `c` is not bound anywhere in this fragment; the checks below
# presumably belong to the body of a per-character loop whose header is
# outside this view - confirm the nesting before relying on this layout.
# NOTE: `print (x),` is the Python 2 newline-suppressing form; under
# Python 3 the trailing comma only builds a discarded tuple and the newline
# is still printed.
if araby.is_weak(c): print ('weak'),
if araby.is_moon(c): print ('moon'),
if araby.is_sun(c):print ('sun'),
print (araby.order(c)),
print ();
word=u"الْعَرَيِيّةُ"
# Sample inputs: vocalized word, bare word, mixed text, unvocalized text,
# a non-Arabic string and a sentence containing hamza forms.
word_list=[
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
    u"سئل لأنه يؤم الإمام"
]
# Previous word of the iteration, compared against the current one below.
word1=u""
for word in word_list:
    print (word)
    if araby.is_vocalized(word): print (' is vocalized')
    if araby.is_vocalizedtext(word): print (' is vocalized text')
    if araby.is_arabicword(word): print (' is valid word')
    else: print ("invalid arabic word")
    print (' strip harakat', araby.strip_harakat(word))
    print (' strip tashkeel', araby.strip_tashkeel(word))
    print (' strip tatweel',araby.strip_tatweel(word))
    print (' normalize ligature ', araby.normalize_ligature(word))
    print (' normalize hamza', araby.normalize_hamza(word))
    # Compare the current word with the one from the previous iteration.
    if araby.vocalizedlike(word, word1): print ("vocalized_like")
    word1=word;
# Fixed pair: a bare word against a partially vocalized form of it.
if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")
if araby.is_moon(c): print 'moon', if araby.is_sun(c):print 'sun', print araby.order(c), print; word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print; word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like", word=u"الْعَرَيِيّةُ" word_list=[
def decoupage(word): """Découpe le mot donné en entrée (word) en (préfixes, racine et suffixes). La sortie de la fonction est une liste de dictionnaires regroupant toutes les combinaisons syntaxiquement correctes d'aprés la compatibilitée entre les préfixes et sufixes détéctés et la taille de la racine.""" word_unvocalized = araby.strip_diacritics(word) prefixes, suffixes = [""], [""] combinaisons_possibles = [] for p in Prefixe.objects.all(): if word_unvocalized.startswith(p.unvoweled_form): # print("p:"+p.unvoweled_form) if araby.is_vocalized(word): if araby.vocalizedlike(word[:len(p.voweled_form)], p.voweled_form): prefixes.append(p) else: prefixes.append(p) for s in Suffixe.objects.all(): if word_unvocalized.endswith(s.unvoweled_form): if araby.is_vocalized(word): if araby.vocalizedlike(word[-len(s.voweled_form):], s.voweled_form): suffixes.append(s) else: suffixes.append(s) for pr in prefixes: for sf in suffixes: # Validation criteria if pr != "" and sf != "": if (len(word_unvocalized) - len(pr.unvoweled_form) - len(sf.unvoweled_form)) <= 2 or \ (len(word_unvocalized) - len(pr.unvoweled_form) - len(sf.unvoweled_form)) > 9: continue if ((pr.classe[0] == 'N' and sf.classe[0] == 'V') or (pr.classe[0] == 'V' and sf.classe[0] == 'N') or (pr.classe in ['N1', 'N2', 'N3', 'N5'])): continue # Si on est là -> le préfixe est compatible avec le suffixe, et la taille de la base est accéptable base = word # Supprimer le prefixe de la base // En gardant le Tachkil if pr: for char in pr.unvoweled_form: while char != base[0]: base = base[1:] base = base[1:] while araby.is_tashkeel(base[0]): base = base[1:] # Supprimer le suffixe de la base // En gardant le Tachkil if sf: r_sf = [c for c in sf.unvoweled_form] r_sf.reverse() for char in r_sf: base = base[:base.rindex(char)] combinaisons_possibles.append({ 'Base': base, 'Préfixe': pr, 'Suffixe': sf }) return combinaisons_possibles
# Demo fragment (Python 2 print statements): prints character classes, then
# runs the word-level araby helpers over a few sample words.
# NOTE(review): `c` is not bound anywhere in this fragment; the checks below
# presumably belong to the body of a per-character loop whose header is
# outside this view - confirm the nesting before relying on this layout.
# A trailing comma after a print statement keeps the output on the same line.
if araby.is_moon(c): print 'moon',
if araby.is_sun(c): print 'sun',
print araby.order(c),
# Bare print: terminates the current output line.
print
word = u"الْعَرَيِيّةُ"
# Sample inputs: vocalized word, bare word, mixed text, unvocalized text and
# a non-Arabic string.
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# Previous word of the iteration, compared against the current one below.
word1 = u""
for word in word_list:
    # Unicode text is encoded to UTF-8 bytes before being printed.
    print word.encode('utf8'), '\t',
    if araby.is_vocalized(word): print ' is vocalized',
    ## if araby.isArabicstring(word): print ' iisArabicstring',
    ## else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print ' is vocalized text',
    if araby.is_arabicword(word): print ' is valid word',
    else: print "invalid arabic word",
    print ' strip harakat', araby.strip_harakat(word).encode('utf8'),
    print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'),
    print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'),
    print ' normalize ligature ', araby.normalize_ligature(word).encode(
        'utf8'),
    # Compare the current word with the one from the previous iteration.
    if araby.vocalizedlike(word, word1): print "vocalized_like",
    print
    word1 = word
# Fixed pair: a bare word against a partially vocalized form of it.
if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like",
word = u"الْعَرَيِيّةُ"
def contextMenuEvent(self, event):
    """
    Show the standard context menu extended with vocalization suggestions.

    The word under the cursor is selected, stripped of surrounding Arabic
    punctuation and checked with ``self.dict``. For a known word the first
    ten suggestions are added directly to the popup menu and the remaining
    ones to a submenu; a word with a single unvocalized suggestion also
    gets an "add to dictionary" entry plus the custom-dictionary entries.
    For a word that fails the check, only the custom-dictionary entries
    are offered.
    """
    menu = self.createStandardContextMenu()
    RightToLeft = 1

    def add_suggestions(words, parent, target):
        # One SpellAction per word, parented on `parent`, shown in `target`.
        for entry in words:
            spell_action = SpellAction(entry, parent)
            spell_action.correct.connect(self.correctWord)
            target.addAction(spell_action)

    # Select the word under the cursor.
    word_cursor = self.textCursor()
    word_cursor.select(QTextCursor.WordUnderCursor)
    self.setTextCursor(word_cursor)

    if self.textCursor().hasSelection():
        # Workaround for a Qt bug: selecting a word also selects adjacent
        # Arabic punctuation marks, see
        # https://bugreports.qt-project.org/browse/QTBUG-42397
        originaltext = self.textCursor().selectedText()
        arabicmarks = [u'؟', u'،', u'؛', u'“', u'”', u'‘', u'’']
        last_char = originaltext[-1]
        # Remember a trailing punctuation mark, if any.
        self.pretxt = last_char if last_char in arabicmarks else ''
        text = originaltext.strip(u'؟،؛“”‘’')

        more_menu = QMenu(u'المزيد...')
        more_menu.setLayoutDirection(RightToLeft)

        if not self.dict.check(text):
            # Word not accepted by the dictionary: only customized
            # vocalizations can be offered.
            add_suggestions(self.dict.custom_dict.lookup(text), more_menu,
                            menu)
        else:
            # The word is already analyzed.
            suggests = self.dict.suggest(text)
            add_suggestions(suggests[:10], more_menu, menu)
            menu.setStyleSheet("QMenu {font: 24px;}")

            # Suggestions beyond the tenth go into the "more" submenu.
            if len(suggests) > 10:
                add_suggestions(suggests[10:], more_menu, more_menu)
                more_menu.setStyleSheet("QMenu {font: 24px;}")
                menu.addSeparator()
                menu.addMenu(more_menu)

            if len(suggests) == 1 and not araby.is_vocalized(suggests[0]):
                addtodict_action = menu.addAction(u'أضف للقاموس')
                addtodict_action.triggered.connect(
                    lambda: self.add_to_dict(originaltext))
                # Look up customized vocalizations for the only suggestion.
                add_suggestions(self.dict.custom_dict.lookup(suggests[0]),
                                more_menu, menu)

    menu.exec_(event.globalPos())