def waznlike2(word1, wazn, extract_root=False):
    u"""Tell whether word1 follows the given wazn (morphological pattern).

    In the wazn, FEH, AIN and LAM act as generic placeholders: each one
    accepts any non-haraka character of word1, and that character is
    recorded as a root letter. Every other wazn character has to be matched
    by an identical character in word1. A haraka present on one side only
    is skipped, so both strings may be fully or partially vocalized.

    Example:
        >>> word1 = u"ضارب"
        >>> wazn = u"فَاعِل"
        >>> araby.waznlike2(word1, wazn)
        True

    @param word1: input word
    @type word1: unicode
    @param wazn: given word template (wazn)
    @type wazn: unicode
    @param extract_root: when true, return the collected root letters
        instead of True on a successful match
    @type extract_root: Boolean
    @return: False when the word does not fit the wazn; otherwise True, or
        the root letters when extract_root is set
    @rtype: Boolean or unicode
    """
    generic_letters = (FEH, AIN, LAM)
    harakat = HARAKAT
    word_stack = stack.Stack(word1)
    wazn_stack = stack.Stack(wazn)
    root_stack = stack.Stack()
    word_char = word_stack.pop()
    wazn_char = wazn_stack.pop()

    while word_char is not None and wazn_char is not None:
        word_is_haraka = word_char in harakat
        wazn_is_haraka = wazn_char in harakat
        wazn_is_generic = wazn_char in generic_letters

        # Decide which side(s) move on to their next character.
        step_word = False
        step_wazn = False
        if word_char == wazn_char and not wazn_is_generic:
            # identical literal characters
            step_word = step_wazn = True
        elif wazn_is_generic and not word_is_haraka:
            # a placeholder swallows one letter, which belongs to the root
            root_stack.push(word_char)
            step_word = step_wazn = True
        elif word_is_haraka and not wazn_is_haraka:
            # haraka present only in the word: ignore it
            step_word = True
        elif wazn_is_haraka and not word_is_haraka:
            # haraka present only in the wazn: ignore it
            step_wazn = True
        else:
            break

        if step_word:
            word_char = word_stack.pop()
        if step_wazn:
            wazn_char = wazn_stack.pop()

    # Anything left on a stack, or still held in hand, means no match.
    if not word_stack.is_empty() or not wazn_stack.is_empty():
        return False
    if word_char is not None or wazn_char is not None:
        return False
    if extract_root:
        # root letters were collected in pop order, so put them back in
        # reading order
        return "".join(reversed(root_stack.items))
    return True
def joint(letters, marks):
    """
    Join the letters with their marks and return the resulting word.

    letters and marks must have the same length: the mark at position i
    belongs to the letter at position i. A mark equal to NOT_DEF_HARAKA
    means "no haraka" and is not written. When the letter is a SHADDA, a
    haraka written just before it (the mark of the previous letter) is
    removed before the shadda is appended.

    Example:
        >>> letters = u"العربية"
        >>> marks = u'\\u064e\\u0652\\u064e\\u064e\\u064e\\u064e\\u064f'
        >>> word = araby.joint(letters, marks)
        >>> print word.encode('utf8')
        اَلْعَرَبَيَةُ

    @param letters: the word letters
    @type letters: unicode
    @param marks: the word marks
    @type marks: unicode
    @return: the joined word; an empty string when the lengths differ;
        False if some input is left unconsumed after the walk
    @rtype: unicode (or False)
    """
    # The length of letters and marks must be equal
    if len(letters) != len(marks):
        return ""

    # Reverse the items so that pop() yields the characters in reading order.
    stack_letter = stack.Stack(letters)
    stack_letter.items.reverse()
    stack_mark = stack.Stack(marks)
    stack_mark.items.reverse()

    word_stack = stack.Stack()
    vowels = HARAKAT
    last_letter = stack_letter.pop()
    last_mark = stack_mark.pop()

    while last_letter is not None and last_mark is not None:
        if last_letter == SHADDA:
            # Drop a haraka sitting right before the shadda; anything else
            # is put back. top is None when the shadda is the very first
            # letter: pushing it back would make the final join fail.
            top = word_stack.pop()
            if top is not None and top not in vowels:
                word_stack.push(top)
        word_stack.push(last_letter)
        if last_mark != NOT_DEF_HARAKA:
            word_stack.push(last_mark)

        last_letter = stack_letter.pop()
        last_mark = stack_mark.pop()

    if not (stack_letter.is_empty() and stack_mark.is_empty()):
        return False
    return ''.join(word_stack.items)
def vocalized_similarity(word1, word2):
    """
    Compare two words that may be fully or partially vocalized.

    A haraka present in only one of the two words is ignored. Any other
    difference counts as one error. The function returns True when no
    error was found, otherwise the negated number of errors.

    Example:
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٌ"
        >>> araby.vocalized_similarity(word1, word2)
        True
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٍ"
        >>> araby.vocalized_similarity(word1, word2)
        -1

    @param word1: first word
    @type word1: unicode
    @param word2: second word
    @type word2: unicode
    @return: True if the words are similar, else the negative number of errors
    @rtype: Boolean / int
    """
    harakat = HARAKAT
    first = stack.Stack(word1)
    second = stack.Stack(word2)
    char1 = first.pop()
    char2 = second.pop()
    mismatches = 0

    # NOTE(review): the walk ends as soon as either word is exhausted, so
    # characters still left in the other word are not counted as errors.
    while not (char1 is None or char2 is None):
        is_haraka1 = char1 in harakat
        is_haraka2 = char2 in harakat

        if char1 == char2:
            advance1 = advance2 = True
        elif is_haraka1 != is_haraka2:
            # exactly one side holds a haraka: skip it on that side only
            advance1 = is_haraka1
            advance2 = is_haraka2
        else:
            # real disagreement; a lone shadda is skipped on its own side
            mismatches += 1
            if char1 == SHADDA:
                advance1, advance2 = True, False
            elif char2 == SHADDA:
                advance1, advance2 = False, True
            else:
                advance1 = advance2 = True

        if advance1:
            char1 = first.pop()
        if advance2:
            char2 = second.pop()

    if mismatches > 0:
        return -mismatches
    return True
def shaddalike(partial, fully):
    """
    Check that every shadda of a partially vocalized word is also present,
    at the same place, in the fully vocalized word.

    When the partial word has no shadda there is nothing to check and the
    result is True. When it has one but the full word has none, the result
    is False. Otherwise both words are stripped of their harakat and
    compared; a shadda found only in the full word is tolerated.

    Example:
        >>> word1 = u"ردّ"
        >>> word2 = u"ردَّ"
        >>> araby.shaddalike(word1, word2)
        True

    @param partial: the partially vocalized word
    @type partial: unicode
    @param fully: the fully vocalized word
    @type fully: unicode
    @return: whether the shadda placement of partial agrees with fully
    @rtype: Boolean
    """
    # The input has no shadda: no need to search.
    if not has_shadda(partial):
        return True
    # The input has a shadda but the result has none: wrong.
    if not has_shadda(fully) and has_shadda(partial):
        return False

    # Both words carry a shadda: verify the positions.
    partial_stack = stack.Stack(strip_harakat(partial))
    full_stack = stack.Stack(strip_harakat(fully))
    partial_char = partial_stack.pop()
    full_char = full_stack.pop()

    while partial_char is not None and full_char is not None:
        if partial_char == full_char:
            partial_char = partial_stack.pop()
            full_char = full_stack.pop()
        elif full_char == SHADDA and partial_char != SHADDA:
            # extra shadda in the fully vocalized word: skip it
            full_char = full_stack.pop()
        else:
            # shadda only in the partial word, or two different characters
            break

    if partial_stack.is_empty() and full_stack.is_empty():
        return True
    return False
def separate(word, extract_shadda=False):
    """
    Split an Arabic word into its letters and its marks.

    Every letter receives exactly one mark; a letter without haraka gets
    NOT_DEF_HARAKA. A shadda is treated as a letter: the mark of the letter
    before it is replaced by SUKUN and the shadda itself gets
    NOT_DEF_HARAKA. Harakat at the very beginning of the word are dropped.

    Example:
        >>> text = u"اَلْعَرَبَيَةُ"
        >>> araby.separate(text)
        (u'\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629', u'\\u064e\\u0652\\u064e\\u064e\\u064e\\u064e\\u064f')

    @param word: the input word
    @type word: unicode
    @param extract_shadda: also return the shadda positions as a separate
        text; in that case the shadda is removed from the returned letters
    @type extract_shadda: Boolean
    @return: (letters, marks), or (letters, marks, shaddaplaces) when
        extract_shadda is set
    @rtype: tuple of unicode
    """
    harakat = HARAKAT
    letter_stack = stack.Stack()
    mark_stack = stack.Stack()

    # Reverse the items so that pop() delivers the word in reading order.
    source = stack.Stack(word)
    source.items.reverse()
    current = source.pop()

    # A word cannot start with a haraka: skip any leading ones.
    while current in harakat:
        current = source.pop()

    while current is not None:
        if current == SHADDA:
            # The letter before the shadda takes a sukun, and the shadda
            # (kept as a letter) takes the undefined mark.
            mark_stack.pop()
            mark_stack.push(SUKUN)
            mark_stack.push(NOT_DEF_HARAKA)
            letter_stack.push(SHADDA)
        elif current in harakat:
            # Two harakat cannot sit side by side: the newest one replaces
            # the mark recorded for the current letter.
            mark_stack.pop()
            mark_stack.push(current)
        else:
            mark_stack.push(NOT_DEF_HARAKA)
            letter_stack.push(current)
        current = source.pop()

    letters_text = ''.join(letter_stack.items)
    marks_text = ''.join(mark_stack.items)
    if not extract_shadda:
        return (letters_text, marks_text)

    # Build the shadda map: every non-shadda character becomes a TATWEEL,
    # then each TATWEEL+SHADDA pair collapses into a single SHADDA.
    shadda_map = re.sub(r'[^%s]' % SHADDA, TATWEEL, letters_text)
    shadda_map = re.sub('%s%s' % (TATWEEL, SHADDA), SHADDA, shadda_map)
    return (strip_shadda(letters_text), marks_text, shadda_map)
def waznlike2(word1, wazn, extract_root=False):
    u"""Tell whether word1 follows the given wazn (morphological pattern).

    In the wazn, FEH, AIN and LAM act as generic placeholders that accept
    any non-haraka character of word1. Every other wazn character has to be
    matched by an identical character in word1. A haraka present on one
    side only is skipped, so both strings may be fully or partially
    vocalized.

    A wazn may contain AIN twice: the first AIN met during the walk records
    its letter as a root letter, and every further AIN must be matched by
    that same letter, which is not added to the root again.

    Example:
        >>> word1 = u"ضارب"
        >>> wazn = u"فَاعِل"
        >>> araby.waznlike2(word1, wazn)
        True

    @param word1: input word
    @type word1: unicode
    @param wazn: given word template (wazn)
    @type wazn: unicode
    @param extract_root: when true, return the collected root letters
        instead of True on a successful match
    @type extract_root: Boolean
    @return: False when the word does not fit the wazn; otherwise True, or
        the root letters when extract_root is set
    @rtype: Boolean or unicode
    """
    harakat = HARAKAT
    word_stack = stack.Stack(word1)
    wazn_stack = stack.Stack(wazn)
    root_stack = stack.Stack()
    word_char = word_stack.pop()
    wazn_char = wazn_stack.pop()
    # Letter bound to AIN so far; False until an AIN has been matched.
    ain_letter = False

    while word_char is not None and wazn_char is not None:
        word_is_haraka = word_char in harakat
        wazn_is_haraka = wazn_char in harakat
        wazn_is_ain = wazn_char == AIN

        # Decide which side(s) move on to their next character.
        step_word = False
        step_wazn = False
        if word_char == wazn_char and wazn_char not in (FEH, AIN, LAM):
            # identical literal characters
            step_word = step_wazn = True
        elif not word_is_haraka and wazn_char in (FEH, LAM):
            # FEH / LAM swallow one letter, which belongs to the root
            root_stack.push(word_char)
            step_word = step_wazn = True
        elif not word_is_haraka and wazn_is_ain and not ain_letter:
            # first AIN: remember its letter and add it to the root
            ain_letter = word_char
            root_stack.push(word_char)
            step_word = step_wazn = True
        elif (not word_is_haraka and wazn_is_ain and ain_letter
              and word_char == ain_letter):
            # repeated AIN: same letter required, not added to the root
            step_word = step_wazn = True
        elif word_is_haraka and not wazn_is_haraka:
            # haraka present only in the word: ignore it
            step_word = True
        elif not word_is_haraka and wazn_is_haraka:
            # haraka present only in the wazn: ignore it
            step_wazn = True
        else:
            break

        if step_word:
            word_char = word_stack.pop()
        if step_wazn:
            wazn_char = wazn_stack.pop()

    # Anything left on a stack, or still held in hand, means no match.
    if not word_stack.is_empty() or not wazn_stack.is_empty():
        return False
    if word_char is not None or wazn_char is not None:
        return False
    if extract_root:
        # root letters were collected in pop order, so put them back in
        # reading order
        return "".join(reversed(root_stack.items))
    return True