Esempio n. 1
0
    def waznlike2(word1, wazn, extract_root=False):
        u"""Check whether a word fits a morphological pattern (wazn).

        Both strings are scanned from their last character backwards.
        The pattern letters FEH, AIN and LAM act as wildcards: each one
        accepts any character of the word that is not a haraka, and the
        accepted character is collected as a root letter.  Every other
        pattern character has to be matched by an identical character in
        the word.  When exactly one of the two current characters is a
        haraka it is skipped, so both strings may be fully or partially
        vocalized.

        Example:
            >>> word1 = u"ضارب"
            >>> wazn = u"فَاعِل"
            >>> waznlike2(word1, wazn)
            True

        @param word1: input word
        @type word1: unicode
        @param wazn: pattern (وزن) the word is compared against
        @type wazn: unicode
        @param extract_root: when true, return the collected root letters
            instead of True on a successful match
        @type extract_root: Boolean
        @return: False when the word does not fit the pattern; otherwise
            True, or the root letters (possibly an empty string) when
            extract_root is set
        @rtype: Boolean / string
        """
        placeholders = (FEH, AIN, LAM)
        word_chars = stack.Stack(word1)
        wazn_chars = stack.Stack(wazn)
        root = stack.Stack()
        cur_word = word_chars.pop()
        cur_wazn = wazn_chars.pop()
        # pop() hands back None once a stack is exhausted.
        while cur_word is not None and cur_wazn is not None:
            word_is_mark = cur_word in HARAKAT
            wazn_is_mark = cur_wazn in HARAKAT
            wazn_is_slot = cur_wazn in placeholders
            if cur_word == cur_wazn and not wazn_is_slot:
                # literal character of the pattern found in the word
                step_word = step_wazn = True
            elif wazn_is_slot and not word_is_mark:
                # wildcard slot: the word's letter belongs to the root
                root.push(cur_word)
                step_word = step_wazn = True
            elif word_is_mark != wazn_is_mark:
                # only one side carries a haraka here: skip that haraka
                step_word, step_wazn = word_is_mark, wazn_is_mark
            else:
                break
            if step_word:
                cur_word = word_chars.pop()
            if step_wazn:
                cur_wazn = wazn_chars.pop()

        # A match requires both strings to be consumed entirely, including
        # the characters currently held in the two cursors.
        fully_consumed = (word_chars.is_empty() and wazn_chars.is_empty()
                          and cur_word is None and cur_wazn is None)
        if not fully_consumed:
            return False
        if not extract_root:
            return True
        # Root letters were gathered back to front; restore reading order.
        root.items.reverse()
        return "".join(root.items)
Esempio n. 2
0
def joint(letters, marks):
    """Rebuild a word by attaching each mark to its letter.

    The two sequences are walked in parallel: every letter is emitted,
    followed by its mark unless that mark is NOT_DEF_HARAKA.  When the
    letter is a SHADDA, a haraka emitted just before it (the mark of the
    preceding letter) is dropped so that the shadda directly follows that
    letter.

    Example:
        >>> letters = u"العربية"
        >>> marks   = u'\\u064e\\u0652\\u064e\\u064e\\u064e\\u064e\\u064f'
        >>> word = araby.joint(letters, marks)
        >>> print word.encode('utf8')
        اَلْعَرَبَيَةُ

    @param letters: the word letters
    @type letters: unicode
    @param marks: the word marks, one per letter
    @type marks: unicode
    @return: the joined word, or an empty string when letters and marks
        do not have the same length
    @rtype: unicode
    """
    # One mark per letter is required.
    if len(letters) != len(marks):
        return ""

    pieces = []
    for letter, mark in zip(letters, marks):
        # A shadda replaces the haraka of the letter before it.  The
        # emptiness guard covers input that (incorrectly) starts with a
        # shadda, which previously ended in a TypeError.
        if letter == SHADDA and pieces and pieces[-1] in HARAKAT:
            pieces.pop()
        pieces.append(letter)
        if mark != NOT_DEF_HARAKA:
            pieces.append(mark)
    return ''.join(pieces)
Esempio n. 3
0
def vocalized_similarity(word1, word2):
    """
    Compare two words that may be fully or partially vocalized.

    Both words are scanned from their last character backwards:
      - identical characters are consumed together;
      - when exactly one of the two characters is a haraka, that haraka is
        skipped (the other word is simply less vocalized at this point);
      - otherwise a SHADDA on either side is skipped without penalty, and
        any other pair is consumed and counted as one error.
    The scan stops as soon as one word is exhausted; characters still left
    in the other word are not examined.

    Example:
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٌ"
        >>> araby.vocalized_similarity(word1, word2)
        True
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٍ"
        >>> araby.vocalized_similarity(word1, word2)
        -1

    @param word1: first word
    @type word1: unicode
    @param word2: second word
    @type word2: unicode
    @return: True when no error was counted, else the negated error count
    @rtype: Boolean / int
    """
    chars1 = stack.Stack(word1)
    chars2 = stack.Stack(word2)
    cur1 = chars1.pop()
    cur2 = chars2.pop()
    mismatches = 0
    # pop() hands back None once a stack is exhausted.
    while not (cur1 is None or cur2 is None):
        is_mark1 = cur1 in HARAKAT
        is_mark2 = cur2 in HARAKAT
        if cur1 == cur2:
            step1 = step2 = True
        elif is_mark1 != is_mark2:
            # advance only on the side holding the extra haraka
            step1, step2 = is_mark1, is_mark2
        elif cur1 == SHADDA:
            step1, step2 = True, False
        elif cur2 == SHADDA:
            step1, step2 = False, True
        else:
            # two different letters, or two different harakat
            step1 = step2 = True
            mismatches += 1
        if step1:
            cur1 = chars1.pop()
        if step2:
            cur2 = chars2.pop()

    return -mismatches if mismatches > 0 else True
Esempio n. 4
0
def shaddalike(partial, fully):
    """
    Check that every shadda of a partially vocalized word is found at the
    same place in a fully vocalized word.

    If the partial word has no shadda there is nothing to check and the
    result is True.  If it has one while the full word has none, the
    result is False.  Otherwise the harakat are stripped from both words
    and they are compared from the last character backwards: characters
    must be identical, except that the full word may carry additional
    shaddas that the partial word lacks.

    Example:
        >>> word1 = u"ردّ"
        >>> word2=u"ردَّ"
        >>> araby.shaddalike(word1, word2)
        True

    @param partial: the partially vocalized word
    @type partial: unicode
    @param fully: the fully vocalized word
    @type fully: unicode
    @return: True when the shaddas of partial are compatible with fully
    @rtype: Boolean
    """
    # The input has no shadda: nothing to look for.
    if not has_shadda(partial):
        return True
    # The input has a shadda but the candidate has none: mismatch.
    if not has_shadda(fully):
        return False

    # Both words carry a shadda: verify the positions on the forms
    # without harakat.
    pstack = stack.Stack(strip_harakat(partial))
    vstack = stack.Stack(strip_harakat(fully))
    plast = pstack.pop()
    vlast = vstack.pop()
    # pop() hands back None once a stack is exhausted.
    while plast is not None and vlast is not None:
        if plast == vlast:
            plast = pstack.pop()
            vlast = vstack.pop()
        elif vlast == SHADDA:
            # Extra shadda in the full word only (plast differs from
            # vlast here, so it is not a shadda): skip it.
            vlast = vstack.pop()
        else:
            # Shadda present only in the partial word, or two different
            # letters: the words are not compatible.
            break
    # Success requires both words to be consumed entirely.  Checking the
    # cursors (not just the stacks) also rejects a mismatch detected on
    # the very last pair and a word that ran out before the other one.
    return plast is None and vlast is None
Esempio n. 5
0
def separate(word, extract_shadda=False):
    """
    Split an Arabic word into its letters and the marks carried by them.

    Harakat found before the first letter are ignored.  Every letter gets
    exactly one mark: the haraka that follows it (the last one when
    several follow each other) or NOT_DEF_HARAKA when it has none.  A
    SHADDA is kept among the letters: the mark of the letter before it is
    replaced by SUKUN and the shadda receives a mark of its own.

    Example:
        >>> text = u"اَلْعَرَبَيَةُ"
        >>> araby.separate(text)
        (u'\\u0627\\u0644\\u0639\\u0631\\u0628\\u064a\\u0629', u'\\u064e\\u0652\\u064e\\u064e\\u064e\\u064e\\u064f')

    @param word: the input word
    @type word: unicode
    @param extract_shadda: also return the shadda positions as a separate
        string and remove the shaddas from the returned letters
    @type extract_shadda: Boolean
    @return: (letters, marks), or (letters without shadda, marks,
        shadda places) when extract_shadda is set; the shadda places
        string is derived from the letters by turning every non-shadda
        character into TATWEEL and collapsing each TATWEEL + SHADDA pair
        into a single SHADDA
    @rtype: tuple of unicode
    """
    pending = stack.Stack(word)
    # Reverse so that pop() delivers the characters in reading order.
    pending.items.reverse()
    letter_stack = stack.Stack()
    mark_stack = stack.Stack()

    current = pending.pop()
    # A word cannot begin with a haraka: drop any leading ones.
    while current in HARAKAT:
        current = pending.pop()

    while current is not None:
        if current in HARAKAT:
            # Replace the mark recorded for the preceding letter; two
            # harakat in a row keep only the later one.
            mark_stack.pop()
            mark_stack.push(current)
        else:
            if current == SHADDA:
                # The letter before a shadda is given a sukun; the shadda
                # itself is then handled like any other letter.
                mark_stack.pop()
                mark_stack.push(SUKUN)
            mark_stack.push(NOT_DEF_HARAKA)
            letter_stack.push(current)
        current = pending.pop()

    letters_text = ''.join(letter_stack.items)
    marks_text = ''.join(mark_stack.items)
    if not extract_shadda:
        return (letters_text, marks_text)

    # Build the shadda map from the letters (shadda still included).
    shadda_places = re.sub(r'[^%s]' % SHADDA, TATWEEL, letters_text)
    shadda_places = re.sub('%s%s' % (TATWEEL, SHADDA), SHADDA, shadda_places)
    return (strip_shadda(letters_text), marks_text, shadda_places)
    def waznlike2(word1, wazn, extract_root=False):
        u"""Check whether a word fits a morphological pattern (wazn).

        Both strings are scanned from their last character backwards.
        FEH and LAM in the pattern accept any character of the word that
        is not a haraka and add it to the root.  The first AIN met during
        the scan does the same and its letter is remembered; any further
        AIN only accepts that same letter and adds nothing to the root
        (patterns with a doubled AIN).  Every other pattern character has
        to be matched by an identical character in the word.  When exactly
        one of the two current characters is a haraka it is skipped, so
        both strings may be fully or partially vocalized.

        Example:
            >>> word1 = u"ضارب"
            >>> wazn = u"فَاعِل"
            >>> waznlike2(word1, wazn)
            True

        @param word1: input word
        @type word1: unicode
        @param wazn: pattern (وزن) the word is compared against
        @type wazn: unicode
        @param extract_root: when true, return the collected root letters
            instead of True on a successful match
        @type extract_root: Boolean
        @return: False when the word does not fit the pattern; otherwise
            True, or the root letters (possibly an empty string) when
            extract_root is set
        @rtype: Boolean / string
        """
        word_chars = stack.Stack(word1)
        wazn_chars = stack.Stack(wazn)
        root = stack.Stack()
        cur_word = word_chars.pop()
        cur_wazn = wazn_chars.pop()
        # Letter matched by the first AIN seen; None until one is found.
        ain_letter = None
        # pop() hands back None once a stack is exhausted.
        while cur_word is not None and cur_wazn is not None:
            word_has_letter = cur_word not in HARAKAT
            if cur_word == cur_wazn and cur_wazn not in (FEH, AIN, LAM):
                # literal character of the pattern found in the word
                pass
            elif word_has_letter and cur_wazn in (FEH, LAM):
                root.push(cur_word)
            elif word_has_letter and cur_wazn == AIN and ain_letter is None:
                # first AIN: remember its letter for later AIN slots
                ain_letter = cur_word
                root.push(cur_word)
            elif word_has_letter and cur_wazn == AIN and cur_word == ain_letter:
                # repeated AIN with the same letter: not added to the root
                pass
            elif not word_has_letter and cur_wazn not in HARAKAT:
                # haraka only in the word: skip it
                cur_word = word_chars.pop()
                continue
            elif word_has_letter and cur_wazn in HARAKAT:
                # haraka only in the pattern: skip it
                cur_wazn = wazn_chars.pop()
                continue
            else:
                break
            # every accepting branch above consumes one character per side
            cur_word = word_chars.pop()
            cur_wazn = wazn_chars.pop()

        # A match requires both strings to be consumed entirely, including
        # the characters currently held in the two cursors.
        if not (word_chars.is_empty() and wazn_chars.is_empty()):
            return False
        if cur_word is not None or cur_wazn is not None:
            return False
        if not extract_root:
            return True
        # Root letters were gathered back to front; restore reading order.
        root.items.reverse()
        return "".join(root.items)