Exemple #1
0
def check_partial_vocalized(word_vocalised, resulted_data):
    """
    Filter analyzer results against a fully or partially vocalized input.

    When the input word carries no vocalization marks, every candidate
    analysis is kept.  Otherwise only candidates whose 'vocalized' form is
    vocalization-compatible with the input (per araby.vocalizedlike) are
    retained, each with the partial-vocalization tag appended to its tags.

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the analyses found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # An unvocalized input cannot constrain the candidates: keep them all.
    if not araby.is_vocalized(word_vocalised):
        return resulted_data
    kept = []
    for candidate in resulted_data:
        if 'vocalized' in candidate and araby.vocalizedlike(
                word_vocalised, candidate['vocalized']):
            candidate['tags'] += ':' + analex_const.partialVocalizedTag
            kept.append(candidate)
    return kept
Exemple #2
0
 def test_vocalized_similarity(self):
     """Test vocalized_similarity function ?"""
     # Two spellings of the same word; the second adds an explicit sukun.
     word1 = u"ضَربٌ"
     word2 = u"ضَرْبٌ"
     self.assertTrue(ar.vocalizedlike(word1, word2))
     # NOTE(review): vocalized_similarity appears to return a truthy value
     # for compatible words and a negative mismatch count otherwise —
     # TODO confirm against pyarabic's documentation.
     self.assertNotEqual(ar.vocalized_similarity(word1, word2), -2)
     self.assertTrue(ar.vocalized_similarity(word1, word2))
Exemple #3
0
def check_partial_vocalized(word_vocalised, resulted_data):
    """
    Restrict dictionary analyses to those matching a vocalized input word.

    An unvocalized input imposes no constraint, so all analyses are
    returned as-is.  For a (partially) vocalized input, only the analyses
    whose 'vocalized' form is compatible with it are returned, and each of
    them is marked with the partial-vocalization tag.

    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the analyses found in the dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    if not araby.is_vocalized(word_vocalised):
        # No diacritics on the input: nothing to filter on.
        return resulted_data
    # Select the compatible analyses, then tag each one.
    matching = [
        item for item in resulted_data
        if 'vocalized' in item
        and araby.vocalizedlike(word_vocalised, item['vocalized'])
    ]
    for item in matching:
        item['tags'] += ':' + analex_const.partialVocalizedTag
    return matching
Exemple #4
0
 def test_vocalized_similarity(self):
     """Test vocalized_similarity function ?"""
     # Same word with and without the explicit sukun mark.
     word1 = u"ضَربٌ"
     word2 = u"ضَرْبٌ"
     self.assertTrue(ar.vocalizedlike(word1, word2))
     # NOTE(review): presumably vocalized_similarity returns a negative
     # count on mismatch and a truthy value when compatible — verify.
     self.assertNotEqual(ar.vocalized_similarity(word1, word2), -2)
     self.assertTrue(ar.vocalized_similarity(word1, word2))
Exemple #5
0
def Comparetashkeel(text):
	"""Strip the tashkeel from *text*, re-vocalize it automatically, and
	measure how well the automatic vocalization matches the original.

	Python 2 code (print statements).

	@param text: correctly vocalized input text.
	@return: [vocalized_text, "correct:..%", "incorrect:..%", total].
	"""
	import tashkeel.tashkeel as ArabicVocalizer
	# the entred text is vocalized correctly
	correct_text=text;
	text=araby.stripTashkeel(text);
	vocalizer=ArabicVocalizer.TashkeelClass();
	vocalized_text=vocalizer.tashkeel(text);
	
	# compare voalized text with a correct text
	text1=correct_text;
	text2=vocalized_text;
	# remove collocations symboles
	text2=text2.replace("'","");
	text2=text2.replace("~","");
	
	#stemmer=tashaphyne.stemming.ArabicLightStemmer()
	list1=vocalizer.analyzer.tokenize(text1);
	list2=vocalizer.analyzer.tokenize(text2);
	print u":".join(list1).encode('utf8');
	print u":".join(list2).encode('utf8');
	correct=0;
	incorrect=0;
	total=len(list1);
	if len(list1)!=len(list2):
		# Token streams diverged: counts stay at zero in this case.
		print "lists haven't the same length";
	else:
		# Count tokens whose vocalization is compatible with the reference.
		for i in range(total):
			if araby.vocalizedlike(list1[i],list2[i]):
				correct+=1;
			else:
				incorrect+=1;
	
	result=[vocalized_text,"correct:%0.2f%%"%round(correct*100.00/total,2),"incorrect:%0.2f%%"%round(incorrect*100.00/total,2),total]
	return result#correct*100/total;
Exemple #6
0
def mot_except(word):
    """Detect whether the input word is an exceptional word in the database.

    Looks up ExceptionalWord rows whose unvoweled form equals the input
    stripped of its diacritics, and keeps those whose voweled form is
    vocalization-compatible with the input.

    @param word: input (possibly vocalized) Arabic word.
    @return: list of matching ExceptionalWord objects.
    """
    combs = []
    for me in ExceptionalWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(word)):
        # Bug fix: compare against the stored voweled form, not the model
        # instance itself — consistent with mot_outil and nom_propre, which
        # both pass `.voweled_form` to vocalizedlike.
        if araby.vocalizedlike(word, me.voweled_form):
            combs.append(me)
    return combs
Exemple #7
0
def mot_outil(word):
    """Detect whether the input word is a tool word according to the DB.

    Each segmentation of the word (via decoupage) is checked against the
    ToolWord table; compatible matches are returned together with the
    prefix/suffix of the segmentation that produced them.
    """
    results = []
    for comb in decoupage(word):
        base = comb['Base']
        candidates = ToolWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for tool_word in candidates:
            if not araby.vocalizedlike(base, tool_word.voweled_form):
                continue
            results.append({
                'tw_object': tool_word,
                'Préfixe': comb['Préfixe'],
                'Suffixe': comb['Suffixe'],
            })
    return results
Exemple #8
0
def nom_propre(word):
    """Detect whether the input word is a proper noun according to the DB.

    Each segmentation of the word (via decoupage) is checked against the
    ProperNoun table; compatible matches are returned along with the base,
    prefix and suffix of the segmentation that produced them.
    """
    matches = []
    for comb in decoupage(word):
        base = comb['Base']
        candidates = ProperNoun.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for noun in candidates:
            if not araby.vocalizedlike(base, noun.voweled_form):
                continue
            matches.append({
                'pn_object': noun,
                'Base': base,
                'Préfixe': comb['Préfixe'],
                'Suffixe': comb['Suffixe'],
            })
    return matches
Exemple #9
0
    def check_partial_vocalized(word_vocalised, resulted_data):
        """
        if the entred word is vocalized fully or partially,
        the analyzer return the vocalized like words
        This function treat the partial vocalized case.
        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the founded resulat from dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        filtred_data = []
        if not araby.is_vocalized(word_vocalised):
            # Unvocalized input imposes no constraint: keep all analyses.
            return resulted_data
        else:
            #compare the vocalized output with the vocalized input
            #print ' is vocalized'
            for item in resulted_data:
                if 'vocalized' in item:
                    output = item['vocalized']
                    is_verb = "Verb" in item['type']
                    if araby.vocalizedlike(word_vocalised, output):
                        item[
                            'tags'] += ':' + analex_const.PARTIAL_VOCALIZED_TAG
                        filtred_data.append(item)
                        # Special case of two adjacent sukuns in
                        # pre-vocalized text: the verb ends with a kasra
                        # instead of a sukun, so compare the words without
                        # their final mark.
                    elif is_verb and word_vocalised.endswith(
                            araby.KASRA) and output.endswith(araby.SUKUN):
                        if araby.vocalizedlike(word_vocalised[:-1],
                                               output[:-1]):
                            item[
                                'tags'] += ':' + analex_const.PARTIAL_VOCALIZED_TAG
                            filtred_data.append(item)

        return filtred_data
Exemple #10
0
def Comparetashkeel(text):
    """Strip the tashkeel from *text*, re-vocalize it automatically, and
    measure how well the automatic vocalization matches the original.

    Python 2 code (print statements).

    @param text: correctly vocalized input text.
    @return: [vocalized_text, "correct:..%", "incorrect:..%", total].
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entred text is vocalized correctly
    correct_text = text
    text = araby.stripTashkeel(text)
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_text = vocalizer.tashkeel(text)

    # compare voalized text with a correct text
    text1 = correct_text
    text2 = vocalized_text
    # remove collocations symboles
    text2 = text2.replace("'", "")
    text2 = text2.replace("~", "")

    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    list1 = vocalizer.analyzer.tokenize(text1)
    list2 = vocalizer.analyzer.tokenize(text2)
    print u":".join(list1).encode('utf8')
    print u":".join(list2).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        # Token streams diverged: counts stay at zero in this case.
        print "lists haven't the same length"
    else:
        # Count tokens whose vocalization is compatible with the reference.
        for i in range(total):
            if araby.vocalizedlike(list1[i], list2[i]):
                correct += 1
            else:
                incorrect += 1

    result = [
        vocalized_text,
        "correct:%0.2f%%" % round(correct * 100.00 / total, 2),
        "incorrect:%0.2f%%" % round(incorrect * 100.00 / total, 2), total
    ]
    return result  #correct*100/total;
Exemple #11
0
    def check_partial_vocalized(self, word_vocalised, resulted_data):
        """
        Keep only the analyses compatible with a vocalized input word.

        If the input carries no vocalization marks, all analyses are
        returned unchanged; otherwise each analysis whose 'vocalized'
        attribute is vocalization-compatible with the input is tagged as
        partially vocalized and retained.

        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the analyses found in the dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        if not araby.isVocalized(word_vocalised):
            # Nothing to filter on: an unvocalized input matches everything.
            return resulted_data
        filtered = []
        for entry in resulted_data:
            # Analyses are objects here; their fields live in __dict__.
            attrs = entry.__dict__
            if "vocalized" in attrs and araby.vocalizedlike(
                    word_vocalised, attrs["vocalized"]):
                attrs["tags"] += ":" + analex_const.partialVocalizedTag
                filtered.append(entry)
        return filtered
Exemple #12
0
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entred text is vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    cpath = os.path.join(os.path.dirname(__file__), '../tmp/')
    vocalizer = ArabicVocalizer.TashkeelClass(mycache_path=cpath)
    #~vocalized_text = vocalizer.tashkeel(text)
    #~ vocalizer.disable_cache()

    # Each suggestion entry is a dict read below via the keys 'chosen',
    # 'semi', 'inflect', 'link' and 'rule'.
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)

    # compare voalized text with a correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""

    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [text1, ]
    list1 =[]
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1)!= len(list2):
        # Tokenizations diverged: dump the pairs side by side and abort.
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print (u"'%s'\t'%s'"%(list1[i], list2[i].get('chosen',''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]
            wo1_strip = wo1
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % ( style, wo1, inflect, link, str(rule), wo2)

                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    # Words differ only in their final inflection mark.
                    style = 'diff-mark'
                else:
                    # if the last marks are equal
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                    or (bool(araby.is_haraka(wm1)) ^  bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % ( style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct*100.00/total, 2)
    per_incorrect = round(incorrect*100.00/total, 2)
    result = [displayed_html, "correct:%0.2f%%, incorrect:%0.2f%%"%(per_correct, per_incorrect)]
    return result#correct*100/total
# NOTE(review): truncated scrape fragment — the snippet header and the
# opening of the `word_list = [` literal are missing; the first lines here
# are the tail of that list.
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
# Demo loop: run each pyarabic helper on every word; `word1` keeps the
# previous word so consecutive entries can be compared with vocalizedlike.
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
    if araby.is_arabicword(word): print(' is valid word', end=" ")
    else: print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
# Second pass over the refreshed word list (fragment ends mid-loop).
for word in word_list:
    print(word, '\t', end=" ")
Exemple #14
0
 def test_vocalizedlike(self):
     """Test vocalizedlike function ?"""
     # Same word with and without the explicit sukun: must be compatible.
     word1 = u"ضَربٌ"
     word2 = u"ضَرْبٌ"
     self.assertTrue(ar.vocalizedlike(word1, word2))
Exemple #15
0
def decoupage(word):
    """Split the input word into (prefixes, stem, suffixes).

    Returns a list of dictionaries holding every syntactically valid
    combination, based on the compatibility between the detected prefixes
    and suffixes and on the length of the resulting stem.
    """
    word_unvocalized = araby.strip_diacritics(word)
    # The "" sentinels stand for "no prefix" / "no suffix", so the cross
    # product below always includes the bare-word combination.
    prefixes, suffixes = [""], [""]
    combinaisons_possibles = []
    for p in Prefixe.objects.all():
        if word_unvocalized.startswith(p.unvoweled_form):
            # print("p:"+p.unvoweled_form)
            if araby.is_vocalized(word):
                # A vocalized word must also match the prefix's voweled form.
                if araby.vocalizedlike(word[:len(p.voweled_form)],
                                       p.voweled_form):
                    prefixes.append(p)
            else:
                prefixes.append(p)
    for s in Suffixe.objects.all():
        if word_unvocalized.endswith(s.unvoweled_form):
            if araby.is_vocalized(word):
                # Same check on the word's tail for the suffix.
                if araby.vocalizedlike(word[-len(s.voweled_form):],
                                       s.voweled_form):
                    suffixes.append(s)
            else:
                suffixes.append(s)

    for pr in prefixes:
        for sf in suffixes:
            # Validation criteria
            if pr != "" and sf != "":
                # Reject stems of fewer than 3 or more than 9 letters.
                if (len(word_unvocalized) - len(pr.unvoweled_form) - len(sf.unvoweled_form)) <= 2 or \
                    (len(word_unvocalized) - len(pr.unvoweled_form) - len(sf.unvoweled_form)) > 9:
                    continue
                # Reject noun/verb class clashes between prefix and suffix.
                if ((pr.classe[0] == 'N' and sf.classe[0] == 'V')
                        or (pr.classe[0] == 'V' and sf.classe[0] == 'N')
                        or (pr.classe in ['N1', 'N2', 'N3', 'N5'])):
                    continue
            # Reaching here: the prefix is compatible with the suffix and
            # the stem length is acceptable.
            base = word
            # Strip the prefix from the stem while keeping the diacritics.
            if pr:
                for char in pr.unvoweled_form:
                    while char != base[0]:
                        base = base[1:]
                    base = base[1:]
                while araby.is_tashkeel(base[0]):
                    base = base[1:]

            # Strip the suffix from the stem while keeping the diacritics.
            if sf:
                r_sf = [c for c in sf.unvoweled_form]
                r_sf.reverse()
                for char in r_sf:
                    base = base[:base.rindex(char)]

            combinaisons_possibles.append({
                'Base': base,
                'Préfixe': pr,
                'Suffixe': sf
            })

    return combinaisons_possibles
Exemple #16
0
    def test_vocalizedlike(self):
        """Two vocalizations of the same word must be vocalized-like."""
        partially_vowelled = u"ضَربٌ"
        fully_vowelled = u"ضَرْبٌ"
        self.assertTrue(
            Araby.vocalizedlike(partially_vowelled, fully_vowelled))
Exemple #17
0
from pyarabic.unshape import unshaping_line
import arabic_reshaper
from pyarabic import araby

# Read both files once, with context managers so the handles are closed.
# (The original leaked `f` and re-opened/leaked `right.txt` on every
# iteration of the outer loop, re-reading identical content each time.)
with open('data.txt', 'r') as f:
    lignes = f.readlines()
with open('right.txt', 'r') as f2:
    rights = f2.readlines()

# Sanity check: letters joined without spaces match the plain word.
print(araby.vocalizedlike('ب ر ي ت'.replace(' ', ''), 'بريت'))
for ligne in lignes:
    print(rights)
    for right in rights:
        # Compare the de-shaped, space-stripped line against each
        # de-shaped reference line.
        if araby.vocalizedlike(unshaping_line(ligne).replace(' ', ''),
                               unshaping_line(right)):
            print(unshaping_line(right).encode('utf8'))
            print('بريت')
Exemple #18
0
def compare_tashkeel(text):
    """
    Compare tashkeel between vocalized text and automatic vocalized text
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entred text is vocalized correctly
    correct_text = text.strip()
    text = araby.strip_tashkeel(text.strip())
    vocalizer = ArabicVocalizer.TashkeelClass()
    #~vocalized_text = vocalizer.tashkeel(text)
    # Each suggestion entry is a dict read below via the keys 'chosen',
    # 'semi', 'inflect', 'link' and 'rule'.
    vocalized_dict = vocalizer.tashkeel_ouput_html_suggest(text)

    # compare voalized text with a correct text
    text1 = correct_text
    #~text2 = vocalized_text
    displayed_html = u""

    #stemmer=tashaphyne.stemming.ArabicLightStemmer()
    #~texts = vocalizer.analyzer.split_into_phrases(text1)
    texts = [
        text1,
    ]
    list1 = []
    for txt in texts:
        list1 += vocalizer.analyzer.tokenize(txt)
    list2 = vocalized_dict
    print u"\t".join(list1).encode('utf8')
    correct = 0
    incorrect = 0
    total = len(list1)
    if len(list1) != len(list2):
        # Tokenizations diverged: dump the pairs side by side and abort.
        print "lists haven't the same length", len(list1), len(list2)
        for i in range(min(len(list1), len(list2))):
            print(u"'%s'\t'%s'" %
                  (list1[i], list2[i].get('chosen', ''))).encode("utf8")
        sys.exit()
    else:
        for i in range(total):
            wo1 = list1[i]
            wo1_strip = wo1
            wo2 = list2[i]['chosen']
            wo2_strip = list2[i]['semi']  # words without inflection mark
            inflect = list2[i]['inflect']
            link = list2[i]['link']
            rule = list2[i]['rule']
            style = "diff"
            #~if araby.is_vocalized(wo2) and araby.vocalizedlike(wo1, wo2):
            if araby.vocalizedlike(wo1, wo2):
                if wo2 == "\n":
                    wo2 = "<br/>"
                #~displayed_html += u" " + wo2
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)

                correct += 1
            else:
                incorrect += 1
                # green for last mark difference
                wo1_strip = wo1
                #~wo2_strip = araby.strip_lastharaka(wo2)
                if araby.vocalizedlike(wo1_strip, wo2_strip):
                    # Words differ only in their final inflection mark.
                    style = 'diff-mark'
                else:
                    # if the last marks are equal
                    wm1 = wo1[-1:]
                    wm2 = wo2[-1:]
                    if (araby.is_haraka(wm1) and araby.is_haraka(wm2) and wm1 == wm2) \
                    or (bool(araby.is_haraka(wm1)) ^  bool(araby.is_haraka(wm2))):
                        style = "diff-word"
                    else:
                        style = 'diff-all'
                displayed_html += u" <span id='diff'  class='%s' original='%s' inflect='%s' link='%s' rule='%s'>%s</span>" % (
                    style, wo1, inflect, link, str(rule), wo2)
    per_correct = round(correct * 100.00 / total, 2)
    per_incorrect = round(incorrect * 100.00 / total, 2)
    result = [
        displayed_html,
        "correct:%0.2f%%, incorrect:%0.2f%%" % (per_correct, per_incorrect)
    ]
    return result  #correct*100/total
Exemple #19
0
# NOTE(review): truncated scrape fragment — these first lines are the tail
# of a `word_list = [` literal whose opening is missing.
"Taha",
]
word1=u""
# Demo loop (Python 2 print statements): run each pyarabic helper on every
# word; `word1` keeps the previous word for the vocalizedlike comparison.
for word in word_list:
    print word.encode('utf8'),'\t',
    if araby.is_vocalized(word): print ' is vocalized',
##    if araby.isArabicstring(word): print ' iisArabicstring',
##    else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print ' is vocalized text',
    if araby.is_arabicword(word): print ' is valid word',
    else: print "invalid arabic word",
    print ' strip harakat', araby.strip_harakat(word).encode('utf8'),
    print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'),
    print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'),
    print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'),
    if araby.vocalizedlike(word, word1): print "vocalized_like",
    print;
    word1=word;
if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like",
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
word1=u""
# Second pass over the refreshed word list (fragment ends mid-loop).
for word in word_list:
    print word.encode('utf8'),'\t',
    if araby.is_vocalized(word): print ' is vocalized',
Exemple #20
0
# NOTE(review): truncated scrape fragment — these first lines are the tail
# of a `word_list = [` literal whose opening is missing.
"Taha",
]
word1=""
# Demo loop: run each pyarabic helper on every word; `word1` keeps the
# previous word for the vocalizedlike comparison.
for word in word_list:
    print(word,'\t')
    if araby.is_vocalized(word): print(' is vocalized')
##    if araby.isArabicstring(word): print ' iisArabicstring',
##    else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print(' is vocalized text')
    if araby.is_arabicword(word): print(' is valid word')
    else: print("invalid arabic word")
    print(' strip harakat', araby.strip_harakat(word))
    print(' strip tashkeel', araby.strip_tashkeel(word))
    print(' strip tatweel',araby.strip_tatweel(word))
    print(' normalize ligature ', araby.normalize_ligature(word))
    if araby.vocalizedlike(word, word1): print("vocalized_like")
    print();
    word1=word;
if araby.vocalizedlike("العربية","العرَبية"): print("vocalized_like")
word="الْعَرَيِيّةُ"
word_list=[
"الْعَرَيِيّةُ",
"العربية",
"الْعَرَيِيّةُ الفصحى",
"غير مشكول",
"Taha",
]
word1=""
# Second pass over the refreshed word list (fragment ends mid-loop).
for word in word_list:
    print(word,'\t')
    if araby.is_vocalized(word): print(' is vocalized')
Exemple #21
0
# NOTE(review): truncated scrape fragment — this `]` closes a `word_list`
# literal whose opening is missing from the scrape.
]
word1 = u""
# Demo loop (Python 2 print statements): run each pyarabic helper on every
# word; `word1` keeps the previous word for the vocalizedlike comparison.
for word in word_list:
    print word.encode('utf8'), '\t',
    if araby.is_vocalized(word): print ' is vocalized',
    ##    if araby.isArabicstring(word): print ' iisArabicstring',
    ##    else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print ' is vocalized text',
    if araby.is_arabicword(word): print ' is valid word',
    else: print "invalid arabic word",
    print ' strip harakat', araby.strip_harakat(word).encode('utf8'),
    print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'),
    print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'),
    print ' normalize ligature ', araby.normalize_ligature(word).encode(
        'utf8'),
    if araby.vocalizedlike(word, word1): print "vocalized_like",
    print
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like",
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
# Second pass over the refreshed word list (fragment ends mid-loop).
for word in word_list:
    print word.encode('utf8'), '\t',
    if araby.is_vocalized(word): print ' is vocalized',
Exemple #22
0
 def test_vocalizedlike(self):
     """Test vocalizedlike function ?"""
     # Same word with and without the explicit sukun: must be compatible.
     word1 = u"ضَربٌ"
     word2 = u"ضَرْبٌ"
     self.assertTrue(ar.vocalizedlike(word1, word2))