def detectNumberWords(text):
    """
    Report number phrases whose generated vocalization disagrees with the text.

    Each context entry produced by extractNumberPhrasesWithinContext is read
    as (previous word, number phrase, next word).  The phrase is converted to
    its numeric value, re-vocalized, and compared with the original phrase
    through araby.vocalizedSimilarity.  When that similarity is negative a
    tab-separated diagnostic line is printed.
    @param text: input text
    @type text: unicode
    @return: nothing; diagnostics are written to standard output
    @rtype: None
    """
    for context in extractNumberPhrasesWithinContext(text):
        # Entries with fewer than three items do not carry the full context.
        if len(context) < 3:
            continue
        previous, phrase, following = context[0], context[1], context[2]
        numeric = text2number(phrase)
        tags = getPreviousTag(previous)
        vocalized = vocalizeNumber(araby.stripTashkeel(phrase).split(' '), tags)
        # Similarity between the original phrase and its generated vocalization.
        # NOTE(review): vocalized may be a list of words here -- confirm that
        # araby.vocalizedSimilarity accepts it.
        sim = araby.vocalizedSimilarity(phrase, vocalized)
        vocUnit = vocalizeUnit(numeric, following)
        simUnit = araby.vocalizedSimilarity(vocUnit, following)
        if sim < 0:
            # join() only accepts strings, so a list of vocalized words is
            # flattened for display.
            if isinstance(vocalized, (list, tuple)):
                shown = u' '.join(vocalized)
            else:
                shown = vocalized
            line = u'\t'.join([
                str(sim), phrase, shown, str(numeric),
                u' '.join([previous, phrase, following]),
                following, vocUnit, str(simUnit),
            ])
            # Python 2 needs explicit bytes to print non-ASCII text safely;
            # on Python 3 printing bytes would show a b'...' repr instead.
            if str is bytes:
                line = line.encode('utf8')
            print(line)
Beispiel #2
0
def gwords( text ):
    ''' (string) -> int
    Count the distinct gword forms (variants, not occurrences) found in text.

    >>> gwords( '' )
    0
    >>> gwords( ' abc ' )
    0
    >>> gwords( TEST_FIXTURES['gwords'][0] + ' ' + TEST_FIXTURES['gwords'][1] )
    2
    >>> gwords( "%s %s %s" % (TEST_FIXTURES['gwords'][0],\
        TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    2
    >>> gwords( "%s%s %s" % (TEST_FIXTURES['gwords'][0],\
        TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    1
    >>> gwords( "%s%s %s %s" % (TEST_FIXTURES['gwords'][0],\
        araby.DAMMA, TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    2
    >>> gwords( "%s%s %s%s %s" % (TEST_FIXTURES['gwords'][0],\
        araby.DAMMA, 'abc', TEST_FIXTURES['gwords'][1], TEST_FIXTURES['gwords'][1]) )
    2
    '''
    # Diacritics are stripped first, then the text is split on whitespace and
    # de-duplicated so that repeated words are counted once.
    bare_text = araby.stripTashkeel( text )
    distinct_words = set( bare_text.split() )
    # Only the words that are also members of GWORDS_FORMS are counted.
    matched_forms = distinct_words & GWORDS_FORMS
    return len( matched_forms )
def text2number(text):
    """
    Turn an Arabic number phrase into its integer value,
    for example تسعة وعشرون  => 29.
    @param text: input text
    @type text: unicode
    @return : number extracted from text
    @rtype: integer
    >>> text2number(u"خمسمئة وثلاث وعشرون");
    523
    """
    one = u'واحد'
    waw = u'و'
    attached_letters = (u'و', u'ف', u'ل', u'ب', u'ك')
    # grand_total collects the groups already closed by a multiple of 1000;
    # group_sum is the running value of the group still being read.
    grand_total = 0
    group_sum = 0
    for token in araby.stripTashkeel(text).split(u' '):
        # Drop one attached leading letter, then one more leading waw if any;
        # the word for "one" itself starts with waw and is left alone.
        if token and token != one and token[0] in attached_letters:
            token = token[1:]
        if token != one and token.startswith(waw):
            token = token[1:]

        if token not in NumberWords:
            continue
        value = NumberWords[token]
        if value % 1000 == 0:
            # A multiplier with nothing before it counts once.
            grand_total += (group_sum or 1) * value
            group_sum = 0
        else:
            group_sum += value
    # The last open group is added as is.
    return grand_total + group_sum
Beispiel #4
0
def detectNumberWords(text):
    """
    Report number phrases whose generated vocalization disagrees with the text.

    Each context entry produced by extractNumberPhrasesWithinContext is read
    as (previous word, number phrase, next word).  The phrase is converted to
    its numeric value, re-vocalized, and compared with the original phrase
    through araby.vocalizedSimilarity.  When that similarity is negative a
    tab-separated diagnostic line is printed.
    @param text: input text
    @type text: unicode
    @return: nothing; diagnostics are written to standard output
    @rtype: None
    """
    for phCon in extractNumberPhrasesWithinContext(text):
        if len(phCon) >= 3:
            previous = phCon[0]
            numberedwords = phCon[1]
            nextword = phCon[2]
            numeric = text2number(numberedwords)
            tags = getPreviousTag(previous)
            vocalized = vocalizeNumber(
                araby.stripTashkeel(numberedwords).split(' '), tags)
            # Similarity between the phrase and its generated vocalization.
            # NOTE(review): vocalized may be a list of words here -- confirm
            # that araby.vocalizedSimilarity accepts it.
            sim = araby.vocalizedSimilarity(numberedwords, vocalized)
            vocUnit = vocalizeUnit(numeric, nextword)
            simUnit = araby.vocalizedSimilarity(vocUnit, nextword)
            if sim < 0:
                # join() only accepts strings, so a list of vocalized words
                # is flattened for display.
                if isinstance(vocalized, (list, tuple)):
                    vocalized_text = u' '.join(vocalized)
                else:
                    vocalized_text = vocalized
                fields = [
                    str(sim), numberedwords, vocalized_text, str(numeric),
                    u' '.join([previous, numberedwords, nextword]),
                    nextword, vocUnit, str(simUnit),
                ]
                report = u'\t'.join(fields)
                # Python 2 needs explicit bytes to print non-ASCII text
                # safely; Python 3 would print a b'...' repr for bytes.
                if str is bytes:
                    report = report.encode('utf8')
                # The parenthesised single-argument form is valid both as a
                # Python 2 print statement and as a Python 3 function call.
                print(report)
Beispiel #5
0
def text2number(text):
    """
    Convert arabic text into number, for example convert تسعة وعشرون  =>29.

    Diacritics are stripped, the text is split on single spaces, and every
    word found in NumberWords contributes to the result.  Words that are not
    in NumberWords are ignored.
    @param text: input text
    @type text: unicode
    @return : number extracted from text
    @rtype: integer
    >>> text2number(u"خمسمئة وثلاث وعشرون");
    523
    """
    # total holds the groups already closed by a multiple of 1000.
    total = 0
    # partial holds the value of the group currently being read.
    partial = 0
    words = araby.stripTashkeel(text).split(u' ')
    for word in words:
        # Remove one attached leading letter, except on the word for "one",
        # which itself starts with waw.
        if word and word != u'واحد' and word[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            word = word[1:]
        # A conjunction waw may still remain after the first removal.
        if word != u'واحد' and word.startswith(u'و'):
            word = word[1:]

        # dict.has_key() no longer exists on Python 3; the "in" operator is
        # equivalent and works on both major versions.
        if word in NumberWords:
            actualnumber = NumberWords[word]
            if actualnumber % 1000 == 0:
                # Multiplier such as a thousand or a million: with nothing
                # before it, it counts once.
                if partial == 0:
                    partial = 1
                total += partial * actualnumber
                # Start a new group.
                partial = 0
            else:
                partial += actualnumber
    # Add the last open group.
    total += partial
    return total
Beispiel #6
0
def detectNumberPhrasesPosition(wordlist):
    """
    Locate runs of number words in a word list.

    A run starts at the first word whose key is in NumberWords and is closed
    by the first following word whose key is not.  For a tokenized sentence
    such as "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا" the
    expected result is [(1, 3), (6, 7)], assuming NumberWords holds those
    number words.
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return : list of numbers clause positions [(start,end),(start2,end2),]
    @rtype: list of tuple
    """
    prefix_letters = (u'و', u'ف', u'ل', u'ب', u'ك')
    # These words are only accepted when the next word is one of ten_words.
    compound_only = (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', u'اثنتا')
    ten_words = (u'عشر', u'عشرة')

    phrases = []
    startNumber = -1  # -1 means that no phrase is currently open
    endNumber = -1
    for i, word in enumerate(wordlist):
        if i + 1 < len(wordlist):
            following = araby.stripTashkeel(wordlist[i + 1])
        else:
            following = None
        # Lookups use the word without its diacritics.
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # The first word of a phrase can carry an attached prefix letter.
        # The "no open phrase" state is -1, so it must be tested with < 0;
        # a truthiness test would only match a phrase that began at index 0.
        if (word_nm and startNumber < 0 and word_nm != u'واحد'
                and word_nm[0] in prefix_letters):
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        # "in" replaces dict.has_key(), which is gone on Python 3.
        if key in NumberWords:
            if key not in compound_only or following in ten_words:
                if startNumber < 0:
                    startNumber = i
                endNumber = i
        else:
            # A non-number word closes the phrase that was open, if any.
            if startNumber >= 0:
                phrases.append((startNumber, endNumber))
            startNumber = -1
    # A phrase still open at the end of the list is recorded too.
    if startNumber >= 0:
        phrases.append((startNumber, endNumber))
    return phrases
Beispiel #7
0
def detectNumberPhrasesPosition(wordlist):
    """
    Detect number words in a word list and return the positions of each phrase.

    Words are compared without diacritics.  A phrase opens on the first word
    whose key is found in NumberWords and closes on the next word whose key
    is not.  For the tokenized sentence
    "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا" the expected
    result is [(1, 3), (6, 7)], assuming NumberWords holds those number words.
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return : list of numbers clause positions [(start,end),(start2,end2),]
    @rtype: list of tuple
    """
    PREFIXES = (u'و', u'ف', u'ل', u'ب', u'ك')
    # Accepted as number words only when followed by one of TEN_FORMS.
    NEEDS_TEN = (u'أحد', u'إحدى', u'اثنا', u'اثني', u'اثنتي', u'اثنتا')
    TEN_FORMS = (u'عشر', u'عشرة')

    phrases = []
    start = -1  # index of the first word of the open phrase, -1 when none
    end = -1
    count = len(wordlist)
    for i in range(count):
        nextword = araby.stripTashkeel(wordlist[i + 1]) if i + 1 < count else None
        word_nm = araby.stripTashkeel(wordlist[i])
        key = word_nm
        # Only the first word of a phrase may carry any of the prefix
        # letters.  "start < 0" is the correct test for "no open phrase":
        # the sentinel -1 is truthy, so "not start" was true only for 0.
        if word_nm and start < 0 and word_nm != u'واحد' and word_nm[0] in PREFIXES:
            key = word_nm[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]

        # dict.has_key() was removed in Python 3; use the "in" operator.
        if key not in NumberWords:
            # Close the phrase that was open, if any.
            if start >= 0:
                phrases.append((start, end))
            start = -1
        elif key not in NEEDS_TEN or nextword in TEN_FORMS:
            if start < 0:
                start = i
            end = i
    # Record a phrase that runs up to the end of the list.
    if start >= 0:
        phrases.append((start, end))
    return phrases
def getPreviousTag(word):
    """Return the case tag associated with a word.

    The word is stripped of its diacritics and looked up, in this order, in
    NOUN_NASEB_LIST, JAR_LIST and RAFE3_LIST; the first list that contains it
    decides the tag (accusative, genitive or nominative, as Arabic text).
    @param word: given word
    @type word: unicode
    @return :word tag, or an empty string when no list contains the word
    @rtype: unicode
    """
    bare = araby.stripTashkeel(word)
    if bare in NOUN_NASEB_LIST:
        return u'منصوب'
    if bare in JAR_LIST:
        return u'مجرور'
    if bare in RAFE3_LIST:
        return u'مرفوع'
    return u''
def vocalizeUnit(numeric, unit):
    """ Vocalize the unit word that follows a number.

    The form is chosen from UnitWords[unit] according to the number:
    a multiple of 100 selects form 'a', a remainder (modulo 100) of at most
    10 selects form 'p', any other remainder selects form 'n'.
    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit, the unchanged unit when it is not a known
        unit word or when the number is between 0 and 2, or 'Error' followed
        by the selected tag when the selected form is empty.
    @rtype: unicode
    """
    unit_nm = araby.stripTashkeel(unit)
    # The given word is not a unit: nothing to vocalize.
    if not isUnit(unit_nm):
        return unit

    # Numbers one and two require an adjective placed after the unit;
    # this case is not implemented.
    if numeric >= 0 and numeric <= 2:
        return unit

    forms = UnitWords[unit_nm]
    remainder = numeric % 100
    if remainder == 0:
        # Hundreds, thousands, millions and so on take a singular genitive
        # unit, for example "a thousand men".  Every multiple of 1000 is
        # also a multiple of 100, so one test is enough.  The tag is stored
        # in "tags" so that the error string below reports it.
        tags = u'SingleMajrour'
        vocalizedUnit = forms['a']
    elif remainder <= 10:
        # Single-digit numbers (and ten) take a plural unit.
        tags = u'Plural'
        vocalizedUnit = forms['p']
    elif remainder < 100:
        tags = u'SingleMansoub'
        vocalizedUnit = forms['n']
    else:
        # Fallback kept for values that fail every comparison above.
        tags = u''
        vocalizedUnit = forms['i']

    if not vocalizedUnit:
        return 'Error' + tags
    return vocalizedUnit
Beispiel #10
0
def getPreviousTag(word):
    """Map a word to a case tag by list membership.

    Diacritics are removed before the lookup.  NOUN_NASEB_LIST is checked
    first, then JAR_LIST, then RAFE3_LIST.
    @param word: given word
    @type word: unicode
    @return :word tag (empty when the word is in none of the lists)
    @rtype: unicode
    """
    stripped = araby.stripTashkeel(word)
    return (
        u'منصوب' if stripped in NOUN_NASEB_LIST
        else u'مجرور' if stripped in JAR_LIST
        else u'مرفوع' if stripped in RAFE3_LIST
        else u''
    )
Beispiel #11
0
def vocalizeUnit(numeric, unit):
    """ Vocalize a unit word according to the number it follows.
    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit; the unit itself when it is not a unit word
        or when numeric is between 0 and 2; 'Error' plus the selected tag
        when UnitWords holds an empty form for the selected case.
    @rtype: unicode
    """
    # The given word is not a unit
    unit_nm = araby.stripTashkeel(unit)
    if not isUnit(unit_nm):
        return unit

    # A number between one and two needs an adjective that comes after the
    # unit.  This case is not handled.
    if 0 <= numeric <= 2:
        return unit

    rest = numeric % 100
    # Pick the tag and the key of the form to read from UnitWords.
    if rest == 0:
        # The unit after a hundred, a thousand, a million or a billion is a
        # singular genitive, e.g. "a thousand men".  A multiple of 1000 is
        # always a multiple of 100, so no separate test is needed.
        # The original assigned this tag to a misspelled variable ("tag"),
        # which dropped it from the error string.
        tags, form = u'SingleMajrour', 'a'
    elif rest <= 10:
        # A single-digit number takes a plural unit.
        tags, form = u'Plural', 'p'
    elif rest < 100:
        tags, form = u'SingleMansoub', 'n'
    else:
        # Defensive fallback for values that satisfy none of the tests.
        tags, form = u'', 'i'

    vocalizedUnit = UnitWords[unit_nm][form]
    if not vocalizedUnit:
        return 'Error' + tags
    return vocalizedUnit
Beispiel #12
0
def normalize_text(text):
    '''
    Return the text after the whole normalisation chain has been applied.

    The steps run in this order:
    *   araby.stripTashkeel (diacritics)
    *   araby.stripTatweel (tatweel)
    *   normalize_lamalef
    *   normalize_hamza
    *   normalize_spellerrors

    >>> normalize_text('')
    u''
    '''
    without_diacritics = araby.stripTashkeel(text)
    without_tatweel = araby.stripTatweel(without_diacritics)
    lamalef_done = normalize_lamalef(without_tatweel)
    hamza_done = normalize_hamza(lamalef_done)
    return normalize_spellerrors(hamza_done)
Beispiel #13
0
def normalize_text(text):
    '''
    Normalize a text and return the result.

    Innermost call first, the text goes through: diacritics stripping,
    tatweel stripping, lam-alef normalisation, hamza normalisation and
    spelling-error normalisation.

    >>> normalize_text('')
    u''
    '''
    return normalize_spellerrors(
        normalize_hamza(
            normalize_lamalef(
                araby.stripTatweel(
                    araby.stripTashkeel(text)
                )
            )
        )
    )
def vocalizeNumber(wordlist, synTags=""):
    """ Vocalize the words of a number phrase.

    A one-word list is handled separately: the word is looked up (after
    removing a leading waw) and replaced by the 'i' form found in
    VocalizedNumberWords, unless it is one of a few ambiguous words.
    Longer lists are processed in two passes: the first pass collects case
    tags from the prefix of the first word and from word endings, the second
    pass picks a vocalized form for every word according to those tags.
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized words, one entry per input word.
    @rtype: list of unicode
    """
    prefix_letters = (u'و', u'ف', u'ل', u'ب', u'ك')
    tags = synTags

    if len(wordlist) == 1:
        word = wordlist[0]
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # Only a leading waw is removed here.  The original also had a
        # general prefix branch guarded by "not wordlist", which can never
        # be true for a one-element list, so it never ran and is dropped.
        if word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        # Some words are skipped because they are ambiguous with the names
        # of fractions, e.g. "a fifth" and "five" share one spelling.
        ambiguous = (u'عشر', u'خمس', u'سبع', u'تسع', u'خمسا', u'سبعا',
                     u'تسعا', u'عشرا', u'ألفين', u'عشرة', u'صفر', u'ألف')
        if key in NumberWords and key not in ambiguous:
            # No prefix is re-attached in this case.
            return [u"" + VocalizedNumberWords[key]['i']]
        return [word]

    # First pass: collect case tags.
    for i, word in enumerate(wordlist):
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # The first word can have prefixes.
        if i == 0 and word_nm and word_nm != u'واحد' and word[0] in prefix_letters:
            if word_nm[0] in (u'ل', u'ب', u'ك'):
                tags += u"مجرور"
            key = word[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in NumberWords:
            if word_nm.endswith(u'ين'):
                # Unknown case: either genitive or accusative.
                tags += u"مجهول"
            elif word_nm.endswith(u'ان') or word_nm.endswith(u'ون'):
                tags += u"مرفوع"

    # Second pass: add the diacritics.
    newlist = []
    previousKey = u''
    last = len(wordlist) - 1
    for i, word in enumerate(wordlist):
        following = wordlist[i + 1] if i < last else u""
        # Any word can carry a prefix letter here; it is vocalized and put
        # back in front of the result.
        if word and word != u'واحد' and word[0] in prefix_letters:
            key = word[1:]
            prefix = word[0]
            if prefix in (u'و', u'ف', u'ك'):
                prefix += u'َ'
            elif prefix in (u'ل', u'ب'):
                prefix += u'ِ'
        else:
            key = word
            prefix = ''

        if key in VocalizedNumberWords:
            entry = VocalizedNumberWords[key]
            followed_by_waw = following.startswith(u'و')
            if entry['s'] == "*":
                voc = prefix + entry['i']
            # Fixed accusative ending inside a compound number.
            elif following == u'عشر' or following == u'عشرة':
                voc = prefix + entry['n']
            # Fixed accusative ending inside a compound number.
            elif key == u'عشر' and previousKey in NumberTenMasculinUnits:
                voc = u'عَشَرَ'
            elif key == u'عشرة' and previousKey in NumberTenFemininUnits:
                voc = u'عَشْرَةَ'
            # Nominative.
            elif u'مرفوع' in tags:
                voc = prefix + entry['r2' if followed_by_waw else 'r']
            # Unknown case.
            elif u'مجهول' in tags:
                voc = prefix + entry['i']
            # Genitive.
            elif u'مجرور' in tags:
                voc = prefix + entry['j2' if followed_by_waw else 'j']
            # Accusative.
            elif u'منصوب' in tags:
                voc = prefix + entry['n2' if followed_by_waw else 'n']
            else:
                voc = prefix + entry['i']
            newlist.append(voc)
        else:
            newlist.append(prefix + key)
        previousKey = key
    return newlist
Beispiel #15
0
def vocalizeNumber(wordlist, synTags=""):
    """ Vocalize the words of a number phrase.

    A one-word list is handled on its own: after removing a leading waw the
    word is replaced by its 'i' form from VocalizedNumberWords, except for a
    few ambiguous words that are returned unchanged.  For longer lists a
    first pass gathers case tags (from the prefix of the first word and from
    the word endings) and a second pass selects the vocalized form of every
    word according to those tags.
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized words, one entry per input word.
    @rtype: list of unicode
    """
    PREFIXES = (u'و', u'ف', u'ل', u'ب', u'ك')
    tags = synTags

    if len(wordlist) == 1:
        word = wordlist[0]
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # The original carried a general prefix branch guarded by
        # "not wordlist"; that test is always False for a one-element list,
        # so only the leading-waw rule ever applied.
        if word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        # Some words are left out because they are ambiguous with fraction
        # names, e.g. "a fifth" and "five" are spelled alike.
        skipped = (u'عشر', u'خمس', u'سبع', u'تسع', u'خمسا', u'سبعا',
                   u'تسعا', u'عشرا', u'ألفين', u'عشرة', u'صفر', u'ألف')
        voc = word
        # "in" replaces dict.has_key(), which was removed in Python 3.
        if key in NumberWords and key not in skipped:
            # The stripped waw is not put back on the result.
            voc = u"" + VocalizedNumberWords[key]['i']
        return [voc]

    # Pass 1: detect tags.
    for i in range(len(wordlist)):
        word = wordlist[i]
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        # The first word can have prefixes.
        if i == 0 and word_nm and word_nm != u'واحد' and word[0] in PREFIXES:
            if word_nm[0] in (u'ل', u'ب', u'ك'):
                tags += u"مجرور"
            key = word[1:]
        elif word_nm != u'واحد' and word_nm.startswith(u'و'):
            key = word_nm[1:]
        if key in NumberWords:
            if word_nm.endswith(u'ين'):
                # Either genitive or accusative: tagged as unknown.
                tags += u"مجهول"
            elif word_nm.endswith(u'ان') or word_nm.endswith(u'ون'):
                tags += u"مرفوع"

    # Pass 2: add the diacritics.
    newlist = []
    previousKey = u''
    for i in range(len(wordlist)):
        word = wordlist[i]
        if i + 1 < len(wordlist):
            nextword = wordlist[i + 1]
        else:
            nextword = u""
        key = word
        prefix = ''
        # In this pass every word may carry a prefix letter; the letter gets
        # its own vowel and is put back in front of the vocalized form.
        if word and word != u'واحد' and word[0] in PREFIXES:
            key = word[1:]
            prefix = word[0]
            if prefix in (u'و', u'ف', u'ك'):
                prefix += u'َ'
            elif prefix in (u'ل', u'ب'):
                prefix += u'ِ'

        if key not in VocalizedNumberWords:
            newlist.append(prefix + key)
            previousKey = key
            continue

        forms = VocalizedNumberWords[key]
        before_waw = nextword.startswith(u'و')
        if forms['s'] == "*":
            voc = prefix + forms['i']
        # Fixed accusative ending inside a compound number.
        elif nextword == u'عشر' or nextword == u'عشرة':
            voc = prefix + forms['n']
        # Fixed accusative ending inside a compound number.
        elif key == u'عشر' and previousKey in NumberTenMasculinUnits:
            voc = u'عَشَرَ'
        elif key == u'عشرة' and previousKey in NumberTenFemininUnits:
            voc = u'عَشْرَةَ'
        # Nominative.
        elif u'مرفوع' in tags:
            voc = prefix + (forms['r2'] if before_waw else forms['r'])
        # Unknown case.
        elif u'مجهول' in tags:
            voc = prefix + forms['i']
        # Genitive.
        elif u'مجرور' in tags:
            voc = prefix + (forms['j2'] if before_waw else forms['j'])
        # Accusative.
        elif u'منصوب' in tags:
            voc = prefix + (forms['n2'] if before_waw else forms['n'])
        else:
            voc = prefix + forms['i']
        newlist.append(voc)
        previousKey = key
    return newlist