Beispiel #1
0
    def check_word_as_pounct(self, word):
        """
        Check if the word is a punctuation mark or a number.

        @param word: the input word.
        @type word: unicode.
        @return: list of stemmedWord objects (analyzed words with tags).
        @rtype: list.
        """
        detailed_result = []

        def _result_entry(tags, wordtype):
            # Both the NUMBER and POUNCT analyses share the same result
            # skeleton; only 'tags' and 'type' differ, so build it once here.
            return stemmedword.stemmedWord({
                "word": word,
                "procletic": "",
                "encletic": "",
                "prefix": "",
                "suffix": "",
                "stem": "",
                "original": word,
                "vocalized": word,
                "tags": tags,
                "type": wordtype,
                "root": "",
                "template": "",
                "freq": 0,
                "syntax": "",
            })

        # ToDo : fix it to isdigit, by moatz saad
        # NOTE(review): isnumeric() also accepts non-ASCII numeric forms;
        # confirm whether isdigit() is the intended behavior.
        if word.isnumeric():
            detailed_result.append(
                _result_entry(self.get_number_tags(word), "NUMBER"))
        # A word can also appear in the punctuation table; both analyses
        # are kept (not elif), matching the original behavior.
        if word in stem_pounct_const.POUNCTUATION:
            detailed_result.append(
                _result_entry(stem_pounct_const.POUNCTUATION[word]["tags"],
                              "POUNCT"))
        return detailed_result
	def stemming_stopword(self, word):
		"""
		Analyze a word morphologically as a stop word.
		@param word: the input word.
		@type word: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
		# collected stemming analyses
		detailed_result = []
		# look up the stop word in the dictionary; the lookup returns
		# the ids of every matching entry (tashkeel can be returned)
		#word = araby.stripTashkeel(word);
		sw_id_list = self.swDictionary.lookup(word)
		for sw_id in sw_id_list:
			sw_tuple = self.swDictionary.getEntryById(sw_id)
			detailed_result.append(stemmedword.stemmedWord({
				'word': word,
				'procletic': sw_tuple['procletic'],
				'encletic': sw_tuple['encletic'],
				'prefix': '',
				'suffix': '',
				'stem': sw_tuple['stem'],
				'original': sw_tuple['original'],
				'vocalized': sw_tuple['vocalized'],
				'tags': sw_tuple['tags'],
				'type': sw_tuple['type'],
				'root': '',
				'template': '',
				'freq': 'freqstopword',
				'originaltags': sw_tuple['tags'],
				'syntax': '',
			}))
		return detailed_result
Beispiel #3
0
    def steming_second_level(self, noun, noun2, procletic, encletic):
        """
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemmed from syntactic affixes.
		@type noun2: unicode.
		@param procletic: the syntactic prefix extracted in the first stage.
		@type procletic: unicode.
		@param encletic: the syntactic suffix extracted in the first stage.
		@type encletic: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        detailed_result = []
        # segment the conjugated form into all candidate prefix/stem/suffix cuts
        list_seg_conj = self.conjStemmer.segment(noun2)
        # keep only segmentations whose affixes are mutually compatible
        list_seg_conj = self.verify_affix(
            noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            # seg_conj is a (start, end) pair delimiting the stem inside noun2
            prefix_conj = noun2[:seg_conj[0]]
            stem_conj = noun2[seg_conj[0]:seg_conj[1]]
            suffix_conj = noun2[seg_conj[1]:]
            affix_conj = prefix_conj + '-' + suffix_conj
            # get all vocalized forms of the (unvocalized) suffix
            for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                    suffix_conj]['vocalized']:
                # if u'تنوين' not in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']:
                seg_conj_voc = {
                    'prefix': '',
                    'suffix': vocalized_suffix,
                    'stem': stem_conj
                }
                # verify compatibility between proclitics/enclitics and affix
                if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                      vocalized_suffix)):
                    # verify the existence of a noun stamp in the dictionary
                    # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                    # list_seg_conj2.append(seg_conj)
                    list_seg_conj_voc.append(seg_conj_voc)

        list_seg_conj = list_seg_conj_voc
        for seg_conj in list_seg_conj:
            prefix_conj = seg_conj['prefix']
            stem_conj = seg_conj['stem']
            suffix_conj = seg_conj['suffix']
            # True when the suffix carries a plural or a dual tag
            has_plural_suffix = (
                (u"جمع"
                 in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
                or
                (u"مثنى"
                 in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
            )
            #print "has_plural", has_plural_suffix;
            affix_conj = '-'.join([prefix_conj, suffix_conj])
            # normalize hamza before guessing the different origins
            stem_conj = araby.normalizeHamza(stem_conj)
            if self.debug:
                print "*\t", "-".join(
                    [str(len(stem_conj)), prefix_conj, stem_conj,
                     suffix_conj]).encode("utf8")
            # generate possible stems
            # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                      suffix_conj)
            if self.debug:
                print "\tpossible original nouns:  ", "\t".join(
                    possible_noun_list).encode('utf8')
            # search the noun in the dictionary
            # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in possible_noun_list:
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up in broken plural dictionary
                infnoun_foundL = self.nounDictionary.lookup(infnoun)
                #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
                ##							listsingle=self.find_broken_plural(infnoun);
                ##							print ' *****','-'.join(listsingle).encode('utf8')
                if len(infnoun_foundL) > 0:
                    if self.debug: print "\t in dict", infnoun.encode('utf8')
                else:
                    if self.debug:
                        print infnoun.encode('utf8'), "not found in dictionary"
                infnoun_form_list += infnoun_foundL
            for id in infnoun_form_list:
                noun_tuple = self.nounDictionary.getEntryById(id)
                infnoun = noun_tuple['vocalized']
                # affixes tags contain prefixes and suffixes tags
                affix_tags  =  stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                    +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
                    +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']
                # test if the given dictionary word accepts the tags
                # contributed by the affixes:
                # check compatibility of the affixes with the noun's features,
                # e.g. whether the noun accepts the feminine marker.
                # if not self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj):

                if self.validateTags(noun_tuple, affix_tags, procletic,
                                     encletic, suffix_conj):
                    # if the result vocalized noun is not the same length
                    vocalized = self.vocalize(infnoun, procletic, prefix_conj,
                                              suffix_conj, encletic)
                    # the noun can have some harakat or shadda, then we must remove all tashkeel and compare
                    # vocalized_nm=araby.stripTashkeel(vocalized);
                    # noun_nm=araby.stripTashkeel(noun);

                    original = noun_tuple['original']
                    wordtype = noun_tuple['wordtype']
                    # add some tags from the dictionary entry, such as
                    # diptote (mamnou3 min sarf) and broken plural
                    originalTags = []
                    if noun_tuple['mamnou3_sarf'] == u"ممنوع من الصرف":
                        originalTags.append(u"ممنوع من الصرف")
                    if noun_tuple['number'] == u"جمع تكسير":
                        originalTags.append(u"جمع تكسير")
                        # affix_tags+=(,);
                    detailed_result.append(
                        stemmedword.stemmedWord({
                            'word':
                            noun,
                            'procletic':
                            procletic,
                            'encletic':
                            encletic,
                            'prefix':
                            prefix_conj,
                            'suffix':
                            suffix_conj,
                            'stem':
                            stem_conj,
                            'original':
                            infnoun,  #original,
                            'vocalized':
                            vocalized,
                            'tags':
                            u':'.join(affix_tags),
                            'type':
                            u':'.join(['Noun', wordtype]),  #'Noun',
                            'root':
                            '',
                            'template':
                            '',
                            'freq':
                            'freqnoun',  # to note the frequency type 
                            'originaltags':
                            u':'.join(originalTags),
                            'syntax':
                            '',
                        }))
        return detailed_result
Beispiel #4
0
    def check_text(self, text, mode='all'):
        """
		Analyze text morphologically.
The analyzed data given by morphological analyzer Qalsadi have the following format:
				"<th>المدخل</th>", "<th>تشكيل</th>","<th>الأصل</th>","<th>السابقة</th>", "<th>الجذع</th>",
				"<th>اللاحقة</th>", "<th>الحالة الإعرابية</th>","<th>الجذر</th>", "<th>النوع</th><th>شيوع</th>",
				"</tr>"
		morphological Result is a list of list of dict.
		The list contains all possible morphological analysis as a dict
		[
		[
		 {
			"word": "الحياة",		# input word
			"vocalized": "الْحَيَاةُ",   # vocalized form of the input word 
			"procletic": "ال",		# the syntaxic pprefix called procletic
			"prefix": "",			# the conjugation or inflection prefix
			"stem": "حياة",			# the word stem
			"suffix": "ُ", 			# the conjugation suffix of the word
			"encletic": "",			# the syntaxic suffix
			
			"tags": "تعريف::مرفوع*", # tags of affixes and tags extracted form lexical dictionary

			"freq": 0,				# the word frequency from Word Frequency database 
			"root": "",				# the word root; not yet used
			"template": "",			# the template وزن 
			"type": "Noun:مصدر",	# the word type
			"original": "حَيَاةٌ"		#original word from lexical dictionary
			"syntax":""				# used for syntaxique analysis porpos
			},
		 {"vocalized": "الْحَيَاةِ", "suffix": "ِ", "tags": "تعريف::مجرور", "stem": "حياة", "prefix": "", "freq": 0, "encletic": "", "word": "الحياة", "procletic": "ال", "root": "", "template": "", "type": "Noun:مصدر", "original": "حَيَاةٌ", "syntax":""}, 
		 {"vocalized": "الْحَيَاةَ", "suffix": "َ", "tags": "تعريف::منصوب", "stem": "حياة", "prefix": "", "freq": 0, "encletic": "", "word": "الحياة", "procletic": "ال", "root": "", "template": "", "type": "Noun:مصدر", "original": "حَيَاةٌ", "syntax":""}
		],
		[ 
		 {"vocalized": "جَمِيلَةُ", "suffix": "َةُ", "tags": "::مؤنث:مرفوع:ممنوع من الصرف", "stem": "جميل", "prefix": "", "freq": 63140, "encletic": "", "word": "جميلة", "procletic": "", "root": "", "template": "", "type": "Noun:صيغة مبالغة", "original": "جَمِيلٌ", "syntax":""}, 
		 {"vocalized": "جَمِيلَةِ", "suffix": "َةِ", "tags": "::مؤنث:مجرور:ممنوع من الصرف", "stem": "جميل", "prefix": "", "freq": 63140, "encletic": "", "word": "جميلة", "procletic": "", "root": "", "template": "", "type": "Noun:صيغة مبالغة", "original": "جَمِيلٌ"}, {"vocalized": "جَمِيلَةَ", "suffix": "َةَ", "tags": "::مؤنث:منصوب:ممنوع من الصرف", "stem": "جميل", "prefix": "", "freq": 63140, "encletic": "", "word": "جميلة", "procletic": "", "root": "", "template": "", "type": "Noun:صيغة مبالغة", "original": "جَمِيلٌ", "syntax":""}
		]
		],
		@param text: the input text.
		@type text: unicode.
		@param mode: the mode of analysis as 'verbs', 'nouns', or 'all'.
		@type mode: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        list_word = self.text_tokenize(text)
        if self.allowTagGuessing:
            list_guessed_tag = self.tagger.wordTagging(list_word)
            # avoid errors
            if len(list_guessed_tag) != len(list_word):
                #if the two lists have'nt the same length,
                # we construct a empty list for tags with the same length
                # print "error on guess tags"
                # sys.exit();
                list_guessed_tag = ['nv'] * len(list_word)
        # disambiguate  some words to speed up the analysis
        # newWordlist = self.disambiguator.disambiguateWords( list_word, list_guessed_tag);
        if self.allowDisambiguation:
            newWordlist = self.disambiguator.disambiguateWords(
                list_word, list_guessed_tag)
            # avoid the incomplete list
            if len(newWordlist) == len(list_word):
                list_word = newWordlist
                # print u" ".join(list_word).encode('utf8');
                # print u" ".join(list_guessed_tag).encode('utf8');

        resulted_text = u""
        resulted_data = []
        #checkedWords={}; #global
        if mode == 'all':
            for i in range(len(list_word[:self.limit])):
                word = list_word[i]
                self.count_word(word)
                #~ if self.allowCacheUse and word in self.cache['checkedWords']: #.has_key(word):
                if self.allowCacheUse and self.cache.isAlreadyChecked(word):
                    #~ print (u"'%s'"%word).encode('utf8'), 'found'
                    one_data_list = self.cache.getChecked(word)
                    Stemmed_one_data_list = [
                        stemmedword.stemmedWord(w) for w in one_data_list
                    ]
                    resulted_data.append(Stemmed_one_data_list)
                else:
                    guessedTag = list_guessed_tag[i]
                    #~ print (u"'%s'"%word).encode('utf8'), ' not'
                    one_data_list = self.check_word(word, guessedTag)
                    Stemmed_one_data_list = [
                        stemmedword.stemmedWord(w) for w in one_data_list
                    ]
                    resulted_data.append(Stemmed_one_data_list)

                    #~ resulted_data.append(one_data_list);
                    #~ if self.allowCacheUse: self.cache['checkedWords'][word]=one_data_list;
                    one_data_list_to_serialize = [
                        w.__dict__ for w in one_data_list
                    ]
                    if self.allowCacheUse:
                        self.cache.addChecked(word, one_data_list_to_serialize)

        elif mode == 'nouns':

            for word in list_word[:self.limit]:
                one_data_list = self.check_word_as_noun(word)
                Stemmed_one_data_list = [
                    stemmedword.stemmedWord(w) for w in one_data_list
                ]
                resulted_data.append(Stemmed_one_data_list)
                #~ resulted_data.append(one_data_list);
        elif mode == 'verbs':
            for word in list_word[:self.limit]:
                one_data_list = self.check_word_as_verb(word)
                Stemmed_one_data_list = [
                    stemmedword.stemmedWord(w) for w in one_data_list
                ]
                resulted_data.append(Stemmed_one_data_list)
                #~ resulted_data.append(one_data_list);
        return resulted_data
Beispiel #5
0
    def check_word(self, word, guessedTag=""):
        """
		Analyze one word morphologically as verbs
		@param word: the input word.
		@type word: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        word = araby.stripTatweel(word)
        word_vocalised = word
        word_nm = araby.stripTashkeel(word)
        resulted_text = u""
        resulted_data = []
        # if word is a pounctuation
        resulted_data += self.check_word_as_pounct(word_nm)
        # Done: if the word is a stop word we have  some problems,
        # the stop word can also be another normal word (verb or noun),
        # we must consider it in future works
        # if word is stopword allow stop words analysis
        resulted_data += self.check_word_as_stopword(word_nm)

        # if word is verb
        # مشكلة بعض الكلمات المستبعدة تعتبر أفعلا أو اسماء
        if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
            resulted_data += self.check_word_as_verb(word_nm)
            # print "is verb", rabti,len(resulted_data);
            # if word is noun
        if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
            resulted_data += self.check_word_as_noun(word_nm)
        if len(resulted_data) == 0:
            # check the word as unkonwn
            resulted_data += self.check_word_as_unknown(word_nm)
            # check if the word is nomralized and sollution are equivalent
        resulted_data = self.check_normalized(word_vocalised, resulted_data)
        # check if the word is shadda like
        resulted_data = self.check_shadda(word_vocalised, resulted_data)

        # check if the word is vocalized like results
        if self.partial_vocalization_support:
            resulted_data = self.check_partial_vocalized(word_vocalised, resulted_data)
            # add word frequency information in tags
        resulted_data = self.addWordFrequency(resulted_data)

        if len(resulted_data) == 0:
            resulted_data.append(
                stemmedword.stemmedWord(
                    {
                        "word": word,
                        "procletic": "",
                        "encletic": "",
                        "prefix": "",
                        "suffix": "",
                        "stem": "",
                        "original": word,
                        "vocalized": word,
                        "tags": u"",
                        "type": "unknown",
                        "root": "",
                        "template": "",
                        "freq": self.wordfreq.getFreq(word, "unknown"),
                        "syntax": "",
                    }
                )
            )
        return resulted_data
Beispiel #6
0
    def check_text(self, text, mode="all"):
        """
		Analyze text morphologically.
The analyzed data given by morphological analyzer Qalsadi have the following format:
				"<th>المدخل</th>", "<th>تشكيل</th>","<th>الأصل</th>","<th>السابقة</th>", "<th>الجذع</th>",
				"<th>اللاحقة</th>", "<th>الحالة الإعرابية</th>","<th>الجذر</th>", "<th>النوع</th><th>شيوع</th>",
				"</tr>"
		morphological Result is a list of list of dict.
		The list contains all possible morphological analysis as a dict
		[
		[
		 {
			"word": "الحياة",		# input word
			"vocalized": "الْحَيَاةُ",   # vocalized form of the input word 
			"procletic": "ال",		# the syntaxic pprefix called procletic
			"prefix": "",			# the conjugation or inflection prefix
			"stem": "حياة",			# the word stem
			"suffix": "ُ", 			# the conjugation suffix of the word
			"encletic": "",			# the syntaxic suffix
			
			"tags": "تعريف::مرفوع*", # tags of affixes and tags extracted form lexical dictionary

			"freq": 0,				# the word frequency from Word Frequency database 
			"root": "",				# the word root; not yet used
			"template": "",			# the template وزن 
			"type": "Noun:مصدر",	# the word type
			"original": "حَيَاةٌ"		#original word from lexical dictionary
			"syntax":""				# used for syntaxique analysis porpos
			},
		 {"vocalized": "الْحَيَاةِ", "suffix": "ِ", "tags": "تعريف::مجرور", "stem": "حياة", "prefix": "", "freq": 0, "encletic": "", "word": "الحياة", "procletic": "ال", "root": "", "template": "", "type": "Noun:مصدر", "original": "حَيَاةٌ", "syntax":""}, 
		 {"vocalized": "الْحَيَاةَ", "suffix": "َ", "tags": "تعريف::منصوب", "stem": "حياة", "prefix": "", "freq": 0, "encletic": "", "word": "الحياة", "procletic": "ال", "root": "", "template": "", "type": "Noun:مصدر", "original": "حَيَاةٌ", "syntax":""}
		],
		[ 
		 {"vocalized": "جَمِيلَةُ", "suffix": "َةُ", "tags": "::مؤنث:مرفوع:ممنوع من الصرف", "stem": "جميل", "prefix": "", "freq": 63140, "encletic": "", "word": "جميلة", "procletic": "", "root": "", "template": "", "type": "Noun:صيغة مبالغة", "original": "جَمِيلٌ", "syntax":""}, 
		 {"vocalized": "جَمِيلَةِ", "suffix": "َةِ", "tags": "::مؤنث:مجرور:ممنوع من الصرف", "stem": "جميل", "prefix": "", "freq": 63140, "encletic": "", "word": "جميلة", "procletic": "", "root": "", "template": "", "type": "Noun:صيغة مبالغة", "original": "جَمِيلٌ"}, {"vocalized": "جَمِيلَةَ", "suffix": "َةَ", "tags": "::مؤنث:منصوب:ممنوع من الصرف", "stem": "جميل", "prefix": "", "freq": 63140, "encletic": "", "word": "جميلة", "procletic": "", "root": "", "template": "", "type": "Noun:صيغة مبالغة", "original": "جَمِيلٌ", "syntax":""}
		]
		],
		@param text: the input text.
		@type text: unicode.
		@param mode: the mode of analysis as 'verbs', 'nouns', or 'all'.
		@type mode: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        list_word = self.text_tokenize(text)
        if self.allowTagGuessing:
            list_guessed_tag = self.tagger.wordTagging(list_word)
            # avoid errors
            if len(list_guessed_tag) != len(list_word):
                # if the two lists have'nt the same length,
                # we construct a empty list for tags with the same length
                # print "error on guess tags"
                # sys.exit();
                list_guessed_tag = ["nv"] * len(list_word)
                # disambiguate  some words to speed up the analysis
                # newWordlist = self.disambiguator.disambiguateWords( list_word, list_guessed_tag);
        if self.allowDisambiguation:
            newWordlist = self.disambiguator.disambiguateWords(list_word, list_guessed_tag)
            # avoid the incomplete list
            if len(newWordlist) == len(list_word):
                list_word = newWordlist
                # print u" ".join(list_word).encode('utf8');
                # print u" ".join(list_guessed_tag).encode('utf8');

        resulted_text = u""
        resulted_data = []
        # checkedWords={}; #global
        if mode == "all":
            for i in range(len(list_word[: self.limit])):
                word = list_word[i]
                self.count_word(word)
                # ~ if self.allowCacheUse and word in self.cache['checkedWords']: #.has_key(word):
                if self.allowCacheUse and self.cache.isAlreadyChecked(word):
                    # ~ print (u"'%s'"%word).encode('utf8'), 'found'
                    one_data_list = self.cache.getChecked(word)
                    Stemmed_one_data_list = [stemmedword.stemmedWord(w) for w in one_data_list]
                    resulted_data.append(Stemmed_one_data_list)
                else:
                    guessedTag = list_guessed_tag[i]
                    # ~ print (u"'%s'"%word).encode('utf8'), ' not'
                    one_data_list = self.check_word(word, guessedTag)
                    Stemmed_one_data_list = [stemmedword.stemmedWord(w) for w in one_data_list]
                    resulted_data.append(Stemmed_one_data_list)

                    # ~ resulted_data.append(one_data_list);
                    # ~ if self.allowCacheUse: self.cache['checkedWords'][word]=one_data_list;
                    one_data_list_to_serialize = [w.__dict__ for w in one_data_list]
                    if self.allowCacheUse:
                        self.cache.addChecked(word, one_data_list_to_serialize)

        elif mode == "nouns":

            for word in list_word[: self.limit]:
                one_data_list = self.check_word_as_noun(word)
                Stemmed_one_data_list = [stemmedword.stemmedWord(w) for w in one_data_list]
                resulted_data.append(Stemmed_one_data_list)
                # ~ resulted_data.append(one_data_list);
        elif mode == "verbs":
            for word in list_word[: self.limit]:
                one_data_list = self.check_word_as_verb(word)
                Stemmed_one_data_list = [stemmedword.stemmedWord(w) for w in one_data_list]
                resulted_data.append(Stemmed_one_data_list)
                # ~ resulted_data.append(one_data_list);
        return resulted_data
Beispiel #7
0
    def steming_second_level(self, noun, noun2, procletic, encletic):
        """
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemmed from syntactic affixes.
		@type noun2: unicode.
		@param procletic: the syntactic prefix extracted in the first stage.
		@type procletic: unicode.
		@param encletic: the syntactic suffix extracted in the first stage.
		@type encletic: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        detailed_result = []
        # segment the conjugated form into all candidate prefix/stem/suffix cuts
        list_seg_conj = self.conjStemmer.segment(noun2)
        # keep only segmentations whose affixes are mutually compatible
        list_seg_conj = self.verify_affix(noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)

        # add vocalized forms of suffixes
        list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            # seg_conj is a (start, end) pair delimiting the stem inside noun2
            prefix_conj = noun2[: seg_conj[0]]
            stem_conj = noun2[seg_conj[0] : seg_conj[1]]
            suffix_conj = noun2[seg_conj[1] :]
            affix_conj = prefix_conj + "-" + suffix_conj
            # get all vocalized forms of the (unvocalized) suffix
            for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["vocalized"]:
                seg_conj_voc = {"prefix": "", "suffix": vocalized_suffix, "stem": stem_conj}
                # verify compatibility between proclitics/enclitics and affix
                if self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix):
                    # verify the existence of a noun stamp in the dictionary
                    # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                    # list_seg_conj2.append(seg_conj)
                    list_seg_conj_voc.append(seg_conj_voc)
        list_seg_conj = list_seg_conj_voc
        for seg_conj in list_seg_conj:
            prefix_conj = seg_conj["prefix"]
            stem_conj = seg_conj["stem"]
            suffix_conj = seg_conj["suffix"]
            # True when the suffix carries a plural or a dual tag
            has_plural_suffix = (u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["tags"]) or (
                u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["tags"]
            )
            # print "has_plural", has_plural_suffix;
            affix_conj = "-".join([prefix_conj, suffix_conj])
            # normalize hamza before guessing the different origins
            stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
            if self.debug:
                print "*\t", "-".join([str(len(stem_conj)), prefix_conj, stem_conj, suffix_conj]).encode("utf8")
                # generate possible stems
                # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = self.getStemVariants(stem_conj, prefix_conj, suffix_conj)
            if self.debug:
                print "\tpossible original nouns:  ", "\t".join(possible_noun_list).encode("utf8")
                # search the noun in the dictionary
                # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in possible_noun_list:
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up in broken plural dictionary
                infnoun_foundL = self.nounDictionary.lookup(infnoun, "unknown")
                # infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
                ##							listsingle=self.find_broken_plural(infnoun);
                ##							print ' *****','-'.join(listsingle).encode('utf8')
                if len(infnoun_foundL) > 0:
                    if self.debug:
                        print "\t in dict", infnoun.encode("utf8")
                else:
                    if self.debug:
                        print infnoun.encode("utf8"), "not found in dictionary"
                infnoun_form_list += infnoun_foundL
            for id in infnoun_form_list:
                noun_tuple = self.nounDictionary.getEntryById(id)
                infnoun = noun_tuple["vocalized"]
                originalTags = ()
                original = noun_tuple["vocalized"]
                wordtype = noun_tuple["word_type"]
                detailed_result.append(
                    stemmedword.stemmedWord(
                        {
                            "word": noun,
                            "procletic": procletic,
                            "encletic": encletic,
                            "prefix": prefix_conj,
                            "suffix": suffix_conj,
                            "stem": stem_conj,
                            "original": infnoun,  # original,
                            "vocalized": self.vocalize(infnoun, procletic, prefix_conj, suffix_conj, encletic),
                            "tags": u":".join(
                                stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]["tags"]
                                + stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]["tags"]
                                + stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["tags"]
                            ),
                            "type": u":".join(["Noun", wordtype]),  #'Noun',
                            "root": "",
                            "template": "",
                            "freq": noun_tuple["freq"],  # self.wordfreq.getFreq(infnoun,'noun'),
                            "originaltags": u":".join(originalTags),
                            "syntax": "",
                        }
                    )
                )
        return detailed_result
Beispiel #8
0
	def steming_second_level(self,noun,noun2,procletic,encletic):
		"""
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemed from syntaxic affixes.
		@type noun2: unicode.
		@param procletic: the syntaxic prefixe extracted in the fisrt stage.
		@type procletic: unicode.
		@param encletic: the syntaxic suffixe extracted in the fisrt stage.
		@type encletic: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""	
		detailed_result=[];
		#segment the coinjugated verb
		list_seg_conj=self.conjStemmer.segment(noun2);
		# verify affix compatibility
		list_seg_conj=self.verify_affix(noun2,list_seg_conj,stem_noun_const.NOMINAL_CONJUGATION_AFFIX);
		# add vocalized forms of suffixes
		list_seg_conj_voc=[];
		for seg_conj in list_seg_conj:
			prefix_conj=noun2[:seg_conj[0]];
			stem_conj=noun2[seg_conj[0]:seg_conj[1]]
			suffix_conj=noun2[seg_conj[1]:]
			affix_conj=prefix_conj+'-'+suffix_conj;
			# get all vocalized form of suffixes
			for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
				# if u'تنوين' not in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']:
				seg_conj_voc={'prefix':'','suffix':vocalized_suffix,'stem':stem_conj}
				# verify compatibility between procletics and afix
				if (self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix)):
				# verify the existing of a noun stamp in the dictionary
				# if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
					# list_seg_conj2.append(seg_conj)
					list_seg_conj_voc.append(seg_conj_voc)

		list_seg_conj=list_seg_conj_voc;
		for seg_conj in list_seg_conj:
			prefix_conj=seg_conj['prefix'];
			stem_conj=seg_conj['stem']
			suffix_conj=seg_conj['suffix']
			has_plural_suffix=((u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or( u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
			#print "has_plural", has_plural_suffix;
			affix_conj='-'.join([prefix_conj,suffix_conj])
			# noirmalize hamza before gessing  deffirents origines
			stem_conj = araby.normalizeHamza(stem_conj)
			if self.debug:
				print "*\t", "-".join([str(len(stem_conj)),prefix_conj,stem_conj,suffix_conj]).encode("utf8") ;
			# generate possible stems
			# add stripped letters to the stem to constitute possible noun list
			possible_noun_list=self.getStemVariants(stem_conj,prefix_conj,suffix_conj);
			if self.debug:
				print "\tpossible original nouns:  ","\t".join(possible_noun_list).encode('utf8');
			# search the noun in the dictionary
			# we can return the tashkeel
			infnoun_form_list=[];
			for infnoun in possible_noun_list:
				# get the noun and get all its forms from the dict
				# if the noun has plural suffix, don't look up in broken plural dictionary
				infnoun_foundL=self.nounDictionary.lookup(infnoun);
				#infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
##							listsingle=self.find_broken_plural(infnoun);
##							print ' *****','-'.join(listsingle).encode('utf8')
				if len(infnoun_foundL)>0:
					if self.debug: print "\t in dict",infnoun.encode('utf8');
				else:
					if self.debug: print infnoun.encode('utf8'),"not found in dictionary"
				infnoun_form_list+=infnoun_foundL;
			for id in infnoun_form_list:
				noun_tuple=self.nounDictionary.getEntryById(id);
				infnoun=noun_tuple['vocalized'];
				# affixes tags contains prefixes and suffixes tags
				affix_tags  =  stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
								+stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
								+stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']
				#test if the  given word from dictionary accept those tags given by affixes
				# دراسة توافق الزوائد مع خصائص الاسم،
				# مثلا هل يقبل الاسم التأنيث.
				# if not self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj):
					
				if self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj):
					# if the result vocalized noun is not the same length
					vocalized=self.vocalize(infnoun,procletic,prefix_conj,suffix_conj,encletic);
					# the noun can have some harakat or shadda, then we must remove all tashkeel and compare
					# vocalized_nm=araby.stripTashkeel(vocalized);
					# noun_nm=araby.stripTashkeel(noun);

					original=noun_tuple['original'];
					wordtype=noun_tuple['wordtype'];
					#add some tags from dictionary entry as mamnou3 min sarf and broken plural
					originalTags=[];
					if noun_tuple['mamnou3_sarf']==u"ممنوع من الصرف":
						originalTags.append(u"ممنوع من الصرف");
					if noun_tuple['number']==u"جمع تكسير":
						originalTags.append(u"جمع تكسير");						
						# affix_tags+=(,);
					detailed_result.append(stemmedword.stemmedWord({
					'word':noun,
					'procletic':procletic,
					'encletic':encletic,
					'prefix':prefix_conj,
					'suffix':suffix_conj,
					'stem':stem_conj,
					'original':infnoun,#original,
					'vocalized':vocalized,
					'tags':u':'.join(affix_tags),
					'type':u':'.join(['Noun',wordtype]),#'Noun',
					'root':'',
					'template':'',
					'freq':'freqnoun', # to note the frequency type 
					'originaltags':u':'.join(originalTags),
					'syntax':'',
					}));
		return detailed_result;
Beispiel #9
0
    def steming_second_level(self, noun, noun2, procletic, encletic):
        """
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemed from syntaxic affixes.
		@type noun2: unicode.
		@param procletic: the syntaxic prefixe extracted in the fisrt stage.
		@type procletic: unicode.
		@param encletic: the syntaxic suffixe extracted in the fisrt stage.
		@type encletic: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        detailed_result = []
        #segment the coinjugated verb
        list_seg_conj = self.conjStemmer.segment(noun2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(
            noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)

        # add vocalized forms of suffixes
        list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            prefix_conj = noun2[:seg_conj[0]]
            stem_conj = noun2[seg_conj[0]:seg_conj[1]]
            suffix_conj = noun2[seg_conj[1]:]
            affix_conj = prefix_conj + '-' + suffix_conj
            # get all vocalized form of suffixes
            for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                    suffix_conj]['vocalized']:
                seg_conj_voc = {
                    'prefix': '',
                    'suffix': vocalized_suffix,
                    'stem': stem_conj
                }
                # verify compatibility between procletics and afix
                if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                      vocalized_suffix)):
                    # verify the existing of a noun stamp in the dictionary
                    # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                    # list_seg_conj2.append(seg_conj)
                    list_seg_conj_voc.append(seg_conj_voc)
        list_seg_conj = list_seg_conj_voc
        for seg_conj in list_seg_conj:
            prefix_conj = seg_conj['prefix']
            stem_conj = seg_conj['stem']
            suffix_conj = seg_conj['suffix']
            has_plural_suffix = (
                (u"جمع"
                 in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
                or
                (u"مثنى"
                 in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
            )
            #print "has_plural", has_plural_suffix;
            affix_conj = '-'.join([prefix_conj, suffix_conj])
            # noirmalize hamza before gessing  deffirents origines
            stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
            if self.debug:
                print "*\t", "-".join(
                    [str(len(stem_conj)), prefix_conj, stem_conj,
                     suffix_conj]).encode("utf8")
            # generate possible stems
            # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                      suffix_conj)
            if self.debug:
                print "\tpossible original nouns:  ", "\t".join(
                    possible_noun_list).encode('utf8')
            # search the noun in the dictionary
            # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in possible_noun_list:
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up in broken plural dictionary
                infnoun_foundL = self.nounDictionary.lookup(
                    infnoun, 'unknown')
                #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
                ##							listsingle=self.find_broken_plural(infnoun);
                ##							print ' *****','-'.join(listsingle).encode('utf8')
                if len(infnoun_foundL) > 0:
                    if self.debug: print "\t in dict", infnoun.encode('utf8')
                else:
                    if self.debug:
                        print infnoun.encode('utf8'), "not found in dictionary"
                infnoun_form_list += infnoun_foundL
            for id in infnoun_form_list:
                noun_tuple = self.nounDictionary.getEntryById(id)
                infnoun = noun_tuple['vocalized']
                originalTags = ()
                original = noun_tuple['vocalized']
                wordtype = noun_tuple['word_type']
                detailed_result.append(
                    stemmedword.stemmedWord({
                        'word':
                        noun,
                        'procletic':
                        procletic,
                        'encletic':
                        encletic,
                        'prefix':
                        prefix_conj,
                        'suffix':
                        suffix_conj,
                        'stem':
                        stem_conj,
                        'original':
                        infnoun,  #original,
                        'vocalized':
                        self.vocalize(infnoun, procletic, prefix_conj,
                                      suffix_conj, encletic),
                        'tags':
                        u':'.join(stem_noun_const.
                                  COMP_PREFIX_LIST_TAGS[procletic]['tags'] +
                                  stem_noun_const.
                                  COMP_SUFFIX_LIST_TAGS[encletic]['tags'] +
                                  stem_noun_const.
                                  CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                        'type':
                        u':'.join(['Noun', wordtype]),  #'Noun',
                        'root':
                        '',
                        'template':
                        '',
                        'freq':
                        noun_tuple[
                            'freq'],  #self.wordfreq.getFreq(infnoun,'noun'),
                        'originaltags':
                        u':'.join(originalTags),
                        'syntax':
                        '',
                    }))
        return detailed_result