def check_word_as_pounct(self, word):
    """
    Check if the word is a punctuation mark or a number.
    @param word: the input word.
    @type word: unicode.
    @return: list of stemmedWord objects of analyzed words with tags
        (empty when the word is neither).
    @rtype: list.
    """
    detailed_result = []

    def _build_entry(tags, word_type):
        # Punctuation/number tokens carry no affixes, root or template:
        # the word itself is both 'original' and 'vocalized'.
        return stemmedword.stemmedWord({
            "word": word,
            "procletic": "",
            "encletic": "",
            "prefix": "",
            "suffix": "",
            "stem": "",
            "original": word,
            "vocalized": word,
            "tags": tags,
            "type": word_type,
            "root": "",
            "template": "",
            "freq": 0,
            "syntax": "",
        })

    # ToDo : fix it to isdigit, by moatz saad
    if word.isnumeric():
        detailed_result.append(
            _build_entry(self.get_number_tags(word), "NUMBER"))
    if word in stem_pounct_const.POUNCTUATION:
        detailed_result.append(
            _build_entry(stem_pounct_const.POUNCTUATION[word]["tags"],
                         "POUNCT"))
    return detailed_result
def stemming_stopword(self, word):
    """
    Analyze a word morphologically as a stop word.

    Looks the word up in the stop-word dictionary and returns one
    analysis per matching entry, keeping the entry's vocalization
    and tags.

    @param word: the input word.
    @type word: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # search in database by unvocalized word; returns ids of all
    # matching stop-word entries (so we can recover the tashkeel).
    # word = araby.stripTashkeel(word);
    sw_id_list = self.swDictionary.lookup(word)
    for sw_id in sw_id_list:
        sw_tuple = self.swDictionary.getEntryById(sw_id)
        detailed_result.append(stemmedword.stemmedWord({
            'word': word,
            'procletic': sw_tuple['procletic'],
            'encletic': sw_tuple['encletic'],
            'prefix': '',
            'suffix': '',
            'stem': sw_tuple['stem'],
            'original': sw_tuple['original'],
            'vocalized': sw_tuple['vocalized'],
            'tags': sw_tuple['tags'],
            'type': sw_tuple['type'],
            'root': '',
            'template': '',
            'freq': 'freqstopword',  # marks the frequency source type
            'originaltags': sw_tuple['tags'],
            'syntax': '',
        }))
    return detailed_result
def steming_second_level(self, noun, noun2, procletic, encletic):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemmed from syntactic affixes.
    @type noun2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic: the syntactic suffix extracted in the first stage.
    @type encletic: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated noun into (prefix, stem, suffix) cut points
    list_seg_conj = self.conjStemmer.segment(noun2)
    # verify affix compatibility
    list_seg_conj = self.verify_affix(
        noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        # seg_conj holds the two cut positions inside noun2
        prefix_conj = noun2[:seg_conj[0]]
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj = noun2[seg_conj[1]:]
        affix_conj = prefix_conj + '-' + suffix_conj
        # get all vocalized forms of the (unvocalized) suffix
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                suffix_conj]['vocalized']:
            # if u'تنوين' not in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']:
            seg_conj_voc = {
                'prefix': '',
                'suffix': vocalized_suffix,
                'stem': stem_conj
            }
            # verify compatibility between proclitics/enclitics and affix
            if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                  vocalized_suffix)):
                # verify the existing of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                # list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    list_seg_conj = list_seg_conj_voc
    for seg_conj in list_seg_conj:
        prefix_conj = seg_conj['prefix']
        stem_conj = seg_conj['stem']
        suffix_conj = seg_conj['suffix']
        # dual/plural suffix detection (only referenced by the
        # commented-out dictionary lookup below)
        has_plural_suffix = (
            (u"جمع" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or
            (u"مثنى" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
        #print "has_plural", has_plural_suffix;
        affix_conj = '-'.join([prefix_conj, suffix_conj])
        # normalize hamza before guessing the different origins
        stem_conj = araby.normalizeHamza(stem_conj)
        if self.debug:
            print "*\t", "-".join(
                [str(len(stem_conj)), prefix_conj, stem_conj,
                 suffix_conj]).encode("utf8")
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                  suffix_conj)
        if self.debug:
            print "\tpossible original nouns: ", "\t".join(
                possible_noun_list).encode('utf8')
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in possible_noun_list:
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            infnoun_foundL = self.nounDictionary.lookup(infnoun)
            #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
            ## listsingle=self.find_broken_plural(infnoun);
            ## print ' *****','-'.join(listsingle).encode('utf8')
            if len(infnoun_foundL) > 0:
                if self.debug:
                    print "\t in dict", infnoun.encode('utf8')
            else:
                if self.debug:
                    print infnoun.encode('utf8'), "not found in dictionary"
            infnoun_form_list += infnoun_foundL
        for id in infnoun_form_list:
            noun_tuple = self.nounDictionary.getEntryById(id)
            infnoun = noun_tuple['vocalized']
            # affixes tags contain prefixes and suffixes tags
            affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
                +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']
            # test if the given word from the dictionary accepts the tags
            # given by the affixes: check affix compatibility with the
            # noun's features, e.g. does the noun accept the feminine marker?
            # if not self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj):
            if self.validateTags(noun_tuple, affix_tags, procletic,
                                 encletic, suffix_conj):
                # if the result vocalized noun is not the same length
                vocalized = self.vocalize(infnoun, procletic, prefix_conj,
                                          suffix_conj, encletic)
                # the noun can have some harakat or shadda, then we must remove all tashkeel and compare
                # vocalized_nm=araby.stripTashkeel(vocalized);
                # noun_nm=araby.stripTashkeel(noun);
                original = noun_tuple['original']
                wordtype = noun_tuple['wordtype']
                # add some tags from the dictionary entry such as
                # "mamnou3 min sarf" (diptote) and broken plural
                originalTags = []
                if noun_tuple['mamnou3_sarf'] == u"ممنوع من الصرف":
                    originalTags.append(u"ممنوع من الصرف")
                if noun_tuple['number'] == u"جمع تكسير":
                    originalTags.append(u"جمع تكسير")
                # affix_tags+=(,);
                detailed_result.append(
                    stemmedword.stemmedWord({
                        'word': noun,
                        'procletic': procletic,
                        'encletic': encletic,
                        'prefix': prefix_conj,
                        'suffix': suffix_conj,
                        'stem': stem_conj,
                        'original': infnoun,  #original,
                        'vocalized': vocalized,
                        'tags': u':'.join(affix_tags),
                        'type': u':'.join(['Noun', wordtype]),  #'Noun',
                        'root': '',
                        'template': '',
                        'freq': 'freqnoun',  # to note the frequency type
                        'originaltags': u':'.join(originalTags),
                        'syntax': '',
                    }))
    return detailed_result
def check_text(self, text, mode='all'):
    """ Analyze text morphologically.

    The result is a list with one item per token; each item is a list
    of stemmedWord objects, one per possible analysis, holding the
    affixes ('procletic', 'prefix', 'suffix', 'encletic'), the 'stem',
    the 'vocalized' form, the 'original' dictionary entry, the 'tags',
    the 'type' and the 'freq' of the analysis.

    @param text: the input text.
    @type text: unicode.
    @param mode: the mode of analysis as 'verbs', 'nouns', or 'all'.
    @type mode: unicode.
    @return: list of lists of analyzed words with tags.
    @rtype: list.
    """
    list_word = self.text_tokenize(text)
    # neutral default tags, also used when the tagger output is unusable.
    # NOTE(fix): previously list_guessed_tag was left undefined when
    # self.allowTagGuessing was False, raising NameError in mode 'all'.
    list_guessed_tag = ['nv'] * len(list_word)
    if self.allowTagGuessing:
        guessed_tags = self.tagger.wordTagging(list_word)
        # keep the neutral tags if the two lists differ in length
        if len(guessed_tags) == len(list_word):
            list_guessed_tag = guessed_tags
    # disambiguate some words to speed up the analysis
    if self.allowDisambiguation:
        newWordlist = self.disambiguator.disambiguateWords(
            list_word, list_guessed_tag)
        # avoid the incomplete list
        if len(newWordlist) == len(list_word):
            list_word = newWordlist
    resulted_data = []
    if mode == 'all':
        for i in range(len(list_word[:self.limit])):
            word = list_word[i]
            self.count_word(word)
            if self.allowCacheUse and self.cache.isAlreadyChecked(word):
                # reuse the cached (serialized) analyses
                one_data_list = self.cache.getChecked(word)
                stemmed_one_data_list = [
                    stemmedword.stemmedWord(w) for w in one_data_list
                ]
                resulted_data.append(stemmed_one_data_list)
            else:
                guessedTag = list_guessed_tag[i]
                one_data_list = self.check_word(word, guessedTag)
                stemmed_one_data_list = [
                    stemmedword.stemmedWord(w) for w in one_data_list
                ]
                resulted_data.append(stemmed_one_data_list)
                if self.allowCacheUse:
                    # cache plain dicts, not stemmedWord objects
                    one_data_list_to_serialize = [
                        w.__dict__ for w in one_data_list
                    ]
                    self.cache.addChecked(word, one_data_list_to_serialize)
    elif mode == 'nouns':
        for word in list_word[:self.limit]:
            one_data_list = self.check_word_as_noun(word)
            resulted_data.append(
                [stemmedword.stemmedWord(w) for w in one_data_list])
    elif mode == 'verbs':
        for word in list_word[:self.limit]:
            one_data_list = self.check_word_as_verb(word)
            resulted_data.append(
                [stemmedword.stemmedWord(w) for w in one_data_list])
    return resulted_data
def check_word(self, word, guessedTag=""):
    """
    Analyze one word morphologically.

    The word is checked successively as punctuation/number, stop word,
    verb and noun (the verb/noun checks are restricted by the guessed
    tag), then the candidate analyses are filtered against the input
    word's vocalization and enriched with frequency information.

    @param word: the input word.
    @type word: unicode.
    @param guessedTag: guessed word-type tag used to skip unneeded
        verb/noun analyses.
    @type guessedTag: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    word = araby.stripTatweel(word)
    word_vocalised = word
    word_nm = araby.stripTashkeel(word)
    resulted_data = []
    # if word is a punctuation or a number
    resulted_data += self.check_word_as_pounct(word_nm)
    # Done: if the word is a stop word we have some problems,
    # the stop word can also be another normal word (verb or noun),
    # we must consider it in future works
    # if word is stopword allow stop words analysis
    resulted_data += self.check_word_as_stopword(word_nm)
    # if word is verb
    # NOTE: some stop words can also be analyzed as verbs or nouns
    if self.tagger.hasVerbTag(guessedTag) or \
            self.tagger.isStopWordTag(guessedTag):
        resulted_data += self.check_word_as_verb(word_nm)
    # if word is noun
    if self.tagger.hasNounTag(guessedTag) or \
            self.tagger.isStopWordTag(guessedTag):
        resulted_data += self.check_word_as_noun(word_nm)
    if len(resulted_data) == 0:
        # check the word as unknown
        resulted_data += self.check_word_as_unknown(word_nm)
    # check if the word is normalized and solutions are equivalent
    resulted_data = self.check_normalized(word_vocalised, resulted_data)
    # check if the word is shadda like
    resulted_data = self.check_shadda(word_vocalised, resulted_data)
    # check if the word is vocalized like results
    if self.partial_vocalization_support:
        resulted_data = self.check_partial_vocalized(word_vocalised,
                                                     resulted_data)
    # add word frequency information in tags
    resulted_data = self.addWordFrequency(resulted_data)
    if len(resulted_data) == 0:
        # fallback: emit a single 'unknown' analysis for the word
        resulted_data.append(
            stemmedword.stemmedWord(
                {
                    "word": word,
                    "procletic": "",
                    "encletic": "",
                    "prefix": "",
                    "suffix": "",
                    "stem": "",
                    "original": word,
                    "vocalized": word,
                    "tags": u"",
                    "type": "unknown",
                    "root": "",
                    "template": "",
                    "freq": self.wordfreq.getFreq(word, "unknown"),
                    "syntax": "",
                }
            )
        )
    return resulted_data
def check_text(self, text, mode="all"):
    """ Analyze text morphologically.

    The result is a list with one item per token; each item is a list
    of stemmedWord objects, one per possible analysis, holding the
    affixes ("procletic", "prefix", "suffix", "encletic"), the "stem",
    the "vocalized" form, the "original" dictionary entry, the "tags",
    the "type" and the "freq" of the analysis.

    @param text: the input text.
    @type text: unicode.
    @param mode: the mode of analysis as 'verbs', 'nouns', or 'all'.
    @type mode: unicode.
    @return: list of lists of analyzed words with tags.
    @rtype: list.
    """
    list_word = self.text_tokenize(text)
    # neutral default tags, also used when the tagger output is unusable.
    # NOTE(fix): previously list_guessed_tag was left undefined when
    # self.allowTagGuessing was False, raising NameError in mode "all".
    list_guessed_tag = ["nv"] * len(list_word)
    if self.allowTagGuessing:
        guessed_tags = self.tagger.wordTagging(list_word)
        # keep the neutral tags if the two lists differ in length
        if len(guessed_tags) == len(list_word):
            list_guessed_tag = guessed_tags
    # disambiguate some words to speed up the analysis
    if self.allowDisambiguation:
        newWordlist = self.disambiguator.disambiguateWords(list_word, list_guessed_tag)
        # avoid the incomplete list
        if len(newWordlist) == len(list_word):
            list_word = newWordlist
    resulted_data = []
    if mode == "all":
        for i in range(len(list_word[: self.limit])):
            word = list_word[i]
            self.count_word(word)
            if self.allowCacheUse and self.cache.isAlreadyChecked(word):
                # reuse the cached (serialized) analyses
                one_data_list = self.cache.getChecked(word)
                stemmed_one_data_list = [stemmedword.stemmedWord(w) for w in one_data_list]
                resulted_data.append(stemmed_one_data_list)
            else:
                guessedTag = list_guessed_tag[i]
                one_data_list = self.check_word(word, guessedTag)
                stemmed_one_data_list = [stemmedword.stemmedWord(w) for w in one_data_list]
                resulted_data.append(stemmed_one_data_list)
                if self.allowCacheUse:
                    # cache plain dicts, not stemmedWord objects
                    one_data_list_to_serialize = [w.__dict__ for w in one_data_list]
                    self.cache.addChecked(word, one_data_list_to_serialize)
    elif mode == "nouns":
        for word in list_word[: self.limit]:
            one_data_list = self.check_word_as_noun(word)
            resulted_data.append([stemmedword.stemmedWord(w) for w in one_data_list])
    elif mode == "verbs":
        for word in list_word[: self.limit]:
            one_data_list = self.check_word_as_verb(word)
            resulted_data.append([stemmedword.stemmedWord(w) for w in one_data_list])
    return resulted_data
def steming_second_level(self, noun, noun2, procletic, encletic):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemmed from syntactic affixes.
    @type noun2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic: the syntactic suffix extracted in the first stage.
    @type encletic: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated noun into (prefix, stem, suffix) cut points
    list_seg_conj = self.conjStemmer.segment(noun2)
    # verify affix compatibility
    list_seg_conj = self.verify_affix(noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        # seg_conj holds the two cut positions inside noun2
        prefix_conj = noun2[: seg_conj[0]]
        stem_conj = noun2[seg_conj[0] : seg_conj[1]]
        suffix_conj = noun2[seg_conj[1] :]
        affix_conj = prefix_conj + "-" + suffix_conj
        # get all vocalized forms of the (unvocalized) suffix
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["vocalized"]:
            seg_conj_voc = {"prefix": "", "suffix": vocalized_suffix, "stem": stem_conj}
            # verify compatibility between proclitics/enclitics and affix
            if self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix):
                # verify the existing of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                # list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    list_seg_conj = list_seg_conj_voc
    for seg_conj in list_seg_conj:
        prefix_conj = seg_conj["prefix"]
        stem_conj = seg_conj["stem"]
        suffix_conj = seg_conj["suffix"]
        # dual/plural suffix detection (computed but not used below)
        has_plural_suffix = (u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["tags"]) or (
            u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["tags"]
        )
        # print "has_plural", has_plural_suffix;
        affix_conj = "-".join([prefix_conj, suffix_conj])
        # normalize hamza before guessing the different origins
        stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
        if self.debug:
            print "*\t", "-".join([str(len(stem_conj)), prefix_conj, stem_conj, suffix_conj]).encode("utf8")
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list = self.getStemVariants(stem_conj, prefix_conj, suffix_conj)
        if self.debug:
            print "\tpossible original nouns: ", "\t".join(possible_noun_list).encode("utf8")
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in possible_noun_list:
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            infnoun_foundL = self.nounDictionary.lookup(infnoun, "unknown")
            # infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
            ## listsingle=self.find_broken_plural(infnoun);
            ## print ' *****','-'.join(listsingle).encode('utf8')
            if len(infnoun_foundL) > 0:
                if self.debug:
                    print "\t in dict", infnoun.encode("utf8")
            else:
                if self.debug:
                    print infnoun.encode("utf8"), "not found in dictionary"
            infnoun_form_list += infnoun_foundL
        for id in infnoun_form_list:
            noun_tuple = self.nounDictionary.getEntryById(id)
            infnoun = noun_tuple["vocalized"]
            # no extra dictionary tags are collected in this variant
            originalTags = ()
            original = noun_tuple["vocalized"]
            wordtype = noun_tuple["word_type"]
            detailed_result.append(
                stemmedword.stemmedWord(
                    {
                        "word": noun,
                        "procletic": procletic,
                        "encletic": encletic,
                        "prefix": prefix_conj,
                        "suffix": suffix_conj,
                        "stem": stem_conj,
                        "original": infnoun,  # original,
                        "vocalized": self.vocalize(infnoun, procletic, prefix_conj, suffix_conj, encletic),
                        # tags of the proclitic, enclitic and conjugation suffix
                        "tags": u":".join(
                            stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]["tags"]
                            + stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]["tags"]
                            + stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]["tags"]
                        ),
                        "type": u":".join(["Noun", wordtype]),  #'Noun',
                        "root": "",
                        "template": "",
                        "freq": noun_tuple["freq"],  # self.wordfreq.getFreq(infnoun,'noun'),
                        "originaltags": u":".join(originalTags),
                        "syntax": "",
                    }
                )
            )
    return detailed_result
def steming_second_level(self,noun,noun2,procletic,encletic):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemmed from syntactic affixes.
    @type noun2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic: the syntactic suffix extracted in the first stage.
    @type encletic: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    detailed_result=[];
    # segment the conjugated noun into (prefix, stem, suffix) cut points
    list_seg_conj=self.conjStemmer.segment(noun2);
    # verify affix compatibility
    list_seg_conj=self.verify_affix(noun2,list_seg_conj,stem_noun_const.NOMINAL_CONJUGATION_AFFIX);
    # add vocalized forms of suffixes
    list_seg_conj_voc=[];
    for seg_conj in list_seg_conj:
        # seg_conj holds the two cut positions inside noun2
        prefix_conj=noun2[:seg_conj[0]];
        stem_conj=noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj=noun2[seg_conj[1]:]
        affix_conj=prefix_conj+'-'+suffix_conj;
        # get all vocalized forms of the (unvocalized) suffix
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
            # if u'تنوين' not in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']:
            seg_conj_voc={'prefix':'','suffix':vocalized_suffix,'stem':stem_conj}
            # verify compatibility between proclitics/enclitics and affix
            if (self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix)):
                # verify the existing of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                # list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    list_seg_conj=list_seg_conj_voc;
    for seg_conj in list_seg_conj:
        prefix_conj=seg_conj['prefix'];
        stem_conj=seg_conj['stem']
        suffix_conj=seg_conj['suffix']
        # dual/plural suffix detection (only referenced by the
        # commented-out dictionary lookup below)
        has_plural_suffix=((u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or( u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
        #print "has_plural", has_plural_suffix;
        affix_conj='-'.join([prefix_conj,suffix_conj])
        # normalize hamza before guessing the different origins
        stem_conj = araby.normalizeHamza(stem_conj)
        if self.debug:
            print "*\t", "-".join([str(len(stem_conj)),prefix_conj,stem_conj,suffix_conj]).encode("utf8") ;
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list=self.getStemVariants(stem_conj,prefix_conj,suffix_conj);
        if self.debug:
            print "\tpossible original nouns: ","\t".join(possible_noun_list).encode('utf8');
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list=[];
        for infnoun in possible_noun_list:
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            infnoun_foundL=self.nounDictionary.lookup(infnoun);
            #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
            ## listsingle=self.find_broken_plural(infnoun);
            ## print ' *****','-'.join(listsingle).encode('utf8')
            if len(infnoun_foundL)>0:
                if self.debug:
                    print "\t in dict",infnoun.encode('utf8');
            else:
                if self.debug:
                    print infnoun.encode('utf8'),"not found in dictionary"
            infnoun_form_list+=infnoun_foundL;
        for id in infnoun_form_list:
            noun_tuple=self.nounDictionary.getEntryById(id);
            infnoun=noun_tuple['vocalized'];
            # affixes tags contain prefixes and suffixes tags
            affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
            +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
            +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']
            # test if the given word from the dictionary accepts the tags
            # given by the affixes: check affix compatibility with the
            # noun's features, e.g. does the noun accept the feminine marker?
            # if not self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj):
            if self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj):
                # if the result vocalized noun is not the same length
                vocalized=self.vocalize(infnoun,procletic,prefix_conj,suffix_conj,encletic);
                # the noun can have some harakat or shadda, then we must remove all tashkeel and compare
                # vocalized_nm=araby.stripTashkeel(vocalized);
                # noun_nm=araby.stripTashkeel(noun);
                original=noun_tuple['original'];
                wordtype=noun_tuple['wordtype'];
                # add some tags from the dictionary entry such as
                # "mamnou3 min sarf" (diptote) and broken plural
                originalTags=[];
                if noun_tuple['mamnou3_sarf']==u"ممنوع من الصرف":
                    originalTags.append(u"ممنوع من الصرف");
                if noun_tuple['number']==u"جمع تكسير":
                    originalTags.append(u"جمع تكسير");
                # affix_tags+=(,);
                detailed_result.append(stemmedword.stemmedWord({
                    'word':noun,
                    'procletic':procletic,
                    'encletic':encletic,
                    'prefix':prefix_conj,
                    'suffix':suffix_conj,
                    'stem':stem_conj,
                    'original':infnoun,#original,
                    'vocalized':vocalized,
                    'tags':u':'.join(affix_tags),
                    'type':u':'.join(['Noun',wordtype]),#'Noun',
                    'root':'',
                    'template':'',
                    'freq':'freqnoun', # to note the frequency type
                    'originaltags':u':'.join(originalTags),
                    'syntax':'',
                }));
    return detailed_result;
def steming_second_level(self, noun, noun2, procletic, encletic):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemmed from syntactic affixes.
    @type noun2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic: the syntactic suffix extracted in the first stage.
    @type encletic: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated noun into (prefix, stem, suffix) cut points
    list_seg_conj = self.conjStemmer.segment(noun2)
    # verify affix compatibility
    list_seg_conj = self.verify_affix(
        noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        # seg_conj holds the two cut positions inside noun2
        prefix_conj = noun2[:seg_conj[0]]
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj = noun2[seg_conj[1]:]
        affix_conj = prefix_conj + '-' + suffix_conj
        # get all vocalized forms of the (unvocalized) suffix
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                suffix_conj]['vocalized']:
            seg_conj_voc = {
                'prefix': '',
                'suffix': vocalized_suffix,
                'stem': stem_conj
            }
            # verify compatibility between proclitics/enclitics and affix
            if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                  vocalized_suffix)):
                # verify the existing of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                # list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    list_seg_conj = list_seg_conj_voc
    for seg_conj in list_seg_conj:
        prefix_conj = seg_conj['prefix']
        stem_conj = seg_conj['stem']
        suffix_conj = seg_conj['suffix']
        # dual/plural suffix detection (computed but not used below)
        has_plural_suffix = (
            (u"جمع" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or
            (u"مثنى" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
        #print "has_plural", has_plural_suffix;
        affix_conj = '-'.join([prefix_conj, suffix_conj])
        # normalize hamza before guessing the different origins
        stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
        if self.debug:
            print "*\t", "-".join(
                [str(len(stem_conj)), prefix_conj, stem_conj,
                 suffix_conj]).encode("utf8")
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                  suffix_conj)
        if self.debug:
            print "\tpossible original nouns: ", "\t".join(
                possible_noun_list).encode('utf8')
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in possible_noun_list:
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            infnoun_foundL = self.nounDictionary.lookup(
                infnoun, 'unknown')
            #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
            ## listsingle=self.find_broken_plural(infnoun);
            ## print ' *****','-'.join(listsingle).encode('utf8')
            if len(infnoun_foundL) > 0:
                if self.debug:
                    print "\t in dict", infnoun.encode('utf8')
            else:
                if self.debug:
                    print infnoun.encode('utf8'), "not found in dictionary"
            infnoun_form_list += infnoun_foundL
        for id in infnoun_form_list:
            noun_tuple = self.nounDictionary.getEntryById(id)
            infnoun = noun_tuple['vocalized']
            # no extra dictionary tags are collected in this variant
            originalTags = ()
            original = noun_tuple['vocalized']
            wordtype = noun_tuple['word_type']
            detailed_result.append(
                stemmedword.stemmedWord({
                    'word': noun,
                    'procletic': procletic,
                    'encletic': encletic,
                    'prefix': prefix_conj,
                    'suffix': suffix_conj,
                    'stem': stem_conj,
                    'original': infnoun,  #original,
                    'vocalized': self.vocalize(infnoun, procletic,
                                               prefix_conj, suffix_conj,
                                               encletic),
                    # tags of the proclitic, enclitic and conjugation suffix
                    'tags': u':'.join(
                        stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] +
                        stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] +
                        stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                    'type': u':'.join(['Noun', wordtype]),  #'Noun',
                    'root': '',
                    'template': '',
                    'freq': noun_tuple['freq'],  #self.wordfreq.getFreq(infnoun,'noun'),
                    'originaltags': u':'.join(originalTags),
                    'syntax': '',
                }))
    return detailed_result