def stemming_stopword(self, word):
    """
    Analyze the word morphologically as a stop word.

    Each stopword-dictionary hit for the word is turned into one
    analyzed word case carrying the entry's affixes, stem and tags.

    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    analyses = []
    # look the word up directly; the dictionary entries already carry
    # the vocalized (tashkeel) forms, so no stripping is done here
    for entry in self.swDictionary.lookup(word):
        analyses.append(wordCase.wordCase({
            'word': word,
            'affix': (entry['procletic'], '', '', entry['encletic']),
            'stem': entry['stem'],
            'original': entry['original'],
            'vocalized': entry['vocalized'],
            'tags': entry['tags'],
            'type': entry['type'],
            'freq': 'freqstopword',
            'originaltags': entry['tags'],
            'syntax': '',
        }))
    return analyses
def check_word(self, word, guessedTag=""):
    """
    Analyze one word morphologically, trying it in turn as
    punctuation, stop word, verb, noun, and finally as unknown.

    @param word: the input word.
    @type word: unicode.
    @param guessedTag: optional POS tag guessed by the tagger; used to
        restrict which of the verb/noun analyzers run.
    @type guessedTag: unicode.
    @return: list of dictionaries of analyzed words with tags
        (never empty: a single 'unknown' case is returned as fallback).
    @rtype: list.
    """
    word = araby.stripTatweel(word)
    word_vocalised = word
    word_nm = araby.stripTashkeel(word)
    resulted_data = []
    # if word is a punctuation mark (or numeric token)
    resulted_data += self.check_word_as_pounct(word_nm)
    # Done: if the word is a stop word we have some problems,
    # the stop word can also be another normal word (verb or noun),
    # we must consider it in future works
    # if word is a stop word, allow stop word analysis
    resulted_data += self.check_word_as_stopword(word_nm)
    # if word may be a verb
    # NB: some stop words can also be verbs or nouns, so a stop word
    # tag triggers both verb and noun analysis as well
    if self.tagger.hasVerbTag(guessedTag) or \
       self.tagger.isStopWordTag(guessedTag):
        resulted_data += self.check_word_as_verb(word_nm)
    # if word may be a noun
    if self.tagger.hasNounTag(guessedTag) or \
       self.tagger.isStopWordTag(guessedTag):
        resulted_data += self.check_word_as_noun(word_nm)
    if not resulted_data:
        # nothing matched: analyze the word as unknown
        resulted_data += self.check_word_as_unknown(word_nm)
    # check if the word is normalized and solutions are equivalent
    resulted_data = self.check_normalized(word_vocalised, resulted_data)
    # check if the word is shadda like
    resulted_data = self.check_shadda(word_vocalised, resulted_data)
    # check if the word is vocalized like results
    if self.partial_vocalization_support:
        resulted_data = self.check_partial_vocalized(word_vocalised,
                                                     resulted_data)
    # add word frequency information in tags
    resulted_data = self.addWordFrequency(resulted_data)
    if not resulted_data:
        # guarantee at least one analysis so callers always get a case
        resulted_data.append(wordCase.wordCase({
            'word': word,
            'affix': ('', '', '', ''),
            'stem': '',
            'original': word,
            'vocalized': word,
            'tags': u'',
            'type': 'unknown',
            'root': '',
            'template': '',
            'freq': self.wordfreq.getFreq(word, 'unknown'),
            'syntax': '',
        }))
    return resulted_data
def check_word_as_pounct(self, word):
    """
    Check if the word is a punctuation mark or a numeric token.

    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # fields shared by both the NUMBER and POUNCT cases
    base = {
        'word': word,
        'affix': ('', '', '', ''),
        'stem': '',
        'original': word,
        'vocalized': word,
        'freq': 0,
        'syntax': '',
    }
    # ToDo : fix it to isdigit, by moatz saad
    if word.isnumeric():
        number_case = dict(base)
        number_case['tags'] = self.get_number_tags(word)
        number_case['type'] = 'NUMBER'
        detailed_result.append(wordCase.wordCase(number_case))
    if word in stem_pounct_const.POUNCTUATION:
        pounct_case = dict(base)
        pounct_case['tags'] = stem_pounct_const.POUNCTUATION[word]['tags']
        pounct_case['type'] = 'POUNCT'
        detailed_result.append(wordCase.wordCase(pounct_case))
    return detailed_result
def check_word_as_pounct(self, word):
    """
    Check if the word is a punctuation mark or a numeric token and
    build the corresponding analyzed cases.

    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    def _make_case(tags, word_type):
        # one analyzed case; only tags and type differ between kinds
        return wordCase.wordCase(
            {
                "word": word,
                "affix": ("", "", "", ""),
                "stem": "",
                "original": word,
                "vocalized": word,
                "tags": tags,
                "type": word_type,
                "freq": 0,
                "syntax": "",
            }
        )

    results = []
    # ToDo : fix it to isdigit, by moatz saad
    if word.isnumeric():
        results.append(_make_case(self.get_number_tags(word), "NUMBER"))
    if word in stem_pounct_const.POUNCTUATION:
        results.append(
            _make_case(stem_pounct_const.POUNCTUATION[word]["tags"],
                       "POUNCT"))
    return results
def check_word_as_pounct(self, word):
    """
    Check whether the word is a punctuation mark or a numeric token.

    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    cases = []
    # ToDo : fix it to isdigit, by moatz saad
    if word.isnumeric():
        cases.append(wordCase.wordCase({
            'word': word,
            'affix': ('', '', '', ''),
            'stem': '',
            'original': word,
            'vocalized': word,
            'tags': self.get_number_tags(word),
            'type': 'NUMBER',
            'freq': 0,
            'syntax': '',
        }))
    if word in stem_pounct_const.POUNCTUATION:
        punct_tags = stem_pounct_const.POUNCTUATION[word]['tags']
        cases.append(wordCase.wordCase({
            'word': word,
            'affix': ('', '', '', ''),
            'stem': '',
            'original': word,
            'vocalized': word,
            'tags': punct_tags,
            'type': 'POUNCT',
            'freq': 0,
            'syntax': '',
        }))
    return cases
def stemming_stopword(self, word):
    """
    Analyze the word morphologically as a stop word.

    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    # one analyzed case per stopword-dictionary hit; entries already
    # carry vocalized forms, so the word is looked up as given
    return [
        wordCase.wordCase(
            {
                "word": word,
                "affix": (hit["procletic"], "", "", hit["encletic"]),
                "stem": hit["stem"],
                "original": hit["original"],
                "vocalized": hit["vocalized"],
                "tags": hit["tags"],
                "type": hit["type"],
                "freq": "freqstopword",
                "originaltags": hit["tags"],
                "syntax": "",
            }
        )
        for hit in self.swDictionary.lookup(word)
    ]
def steming_second_level(self, noun, noun2, procletic, encletic):
    """ Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemed from syntaxic affixes.
    @type noun2: unicode.
    @param procletic: the syntaxic prefixe extracted in the fisrt stage.
    @type procletic: unicode.
    @param encletic: the syntaxic suffixe extracted in the fisrt stage.
    @type encletic: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    #segment the coinjugated verb
    list_seg_conj = self.conjStemmer.segment(noun2)
    # verify affix compatibility
    list_seg_conj = self.verify_affix(
        noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        # segmentation indices: [prefix | stem | suffix]
        prefix_conj = noun2[:seg_conj[0]]
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj = noun2[seg_conj[1]:]
        affix_conj = prefix_conj + '-' + suffix_conj
        # get all vocalized form of suffixes
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                suffix_conj]['vocalized']:
            seg_conj_voc = {
                'prefix': '',
                'suffix': vocalized_suffix,
                'stem': stem_conj
            }
            # verify compatibility between procletics and afix
            if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                  vocalized_suffix)):
                # verify the existing of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                # list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    # from here on, work on the vocalized segmentations only
    list_seg_conj = list_seg_conj_voc
    for seg_conj in list_seg_conj:
        prefix_conj = seg_conj['prefix']
        stem_conj = seg_conj['stem']
        suffix_conj = seg_conj['suffix']
        # the suffix tags tell us whether this is a plural/dual form
        # (tags are Arabic: "plural" / "dual")
        has_plural_suffix = (
            (u"جمع" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
            or
            (u"مثنى" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
        )
        #print "has_plural", has_plural_suffix;
        affix_conj = '-'.join([prefix_conj, suffix_conj])
        # noirmalize hamza before gessing deffirents origines
        stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
        if self.debug:
            print "*\t", "-".join(
                [str(len(stem_conj)), prefix_conj, stem_conj,
                 suffix_conj]).encode("utf8")
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                  suffix_conj)
        if self.debug:
            print "\tpossible original nouns: ", "\t".join(
                possible_noun_list).encode('utf8')
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in possible_noun_list:
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            infnoun_foundL = self.nounDictionary.lookup(
                infnoun, 'unknown')
            #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
            ## listsingle=self.find_broken_plural(infnoun);
            ## print ' *****','-'.join(listsingle).encode('utf8')
            if len(infnoun_foundL) > 0:
                if self.debug:
                    print "\t in dict", infnoun.encode('utf8')
            else:
                if self.debug:
                    print infnoun.encode('utf8'), "not found in dictionary"
            infnoun_form_list += infnoun_foundL
        # build one analyzed case per dictionary form found
        for noun_tuple in infnoun_form_list:
            # noun_tuple=self.nounDictionary.getEntryById(id);
            infnoun = noun_tuple['vocalized']
            # NOTE(review): originalTags is always empty here, so the
            # 'originaltags' field below is always '' — confirm intent
            originalTags = ()
            original = noun_tuple['vocalized']
            wordtype = noun_tuple['word_type']
            detailed_result.append(
                wordCase.wordCase({
                    'word': noun,
                    'affix': (procletic, prefix_conj, suffix_conj,
                              encletic),
                    #~ 'procletic':procletic,
                    #~ 'encletic':encletic,
                    #~ 'prefix':prefix_conj,
                    #~ 'suffix':suffix_conj,
                    'stem': stem_conj,
                    'original': infnoun,  #original,
                    'vocalized': self.vocalize(infnoun, procletic,
                                               prefix_conj, suffix_conj,
                                               encletic),
                    'tags': u':'.join(stem_noun_const.
                                      COMP_PREFIX_LIST_TAGS[procletic]['tags'] +
                                      stem_noun_const.
                                      COMP_SUFFIX_LIST_TAGS[encletic]['tags'] +
                                      stem_noun_const.
                                      CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                    'type': u':'.join(['Noun', wordtype]),  #'Noun',
                    #~ 'root':'',
                    #~ 'template':'',
                    'freq': noun_tuple[
                        'freq'],  #self.wordfreq.getFreq(infnoun,'noun'),
                    'originaltags': u':'.join(originalTags),
                    'syntax': '',
                }))
    return detailed_result
def stemming_verb(self, verb):
    """
    Analyze the word morphologically as a verb.

    First-level stemming strips proclitics/enclitics, second-level
    stemming strips conjugation affixes, then candidate infinitives
    are looked up by stamp in the verb dictionary and validated by
    re-conjugation.

    @param verb: the input word.
    @type verb: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    list_found = [];
    # NOTE(review): display_conj_result is never used below — dead local
    display_conj_result=False;
    detailed_result = [];
    verb = verb.strip();
    verb_list = [verb];
    # a leading alef madda may stand for hamza+alef spellings:
    # add both expansions as alternative forms to analyze
    if verb.startswith(araby.ALEF_MADDA):
        verb_list.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE+verb[1:])
        verb_list.append(araby.HAMZA+araby.ALEF+verb[1:])
    for verb in verb_list:
        # first-level segmentation: proclitic | stem | enclitic
        list_seg_comp=self.compStemmer.segment(verb);
        for seg in list_seg_comp:
            procletic=verb[:seg[0]];
            stem=verb[seg[0]:seg[1]]
            encletic=verb[seg[1]:]
            secondsuffix=u'';
            # case of a doubly transitive verb (two object pronouns):
            # split the compound enclitic into first and second suffixes
            if stem_verb_const.TableDoubleTransitiveSuffix.has_key(encletic ):
                firstsuffix=stem_verb_const.TableDoubleTransitiveSuffix[encletic]['first'];
                secondsuffix=stem_verb_const.TableDoubleTransitiveSuffix[encletic]['second'];
                encletic=firstsuffix;
            affix = u'-'.join([procletic, encletic])
            #if self.debug: print "\t", "-".join([procletic, stem, encletic]).encode("utf8") ;
            # ajusting verbs variant
            list_stem=[stem];
            if encletic: #!="":
                # an enclitic (object pronoun) implies transitivity;
                # add orthographic variants of the stem ending
                transitive=True;
                if stem.endswith(araby.TEH + araby.MEEM + araby.WAW):
                    list_stem.append(stem[:-1]);
                elif stem.endswith(araby.WAW):
                    list_stem.append(stem+ araby.ALEF);
                elif stem.endswith( araby.ALEF):
                    list_stem.append(stem[:-1]+ araby.ALEF_MAKSURA);
            else:
                transitive=False;
            if verb.startswith(araby.ALEF_MADDA):
                # the word starts with alef madda
                list_stem.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE+verb[1:])
                list_stem.append(araby.HAMZA+ araby.ALEF+verb[1:])
            # stem reduced verb : level two
            result=[];
            for verb2 in list_stem:
                #segment the coinjugated verb
                list_seg_conj=self.conjStemmer.segment(verb2);
                # verify affix compatibility
                list_seg_conj = self.verify_affix(verb2, list_seg_conj, stem_verb_const.VERBAL_CONJUGATION_AFFIX);
                # verify procletics and enclitecs
                # verify length pof stem
                list_seg_conj2=[];
                for seg_conj in list_seg_conj:
                    # Arabic verb stems are at most 6 letters long
                    if (seg_conj[1] - seg_conj[0])<=6 :
                        prefix_conj = verb2[:seg_conj[0]];
                        stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                        suffix_conj = verb2[seg_conj[1]:]
                        affix_conj = prefix_conj+'-'+suffix_conj;
                        # verify compatibility between procletics and afix
                        if (self.is_compatible_proaffix_affix(procletic, encletic, affix_conj)):
                            # verify the existing of a verb stamp in the dictionary
                            if self.verbDictionary.existsAsStamp(stem_conj):
                                list_seg_conj2.append(seg_conj)
                list_seg_conj = list_seg_conj2;
                list_correct_conj = [];
                for seg_conj in list_seg_conj:
                    prefix_conj = verb2[:seg_conj[0]];
                    stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                    suffix_conj = verb2[seg_conj[1]:]
                    affix_conj = '-'.join([prefix_conj, suffix_conj])
                    # search the verb in the dictionary by stamp
                    # if the verb exists in dictionary,
                    # The transitivity is consedered
                    # if is trilateral return its forms and Tashkeel
                    # if not return forms without tashkeel, because the conjugator can vocalized it,
                    # we can return the tashkeel if we don't need the conjugation step
                    infverb_dict=self.getInfinitiveVerbByStem(stem_conj, transitive);
                    infverb_dict = self.verifyInfinitiveVerbs(stem_conj, infverb_dict);
                    for item in infverb_dict:
                        #The haraka from is given from the dict
                        inf_verb = item['verb'];
                        haraka = item['haraka'];
                        transtag = item['transitive'] #=='y'or not item['transitive']);
                        # NOTE(review): this is True unless item['transitive']
                        # is truthy and != 'y' — looks suspicious; confirm
                        transitive = (item['transitive']=='y'or not item['transitive']);
                        originalTags = transtag;
                        # dict tag is used to mention word dictionary tags: the original word tags like transitive attribute
                        unstemed_verb= verb2;
                        # conjugation step
                        # ToDo, conjugate the verb with affix,
                        # if exists one verb which match, return it
                        # conjugate the verb with the affixes;
                        # if a conjugation matches the produced word
                        # the result is kept
                        onelist_correct_conj = [];
                        onelist_correct_conj = self.generate_possible_conjug(inf_verb, unstemed_verb, affix_conj, haraka, procletic, encletic, transitive);
                        if len(onelist_correct_conj)>0:
                            list_correct_conj+=onelist_correct_conj;
                # if not list_correct_conj : print "No Verb Found ";
                for conj in list_correct_conj:
                    result.append(conj['verb'])
                    detailed_result.append(wordCase.wordCase({
                        'word':verb,
                        'affix': ( procletic, prefix_conj, suffix_conj, encletic),
                        #~ 'procletic':procletic,
                        #~ 'encletic':encletic,
                        #~ 'prefix':prefix_conj,
                        #~ 'suffix':suffix_conj,
                        'stem':stem_conj,
                        'original':conj['verb'],
                        'vocalized':self.vocalize(conj['vocalized'], procletic, encletic),
                        'tags':u':'.join((conj['tense'], conj['pronoun'])+stem_verb_const.COMP_PREFIX_LIST_TAGS[procletic]['tags']+stem_verb_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags']),
                        'type':'Verb',
                        #~ 'root':'',
                        #~ 'template':'',
                        'freq':'freqverb',
                        'originaltags':originalTags,
                        'syntax':'',
                        }));
            ## result+=detect_arabic_verb(verb2, transitive, prefix_conj, suffix_conj, debug);
            list_found+=result;
    # NOTE(review): list_found is deduplicated but never returned or
    # used afterwards — dead computation, confirm before removing
    list_found=set(list_found);
    return detailed_result
def steming_second_level(self,noun,noun2,procletic,encletic):
    """ Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemed from syntaxic affixes.
    @type noun2: unicode.
    @param procletic: the syntaxic prefixe extracted in the fisrt stage.
    @type procletic: unicode.
    @param encletic: the syntaxic suffixe extracted in the fisrt stage.
    @type encletic: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result=[];
    #segment the coinjugated verb
    list_seg_conj=self.conjStemmer.segment(noun2);
    # verify affix compatibility
    list_seg_conj=self.verify_affix(noun2,list_seg_conj,stem_noun_const.NOMINAL_CONJUGATION_AFFIX);
    # add vocalized forms of suffixes
    list_seg_conj_voc=[];
    for seg_conj in list_seg_conj:
        # segmentation indices split noun2 as [prefix | stem | suffix]
        prefix_conj=noun2[:seg_conj[0]];
        stem_conj=noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj=noun2[seg_conj[1]:]
        affix_conj=prefix_conj+'-'+suffix_conj;
        # get all vocalized form of suffixes
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
            seg_conj_voc={'prefix':'','suffix':vocalized_suffix,'stem':stem_conj}
            # verify compatibility between procletics and afix
            if (self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix)):
                # verify the existing of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                # list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    # continue with the vocalized segmentations only
    list_seg_conj=list_seg_conj_voc;
    for seg_conj in list_seg_conj:
        prefix_conj=seg_conj['prefix'];
        stem_conj=seg_conj['stem']
        suffix_conj=seg_conj['suffix']
        # the Arabic suffix tags mean "plural" / "dual"
        has_plural_suffix=((u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or( u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
        #print "has_plural", has_plural_suffix;
        affix_conj='-'.join([prefix_conj,suffix_conj])
        # noirmalize hamza before gessing deffirents origines
        stem_conj=tashaphyne.normalize.normalize_hamza(stem_conj)
        if self.debug: print "*\t", "-".join([str(len(stem_conj)),prefix_conj,stem_conj,suffix_conj]).encode("utf8") ;
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list=self.getStemVariants(stem_conj,prefix_conj,suffix_conj);
        if self.debug: print "\tpossible original nouns: ","\t".join(possible_noun_list).encode('utf8');
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list=[];
        for infnoun in possible_noun_list:
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            infnoun_foundL=self.nounDictionary.lookup(infnoun,'unknown');
            #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
            ## listsingle=self.find_broken_plural(infnoun);
            ## print ' *****','-'.join(listsingle).encode('utf8')
            if len(infnoun_foundL)>0:
                if self.debug: print "\t in dict",infnoun.encode('utf8');
            else:
                if self.debug: print infnoun.encode('utf8'),"not found in dictionary"
            infnoun_form_list+=infnoun_foundL;
        # build one analyzed case per dictionary form found
        for noun_tuple in infnoun_form_list:
            # noun_tuple=self.nounDictionary.getEntryById(id);
            infnoun=noun_tuple['vocalized'];
            # NOTE(review): originalTags stays empty, so 'originaltags'
            # below is always the empty string — confirm intent
            originalTags=()
            original=noun_tuple['vocalized'];
            wordtype=noun_tuple['word_type'];
            detailed_result.append(wordCase.wordCase({
                'word':noun,
                'affix': ( procletic, prefix_conj, suffix_conj, encletic),
                #~ 'procletic':procletic,
                #~ 'encletic':encletic,
                #~ 'prefix':prefix_conj,
                #~ 'suffix':suffix_conj,
                'stem':stem_conj,
                'original':infnoun,#original,
                'vocalized':self.vocalize(infnoun,procletic,prefix_conj,suffix_conj,encletic),
                'tags':u':'.join(stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags']+stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags']+stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                'type':u':'.join(['Noun',wordtype]),#'Noun',
                #~ 'root':'',
                #~ 'template':'',
                'freq':noun_tuple['freq'],#self.wordfreq.getFreq(infnoun,'noun'),
                'originaltags':u':'.join(originalTags),
                'syntax':'',
                }));
    return detailed_result;
def steming_second_level(self, noun, noun2, procletic, encletic, encletic_nm):
    """ Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemed from syntaxic affixes.
    @type noun2: unicode.
    @param procletic: the syntaxic prefixe extracted in the fisrt stage.
    @type procletic: unicode.
    @param encletic: the syntaxic suffixe extracted in the fisrt stage.
    @type encletic: unicode.
    @param encletic_nm: the syntaxic suffixe extracted in the fisrt stage (not vocalized).
    @type encletic_nm: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result=[];
    #segment the coinjugated verb
    list_seg_conj = self.conjStemmer.segment(noun2);
    # verify affix compatibility
    list_seg_conj = self.verify_affix(noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX);
    # add vocalized forms of suffixes
    # and create the real affixes from the word
    list_seg_conj_voc=[];
    for seg_conj in list_seg_conj:
        # segmentation indices split noun2 as [stem | suffix]
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj_nm = noun2[seg_conj[1]:]
        # noirmalize hamza before gessing differents origines
        stem_conj = araby.normalizeHamza(stem_conj)
        # generate possible stems
        # add stripped letters to the stem to constitute possible noun list
        possible_noun_list=self.getStemVariants(stem_conj, suffix_conj_nm);
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list=[];
        for infnoun in set(possible_noun_list):
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in broken plural dictionary
            # dictionary lookups are memoized in self.CacheDictSearch
            if not self.CacheDictSearch.has_key(infnoun):
                infnoun_foundL = self.nounDictionary.lookup(infnoun);
                self.CacheDictSearch[infnoun] = self.createDictWord(infnoun_foundL);
            else:
                infnoun_foundL = self.CacheDictSearch[infnoun] ;
            infnoun_form_list.extend(infnoun_foundL);
        #print "len loooked up noun in dictionnary ",len(infnoun_form_list), len(set(infnoun_form_list));
        for noun_tuple in infnoun_form_list:
            # noun_tuple=self.nounDictionary.getEntryById(id);
            infnoun = noun_tuple['vocalized'];
            # affixes tags contains prefixes and suffixes tags
            affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] \
                +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']
            #test if the given word from dictionary accept those tags given by affixes
            # i.e. check affix compatibility with the noun's features,
            # e.g. whether the noun accepts the feminine marker.
            if self.validateTags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_conj_nm):
                ## get all vocalized form of suffixes
                for vocalized_encletic in stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic_nm]['vocalized']:
                    for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['vocalized']:
                        ## verify compatibility between procletics and affix
                        if (self.is_compatible_proaffix_affix(noun_tuple, procletic, vocalized_encletic, vocalized_suffix)):
                            # build both fully and semi vocalized output forms
                            vocalized, semiVocalized = self.vocalize(infnoun, procletic, vocalized_suffix, vocalized_encletic);
                            vocalized_affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                                +stem_noun_const.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                                +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                            #add some tags from dictionary entry as mamnou3 min sarf and broken plural
                            originalTags=[];
                            if noun_tuple['mamnou3_sarf']==u"ممنوع من الصرف":
                                originalTags.append(u"ممنوع من الصرف");
                            if noun_tuple['number']==u"جمع تكسير":
                                originalTags.append(u"جمع تكسير");
                            # affix_tags+=(, );
                            detailed_result.append(wordCase.wordCase({
                                'word':noun,
                                #~ 'affix': analex_const.AffixTuple((procletic=procletic, encletic=vocalized_encletic, prefix='', suffix=vocalized_suffix)),
                                'affix': (procletic, '', vocalized_suffix, vocalized_encletic),
                                #~ 'procletic': ,
                                #~ 'encletic': ,
                                #~ 'prefix': '',
                                #~ 'suffix': vocalized_suffix,
                                'stem': stem_conj,
                                'original': infnoun, #original,
                                'vocalized': vocalized,
                                'semivocalized':semiVocalized,
                                'tags': u':'.join(vocalized_affix_tags),
                                'type': u':'.join(['Noun', noun_tuple['wordtype']]), #'Noun',
                                #~ 'root': '',
                                #~ 'template': '',
                                'freq':'freqnoun', # to note the frequency type
                                'originaltags':u':'.join(originalTags),
                                'syntax':'',
                                }));
    return detailed_result;
def stemming_verb(self, verb):
    """
    Analyze the word morphologically as a verb.

    Strips proclitics/enclitics (level one), then conjugation affixes
    (level two), looks candidate infinitives up by stamp in the verb
    dictionary, and validates each candidate by re-conjugation.

    @param verb: the input word.
    @type verb: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    list_found = []
    # NOTE(review): display_conj_result is never used below — dead local
    display_conj_result = False
    detailed_result = []
    verb = verb.strip()
    verb_list = [verb]
    # a leading alef madda may stand for hamza+alef spellings:
    # add both expansions as alternative forms to analyze
    if verb.startswith(araby.ALEF_MADDA):
        verb_list.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE +
                         verb[1:])
        verb_list.append(araby.HAMZA + araby.ALEF + verb[1:])
    for verb in verb_list:
        # first-level segmentation: proclitic | stem | enclitic
        list_seg_comp = self.compStemmer.segment(verb)
        for seg in list_seg_comp:
            procletic = verb[:seg[0]]
            stem = verb[seg[0]:seg[1]]
            encletic = verb[seg[1]:]
            secondsuffix = u''
            # case of a doubly transitive verb (two object pronouns):
            # split the compound enclitic into first and second suffixes
            if stem_verb_const.TableDoubleTransitiveSuffix.has_key(
                    encletic):
                firstsuffix = stem_verb_const.TableDoubleTransitiveSuffix[
                    encletic]['first']
                secondsuffix = stem_verb_const.TableDoubleTransitiveSuffix[
                    encletic]['second']
                encletic = firstsuffix
            affix = u'-'.join([procletic, encletic])
            #if self.debug: print "\t", "-".join([procletic, stem, encletic]).encode("utf8") ;
            # ajusting verbs variant
            list_stem = [stem]
            if encletic:  #!="":
                # an enclitic (object pronoun) implies transitivity;
                # add orthographic variants of the stem ending
                transitive = True
                if stem.endswith(araby.TEH + araby.MEEM + araby.WAW):
                    list_stem.append(stem[:-1])
                elif stem.endswith(araby.WAW):
                    list_stem.append(stem + araby.ALEF)
                elif stem.endswith(araby.ALEF):
                    list_stem.append(stem[:-1] + araby.ALEF_MAKSURA)
            else:
                transitive = False
            if verb.startswith(araby.ALEF_MADDA):
                # the word starts with alef madda
                list_stem.append(araby.ALEF_HAMZA_ABOVE +
                                 araby.ALEF_HAMZA_ABOVE + verb[1:])
                list_stem.append(araby.HAMZA + araby.ALEF + verb[1:])
            # stem reduced verb : level two
            result = []
            for verb2 in list_stem:
                #segment the coinjugated verb
                list_seg_conj = self.conjStemmer.segment(verb2)
                # verify affix compatibility
                list_seg_conj = self.verify_affix(
                    verb2, list_seg_conj,
                    stem_verb_const.VERBAL_CONJUGATION_AFFIX)
                # verify procletics and enclitecs
                # verify length pof stem
                list_seg_conj2 = []
                for seg_conj in list_seg_conj:
                    # Arabic verb stems are at most 6 letters long
                    if (seg_conj[1] - seg_conj[0]) <= 6:
                        prefix_conj = verb2[:seg_conj[0]]
                        stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                        suffix_conj = verb2[seg_conj[1]:]
                        affix_conj = prefix_conj + '-' + suffix_conj
                        # verify compatibility between procletics and afix
                        if (self.is_compatible_proaffix_affix(
                                procletic, encletic, affix_conj)):
                            # verify the existing of a verb stamp in the dictionary
                            if self.verbDictionary.existsAsStamp(
                                    stem_conj):
                                list_seg_conj2.append(seg_conj)
                list_seg_conj = list_seg_conj2
                list_correct_conj = []
                for seg_conj in list_seg_conj:
                    prefix_conj = verb2[:seg_conj[0]]
                    stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                    suffix_conj = verb2[seg_conj[1]:]
                    affix_conj = '-'.join([prefix_conj, suffix_conj])
                    # search the verb in the dictionary by stamp
                    # if the verb exists in dictionary,
                    # The transitivity is consedered
                    # if is trilateral return its forms and Tashkeel
                    # if not return forms without tashkeel, because the conjugator can vocalized it,
                    # we can return the tashkeel if we don't need the conjugation step
                    infverb_dict = self.getInfinitiveVerbByStem(
                        stem_conj, transitive)
                    infverb_dict = self.verifyInfinitiveVerbs(
                        stem_conj, infverb_dict)
                    for item in infverb_dict:
                        #The haraka from is given from the dict
                        inf_verb = item['verb']
                        haraka = item['haraka']
                        transtag = item[
                            'transitive']  #=='y'or not item['transitive']);
                        # NOTE(review): this is True unless item['transitive']
                        # is truthy and != 'y' — looks suspicious; confirm
                        transitive = (item['transitive'] == 'y'
                                      or not item['transitive'])
                        originalTags = transtag
                        # dict tag is used to mention word dictionary tags: the original word tags like transitive attribute
                        unstemed_verb = verb2
                        # conjugation step
                        # ToDo, conjugate the verb with affix,
                        # if exists one verb which match, return it
                        # conjugate the verb with the affixes;
                        # if a conjugation matches the produced word
                        # the result is kept
                        onelist_correct_conj = []
                        onelist_correct_conj = self.generate_possible_conjug(
                            inf_verb, unstemed_verb, affix_conj, haraka,
                            procletic, encletic, transitive)
                        if len(onelist_correct_conj) > 0:
                            list_correct_conj += onelist_correct_conj
                # if not list_correct_conj : print "No Verb Found ";
                for conj in list_correct_conj:
                    result.append(conj['verb'])
                    detailed_result.append(
                        wordCase.wordCase({
                            'word': verb,
                            'affix': (procletic, prefix_conj, suffix_conj,
                                      encletic),
                            #~ 'procletic':procletic,
                            #~ 'encletic':encletic,
                            #~ 'prefix':prefix_conj,
                            #~ 'suffix':suffix_conj,
                            'stem': stem_conj,
                            'original': conj['verb'],
                            'vocalized': self.vocalize(
                                conj['vocalized'], procletic, encletic),
                            'tags': u':'.join(
                                (conj['tense'], conj['pronoun']) +
                                stem_verb_const.
                                COMP_PREFIX_LIST_TAGS[procletic]['tags'] +
                                stem_verb_const.
                                COMP_SUFFIX_LIST_TAGS[encletic]['tags']),
                            'type': 'Verb',
                            #~ 'root':'',
                            #~ 'template':'',
                            'freq': 'freqverb',
                            'originaltags': originalTags,
                            'syntax': '',
                        }))
            ## result+=detect_arabic_verb(verb2, transitive, prefix_conj, suffix_conj, debug);
            list_found += result
    # NOTE(review): list_found is deduplicated but never returned or
    # used afterwards — dead computation, confirm before removing
    list_found = set(list_found)
    return detailed_result