def lookup(self, normalized):
    """
    Look up all word forms in the dictionary.
    @param normalized: the normalized word.
    @type normalized: unicode.
    @return: list of dictionary entries IDs.
    @rtype: list.
    """
    idList = []
    normword = araby.normalizeHamza(normalized)
    # Bind the word as a query parameter instead of interpolating it into
    # the SQL text: a word containing a quote would previously break the
    # statement (and allow injection).
    # NOTE(review): assumes a DB-API driver with the sqlite3-style '?'
    # paramstyle — confirm against the actual connection class.
    # The table name cannot be bound; it is an internal attribute,
    # not user input.
    sql = u"select id FROM %s WHERE normalized = ?" % self.tableName
    try:
        self.cursor.execute(sql, (normword,))
        if self.cursor:
            for row in self.cursor:
                idList.append(row[0])
        return idList
    except Exception:
        # best effort: a missing/corrupt table simply yields no entries
        return []
def lookup(self, normalized):
    """
    Look up all word forms in the dictionary.
    @param normalized: the normalized word.
    @type normalized: unicode.
    @return: list of dictionary entry rows.
    @rtype: list.
    """
    idList = []
    normword = araby.normalizeHamza(normalized)
    # Bind the word as a query parameter instead of interpolating it into
    # the SQL text: a word containing a quote would previously break the
    # statement (and allow injection).
    # NOTE(review): assumes a DB-API driver with the sqlite3-style '?'
    # paramstyle — confirm against the actual connection class.
    sql = u"select * FROM %s WHERE normalized = ?" % self.tableName
    try:
        self.cursor.execute(sql, (normword,))
        if self.cursor:
            for row in self.cursor:
                idList.append(row)
        return idList
    except Exception:
        # best effort: a missing/corrupt table simply yields no entries
        return []
def verbStamp(self, word):
    """
    Generate a stamp for a verb.
    The verb stamp differs from the generic word stamp by hamza
    normalization; it removes all letters which can change form in the
    word: ALEF, YEH, WAW, ALEF_MAKSURA, SHADDA.
    @param word: the verb to stamp.
    @type word: unicode.
    @return: stamped word.
    """
    # vowels are stripped as part of stamping
    stamped = araby.normalizeHamza(araby.stripTashkeel(word))
    # a leading hamza never survives in the stamp
    if stamped.startswith(araby.HAMZA):
        stamped = stamped[1:]
    # a doubled final letter is counted once
    if stamped[-1:] == stamped[-2:-1]:
        stamped = stamped[:-1]
    return self.VerbSTAMP_pat.sub('', stamped)
def verbStamp(self, word):
    """
    Generate a stamp for a verb.
    The verb stamp differs from the generic word stamp by hamza
    normalization; it removes all letters which can change form in the
    word: ALEF, YEH, WAW, ALEF_MAKSURA, SHADDA.
    @param word: the verb to stamp.
    @type word: unicode.
    @return: stamped word.
    """
    # strip vowels, then unify hamza forms
    base = araby.stripTashkeel(word)
    base = araby.normalizeHamza(base)
    # drop an initial hamza, if any
    if base.startswith(araby.HAMZA):
        base = base[1:]
    # drop the last letter when it repeats the one before it
    last, before_last = base[-1:], base[-2:-1]
    if last == before_last:
        base = base[:-1]
    return self.VerbSTAMP_pat.sub('', base)
def steming_second_level(self, noun, noun2, procletic, encletic):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemed from syntaxic affixes.
    @type noun2: unicode.
    @param procletic: the syntaxic prefixe extracted in the fisrt stage.
    @type procletic: unicode.
    @param encletic: the syntaxic suffixe extracted in the fisrt stage.
    @type encletic: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated noun into (prefix, stem, suffix) offset pairs
    list_seg_conj = self.conjStemmer.segment(noun2)
    # keep only segmentations whose affixes are mutually compatible
    list_seg_conj = self.verify_affix(
        noun2, list_seg_conj,
        stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # expand each segmentation with every vocalized form of its suffix
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        prefix_conj = noun2[:seg_conj[0]]
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj = noun2[seg_conj[1]:]
        affix_conj = prefix_conj + '-' + suffix_conj
        # get all vocalized forms of the (unvocalized) suffix
        for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                suffix_conj]['vocalized']:
            seg_conj_voc = {
                'prefix': '',
                'suffix': vocalized_suffix,
                'stem': stem_conj
            }
            # keep only suffixes compatible with the proclitic/enclitic pair
            if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                  vocalized_suffix)):
                list_seg_conj_voc.append(seg_conj_voc)
    list_seg_conj = list_seg_conj_voc
    for seg_conj in list_seg_conj:
        prefix_conj = seg_conj['prefix']
        stem_conj = seg_conj['stem']
        suffix_conj = seg_conj['suffix']
        # does the suffix mark a plural ("جمع") or dual ("مثنى") form?
        # NOTE(review): computed but only used by commented-out code below
        has_plural_suffix = (
            (u"جمع" in
             stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
            or (u"مثنى" in
                stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
        affix_conj = '-'.join([prefix_conj, suffix_conj])
        # normalize hamza before guessing the different possible origins
        stem_conj = araby.normalizeHamza(stem_conj)
        if self.debug:
            print "*\t", "-".join(
                [str(len(stem_conj)), prefix_conj, stem_conj,
                 suffix_conj]).encode("utf8")
        # generate possible stems:
        # add stripped letters back to the stem to build the candidate list
        possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                  suffix_conj)
        if self.debug:
            print "\tpossible original nouns: ", "\t".join(
                possible_noun_list).encode('utf8')
        # search each candidate noun in the dictionary
        # (the dictionary can also return the tashkeel)
        infnoun_form_list = []
        for infnoun in possible_noun_list:
            # get the noun and all its form ids from the dictionary
            infnoun_foundL = self.nounDictionary.lookup(infnoun)
            if len(infnoun_foundL) > 0:
                if self.debug:
                    print "\t in dict", infnoun.encode('utf8')
            else:
                if self.debug:
                    print infnoun.encode('utf8'), "not found in dictionary"
            infnoun_form_list += infnoun_foundL
        for id in infnoun_form_list:
            noun_tuple = self.nounDictionary.getEntryById(id)
            infnoun = noun_tuple['vocalized']
            # affix tags combine proclitic, enclitic and suffix tags
            affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
              +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
              +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']
            # test whether the dictionary word accepts the tags given by the
            # affixes (e.g. does this noun accept the feminine marker?)
            if self.validateTags(noun_tuple, affix_tags, procletic, encletic,
                                 suffix_conj):
                vocalized = self.vocalize(infnoun, procletic, prefix_conj,
                                          suffix_conj, encletic)
                original = noun_tuple['original']
                wordtype = noun_tuple['wordtype']
                # carry over dictionary-entry tags such as diptote
                # ("mamnou3 min sarf") and broken plural
                originalTags = []
                if noun_tuple['mamnou3_sarf'] == u"ممنوع من الصرف":
                    originalTags.append(u"ممنوع من الصرف")
                if noun_tuple['number'] == u"جمع تكسير":
                    originalTags.append(u"جمع تكسير")
                detailed_result.append(
                    stemmedword.stemmedWord({
                        'word': noun,
                        'procletic': procletic,
                        'encletic': encletic,
                        'prefix': prefix_conj,
                        'suffix': suffix_conj,
                        'stem': stem_conj,
                        'original': infnoun,  # original,
                        'vocalized': vocalized,
                        'tags': u':'.join(affix_tags),
                        'type': u':'.join(['Noun', wordtype]),  # 'Noun',
                        'root': '',
                        'template': '',
                        'freq': 'freqnoun',  # to note the frequency type
                        'originaltags': u':'.join(originalTags),
                        'syntax': '',
                    }))
    return detailed_result
def steming_second_level(self, noun, noun2, procletic, encletic, encletic_nm):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemed from syntaxic affixes.
    @type noun2: unicode.
    @param procletic: the syntaxic prefixe extracted in the fisrt stage.
    @type procletic: unicode.
    @param encletic: the syntaxic suffixe extracted in the fisrt stage.
    @type encletic: unicode.
    @param encletic_nm: the syntaxic suffixe extracted in the fisrt stage
    (not vocalized).
    @type encletic_nm: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated noun into (stem, suffix) boundary offsets
    list_seg_conj = self.conjStemmer.segment(noun2)
    # keep only segmentations whose affixes are mutually compatible
    list_seg_conj = self.verify_affix(
        noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    # and create the real affixes from the word
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj_nm = noun2[seg_conj[1]:]
        # normalize hamza before guessing the different possible origins
        stem_conj = araby.normalizeHamza(stem_conj)
        # generate possible stems:
        # add stripped letters back to the stem to build the candidate list
        possible_noun_list = self.getStemVariants(stem_conj, suffix_conj_nm)
        # look up each candidate noun in the dictionary, via a per-word cache
        infnoun_form_list = []
        for infnoun in set(possible_noun_list):
            if infnoun not in self.CacheDictSearch:
                infnoun_foundL = self.nounDictionary.lookup(infnoun)
                self.CacheDictSearch[infnoun] = self.createDictWord(
                    infnoun_foundL)
            # CONSISTENCY FIX: always extend with the converted entries from
            # createDictWord(). Previously a cache miss extended the list
            # with the raw lookup rows while a cache hit used the converted
            # entries, so the two paths yielded different element types.
            infnoun_form_list.extend(self.CacheDictSearch[infnoun])
        for noun_tuple in infnoun_form_list:
            infnoun = noun_tuple['vocalized']
            # affix tags combine proclitic, unvocalized enclitic and
            # unvocalized suffix tags
            affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
              +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] \
              +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']
            # test whether the dictionary word accepts the tags given by the
            # affixes (e.g. does this noun accept the feminine marker?)
            if self.validateTags(noun_tuple, affix_tags, procletic,
                                 encletic_nm, suffix_conj_nm):
                # enumerate every vocalized form of the enclitic and suffix
                for vocalized_encletic in stem_noun_const.COMP_SUFFIX_LIST_TAGS[
                        encletic_nm]['vocalized']:
                    for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                            suffix_conj_nm]['vocalized']:
                        # verify compatibility between proclitics and affix
                        if (self.is_compatible_proaffix_affix(
                                noun_tuple, procletic, vocalized_encletic,
                                vocalized_suffix)):
                            vocalized, semiVocalized = self.vocalize(
                                infnoun, procletic, vocalized_suffix,
                                vocalized_encletic)
                            vocalized_affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                              +stem_noun_const.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                              +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                            # carry over dictionary-entry tags such as
                            # diptote (mamnou3 min sarf) and broken plural
                            originalTags = []
                            if noun_tuple['mamnou3_sarf'] == u"ممنوع من الصرف":
                                originalTags.append(u"ممنوع من الصرف")
                            if noun_tuple['number'] == u"جمع تكسير":
                                originalTags.append(u"جمع تكسير")
                            detailed_result.append(
                                wordCase.wordCase({
                                    'word': noun,
                                    'affix': (procletic, '', vocalized_suffix,
                                              vocalized_encletic),
                                    'stem': stem_conj,
                                    'original': infnoun,  # original,
                                    'vocalized': vocalized,
                                    'semivocalized': semiVocalized,
                                    'tags': u':'.join(vocalized_affix_tags),
                                    'type': u':'.join(
                                        ['Noun',
                                         noun_tuple['wordtype']]),  # 'Noun',
                                    'freq': 'freqnoun',  # frequency type
                                    'originaltags': u':'.join(originalTags),
                                    'syntax': '',
                                }))
    return detailed_result
def steming_second_level(self, noun, noun2, procletic, encletic, encletic_nm):
    """
    Analyze word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemed from syntaxic affixes.
    @type noun2: unicode.
    @param procletic: the syntaxic prefixe extracted in the fisrt stage.
    @type procletic: unicode.
    @param encletic: the syntaxic suffixe extracted in the fisrt stage.
    @type encletic: unicode.
    @param encletic_nm: the syntaxic suffixe extracted in the fisrt stage
    (not vocalized).
    @type encletic_nm: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated noun into (stem, suffix) boundary offsets
    list_seg_conj = self.conjStemmer.segment(noun2)
    # keep only segmentations whose affixes are mutually compatible
    list_seg_conj = self.verify_affix(
        noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    # and create the real affixes from the word
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj_nm = noun2[seg_conj[1]:]
        # normalize hamza before guessing the different possible origins
        stem_conj = araby.normalizeHamza(stem_conj)
        # generate possible stems:
        # add stripped letters back to the stem to build the candidate list
        possible_noun_list = self.getStemVariants(stem_conj, suffix_conj_nm)
        # look up each candidate noun in the dictionary, via a per-word cache
        infnoun_form_list = []
        for infnoun in set(possible_noun_list):
            if infnoun not in self.CacheDictSearch:
                infnoun_foundL = self.nounDictionary.lookup(infnoun)
                self.CacheDictSearch[infnoun] = self.createDictWord(
                    infnoun_foundL)
            # CONSISTENCY FIX: always extend with the converted entries from
            # createDictWord(). Previously a cache miss extended the list
            # with the raw lookup rows while a cache hit used the converted
            # entries, so the two paths yielded different element types.
            infnoun_form_list.extend(self.CacheDictSearch[infnoun])
        for noun_tuple in infnoun_form_list:
            infnoun = noun_tuple['vocalized']
            # affix tags combine proclitic, unvocalized enclitic and
            # unvocalized suffix tags
            affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
              +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] \
              +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']
            # test whether the dictionary word accepts the tags given by the
            # affixes (e.g. does this noun accept the feminine marker?)
            if self.validateTags(noun_tuple, affix_tags, procletic,
                                 encletic_nm, suffix_conj_nm):
                # enumerate every vocalized form of the enclitic and suffix
                for vocalized_encletic in stem_noun_const.COMP_SUFFIX_LIST_TAGS[
                        encletic_nm]['vocalized']:
                    for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                            suffix_conj_nm]['vocalized']:
                        # verify compatibility between proclitics and affix
                        if (self.is_compatible_proaffix_affix(
                                noun_tuple, procletic, vocalized_encletic,
                                vocalized_suffix)):
                            vocalized, semiVocalized = self.vocalize(
                                infnoun, procletic, vocalized_suffix,
                                vocalized_encletic)
                            vocalized_affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                              +stem_noun_const.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                              +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                            # carry over dictionary-entry tags such as
                            # diptote (mamnou3 min sarf) and broken plural
                            originalTags = []
                            if noun_tuple['mamnou3_sarf'] == u"ممنوع من الصرف":
                                originalTags.append(u"ممنوع من الصرف")
                            if noun_tuple['number'] == u"جمع تكسير":
                                originalTags.append(u"جمع تكسير")
                            detailed_result.append(
                                wordCase.wordCase({
                                    'word': noun,
                                    'affix': (procletic, '', vocalized_suffix,
                                              vocalized_encletic),
                                    'stem': stem_conj,
                                    'original': infnoun,  # original,
                                    'vocalized': vocalized,
                                    'semivocalized': semiVocalized,
                                    'tags': u':'.join(vocalized_affix_tags),
                                    'type': u':'.join(
                                        ['Noun',
                                         noun_tuple['wordtype']]),  # 'Noun',
                                    'freq': 'freqnoun',  # frequency type
                                    'originaltags': u':'.join(originalTags),
                                    'syntax': '',
                                }))
    return detailed_result
def steming_second_level(self,noun,noun2,procletic,encletic): """ Analyze word morphologically by stemming the conjugation affixes. @param noun: the input noun. @type noun: unicode. @param noun2: the noun stemed from syntaxic affixes. @type noun2: unicode. @param procletic: the syntaxic prefixe extracted in the fisrt stage. @type procletic: unicode. @param encletic: the syntaxic suffixe extracted in the fisrt stage. @type encletic: unicode. @return: list of dictionaries of analyzed words with tags. @rtype: list. """ detailed_result=[]; #segment the coinjugated verb list_seg_conj=self.conjStemmer.segment(noun2); # verify affix compatibility list_seg_conj=self.verify_affix(noun2,list_seg_conj,stem_noun_const.NOMINAL_CONJUGATION_AFFIX); # add vocalized forms of suffixes list_seg_conj_voc=[]; for seg_conj in list_seg_conj: prefix_conj=noun2[:seg_conj[0]]; stem_conj=noun2[seg_conj[0]:seg_conj[1]] suffix_conj=noun2[seg_conj[1]:] affix_conj=prefix_conj+'-'+suffix_conj; # get all vocalized form of suffixes for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']: # if u'تنوين' not in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']: seg_conj_voc={'prefix':'','suffix':vocalized_suffix,'stem':stem_conj} # verify compatibility between procletics and afix if (self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix)): # verify the existing of a noun stamp in the dictionary # if self.NOUN_DICTIONARY_STAMP.has_key(stamp): # list_seg_conj2.append(seg_conj) list_seg_conj_voc.append(seg_conj_voc) list_seg_conj=list_seg_conj_voc; for seg_conj in list_seg_conj: prefix_conj=seg_conj['prefix']; stem_conj=seg_conj['stem'] suffix_conj=seg_conj['suffix'] has_plural_suffix=((u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or( u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])) #print "has_plural", has_plural_suffix; affix_conj='-'.join([prefix_conj,suffix_conj]) # noirmalize hamza before 
gessing deffirents origines stem_conj = araby.normalizeHamza(stem_conj) if self.debug: print "*\t", "-".join([str(len(stem_conj)),prefix_conj,stem_conj,suffix_conj]).encode("utf8") ; # generate possible stems # add stripped letters to the stem to constitute possible noun list possible_noun_list=self.getStemVariants(stem_conj,prefix_conj,suffix_conj); if self.debug: print "\tpossible original nouns: ","\t".join(possible_noun_list).encode('utf8'); # search the noun in the dictionary # we can return the tashkeel infnoun_form_list=[]; for infnoun in possible_noun_list: # get the noun and get all its forms from the dict # if the noun has plural suffix, don't look up in broken plural dictionary infnoun_foundL=self.nounDictionary.lookup(infnoun); #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix); ## listsingle=self.find_broken_plural(infnoun); ## print ' *****','-'.join(listsingle).encode('utf8') if len(infnoun_foundL)>0: if self.debug: print "\t in dict",infnoun.encode('utf8'); else: if self.debug: print infnoun.encode('utf8'),"not found in dictionary" infnoun_form_list+=infnoun_foundL; for id in infnoun_form_list: noun_tuple=self.nounDictionary.getEntryById(id); infnoun=noun_tuple['vocalized']; # affixes tags contains prefixes and suffixes tags affix_tags = stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \ +stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \ +stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'] #test if the given word from dictionary accept those tags given by affixes # دراسة توافق الزوائد مع خصائص الاسم، # مثلا هل يقبل الاسم التأنيث. 
# if not self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj): if self.validateTags(noun_tuple, affix_tags, procletic, encletic, suffix_conj): # if the result vocalized noun is not the same length vocalized=self.vocalize(infnoun,procletic,prefix_conj,suffix_conj,encletic); # the noun can have some harakat or shadda, then we must remove all tashkeel and compare # vocalized_nm=araby.stripTashkeel(vocalized); # noun_nm=araby.stripTashkeel(noun); original=noun_tuple['original']; wordtype=noun_tuple['wordtype']; #add some tags from dictionary entry as mamnou3 min sarf and broken plural originalTags=[]; if noun_tuple['mamnou3_sarf']==u"ممنوع من الصرف": originalTags.append(u"ممنوع من الصرف"); if noun_tuple['number']==u"جمع تكسير": originalTags.append(u"جمع تكسير"); # affix_tags+=(,); detailed_result.append(stemmedword.stemmedWord({ 'word':noun, 'procletic':procletic, 'encletic':encletic, 'prefix':prefix_conj, 'suffix':suffix_conj, 'stem':stem_conj, 'original':infnoun,#original, 'vocalized':vocalized, 'tags':u':'.join(affix_tags), 'type':u':'.join(['Noun',wordtype]),#'Noun', 'root':'', 'template':'', 'freq':'freqnoun', # to note the frequency type 'originaltags':u':'.join(originalTags), 'syntax':'', })); return detailed_result;