def check_word_as_pounct(self, word):
    """
    Check if the word is a punctuation mark.
    @param word: the input word.
    @type word: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    if not word:
        return detailed_result
    # ToDo: fix it to isdigit, by moatz saad
    if word.isnumeric():
        detailed_result.append(
            wordcase.WordCase({
                'word': word,
                'affix': ('', '', '', ''),
                'stem': '',
                'original': word,
                'vocalized': word,
                'tags': u"عدد",
                'type': 'NUMBER',
                'freq': 0,
                'syntax': '',
                'root': '',
            }))
    # test if all chars in the word are punctuation
    for char in word:
        # if one char is not a punctuation mark, break
        if char not in stem_pounct_const.POUNCTUATION:
            break
    else:
        # if all chars are punctuation, the word takes the tags of its first char
        detailed_result.append(
            wordcase.WordCase({
                'word': word,
                'affix': ('', '', '', ''),
                'stem': '',
                'original': word,
                'vocalized': word,
                'tags': stem_pounct_const.POUNCTUATION[word[0]]['tags'],
                'type': 'POUNCT',
                'freq': 0,
                'syntax': '',
                'root': '',
            }))
    return detailed_result
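# Usage sketch (illustrative, assuming a constructed analyzer instance named
# `analyzer`; returned WordCase fields are trimmed here):
# >>> analyzer.check_word_as_pounct(u"؟")    # all chars are punctuation
# [WordCase({'word': u'؟', 'type': 'POUNCT', ...})]
# >>> analyzer.check_word_as_pounct(u"123")  # numeric input is tagged NUMBER
# [WordCase({'word': u'123', 'type': 'NUMBER', 'tags': u'عدد', ...})]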
def stemming_noun(self, noun):
    """
    Analyze a word morphologically as a noun.
    @param noun: the input noun.
    @type noun: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    detailed_result.append(wordcase.WordCase({
        'word': noun,
        'affix': "",
        'stem': noun,
        'original': noun,
        'type': u'unknown',
    }))
    return detailed_result
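# Minimal sketch of the stub's behavior, assuming `analyzer` is an instance
# of this class (names illustrative): every input yields a single case that
# echoes the word with type 'unknown'.
# >>> analyzer.stemming_noun(u"كتاب")
# [WordCase({'word': u'كتاب', 'stem': u'كتاب', 'type': u'unknown', ...})]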
def steming_second_level(self, noun, noun2, procletic, encletic):
    """
    Analyze a word morphologically by stemming the conjugation affixes.
    @param noun: the input noun.
    @type noun: unicode.
    @param noun2: the noun stemmed from syntactic affixes.
    @type noun2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic: the syntactic suffix extracted in the first stage.
    @type encletic: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated verb
    list_seg_conj = self.conj_stemmer.segment(noun2)
    # verify affix compatibility
    list_seg_conj = verify_affix(noun2, list_seg_conj,
                                 snconst.NOMINAL_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        prefix_conj = noun2[:seg_conj[0]]
        stem_conj = noun2[seg_conj[0]:seg_conj[1]]
        suffix_conj = noun2[seg_conj[1]:]
        #~affix_conj = prefix_conj+'-'+suffix_conj
        # get all vocalized forms of suffixes
        for vocalized_suffix in \
                snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
            seg_conj_voc = {
                'prefix': '',
                'suffix': vocalized_suffix,
                'stem': stem_conj
            }
            # verify compatibility between proclitics and affix
            if (is_compatible_proaffix_affix(procletic, encletic,
                                             vocalized_suffix)):
                # verify the existence of a noun stamp in the dictionary
                # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                #     list_seg_conj2.append(seg_conj)
                list_seg_conj_voc.append(seg_conj_voc)
    list_seg_conj = list_seg_conj_voc
    for seg_conj in list_seg_conj:
        prefix_conj = seg_conj['prefix']
        stem_conj = seg_conj['stem']
        suffix_conj = seg_conj['suffix']
        #~has_plural_suffix = ((u"جمع" in \
        #~    snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or \
        #~    (u"مثنى" in snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
        #print "has_plural", has_plural_suffix
        #~affix_conj = '-'.join([prefix_conj, suffix_conj])
        # normalize hamza before guessing different origins
        stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
        # generate possible stems:
        # add stripped letters to the stem to constitute the possible noun list
        possible_noun_list = get_stem_variants(stem_conj,
                                               prefix_conj, suffix_conj)
        # search the noun in the dictionary
        # we can return the tashkeel
        infnoun_form_list = []
        for infnoun in possible_noun_list:
            # get the noun and all its forms from the dict;
            # if the noun has a plural suffix, don't look up
            # in the broken plural dictionary
            infnoun_foundlist = self.noun_dictionary.lookup(
                infnoun, 'unknown')
            infnoun_form_list += infnoun_foundlist
        for noun_tuple in infnoun_form_list:
            # noun_tuple = self.noun_dictionary.getEntryById(id)
            infnoun = noun_tuple['vocalized']
            original_tags = ()
            #~original = noun_tuple['vocalized']
            wordtype = noun_tuple['word_type']
            vocalized = vocalize(infnoun, procletic, prefix_conj,
                                 suffix_conj, encletic)
            #print "v", vocalized.encode('utf8')
            detailed_result.append(wordcase.WordCase({
                'word': noun,
                'affix': (procletic, prefix_conj, suffix_conj, encletic),
                'stem': stem_conj,
                'original': infnoun,  # original,
                'vocalized': vocalized,
                'semivocalized': vocalized,
                'tags': u':'.join(
                    snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags']
                    + snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags']
                    + snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                'type': u':'.join(['Noun', wordtype]),  # 'Noun',
                'freq': noun_tuple['freq'],
                'originaltags': u':'.join(original_tags),
                'syntax': '',
            }))
    return detailed_result
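# Slicing sketch for the segmentation loop above, assuming segment() yields
# (start, stop) offset pairs into noun2 (values illustrative):
# for noun2 = u"كتابان" and seg_conj = (0, 4):
# >>> noun2[:0], noun2[0:4], noun2[4:]
# (u'', u'كتاب', u'ان')
# i.e. no conjugation prefix, stem u"كتاب", and the dual suffix u"ان".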
def steming_second_level(self, stop, stop2, procletic, encletic_nm):
    """
    Analyze a word morphologically by stemming the conjugation affixes.
    @param stop: the input stop word.
    @type stop: unicode.
    @param stop2: the stop word stemmed from syntactic affixes.
    @type stop2: unicode.
    @param procletic: the syntactic prefix extracted in the first stage.
    @type procletic: unicode.
    @param encletic_nm: the syntactic suffix extracted in the first stage
    (not vocalized).
    @type encletic_nm: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    detailed_result = []
    # segment the conjugated verb
    list_seg_conj = self.conj_stemmer.segment(stop2)
    # verify affix compatibility
    list_seg_conj = self.verify_affix(stop2, list_seg_conj,
                                      ssconst.STOPWORDS_CONJUGATION_AFFIX)
    # add vocalized forms of suffixes
    # and create the real affixes from the word
    #~list_seg_conj_voc = []
    for seg_conj in list_seg_conj:
        stem_conj = stop2[seg_conj[0]:seg_conj[1]]
        suffix_conj_nm = stop2[seg_conj[1]:]
        # normalize hamza before guessing different origins
        #~stem_conj = araby.normalize_hamza(stem_conj)
        # generate possible stems:
        # add stripped letters to the stem to constitute the possible stop word list
        possible_stop_list = self.get_stem_variants(stem_conj, suffix_conj_nm)
        # search the stop word in the dictionary
        # we can return the tashkeel
        infstop_form_list = []
        for infstop in set(possible_stop_list):
            # get the stop word and all its forms from the dict;
            # if the stop word has a plural suffix, don't look up in the
            # broken plural dictionary
            if infstop not in self.cache_dict_search:
                infstop_foundlist = self.stop_dictionary.lookup(infstop)
                self.cache_dict_search[infstop] = self.create_dict_word(
                    infstop_foundlist)
            else:
                infstop_foundlist = self.cache_dict_search[infstop]
            infstop_form_list.extend(infstop_foundlist)
        for stop_tuple in infstop_form_list:
            # stop_tuple = self.stop_dictionary.getEntryById(id)
            original = stop_tuple['vocalized']
            # test if the given word from the dictionary accepts the
            # tags given by the affixes
            # دراسة توافق الزوائد مع خصائص الاسم،
            # مثلا هل يقبل الاسم التأنيث.
            #~if validate_tags(stop_tuple, affix_tags, procletic, encletic_nm, suffix_conj_nm):
            for vocalized_encletic in ssconst.COMP_SUFFIX_LIST_TAGS[
                    encletic_nm]['vocalized']:
                for vocalized_suffix in ssconst.CONJ_SUFFIX_LIST_TAGS[
                        suffix_conj_nm]['vocalized']:
                    # affix tags contain prefix and suffix tags
                    affix_tags = ssconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                        + ssconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                        + ssconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                    # verify compatibility between proclitics and affix
                    valid = self.validate_tags(stop_tuple, affix_tags,
                                               procletic, encletic_nm)
                    compatible = self.is_compatible_proaffix_affix(
                        stop_tuple, procletic, vocalized_encletic,
                        vocalized_suffix)
                    if valid and compatible:
                        vocalized, semi_vocalized = self.vocalize(
                            original, procletic, vocalized_suffix,
                            vocalized_encletic)
                        vocalized = self.ajust_vocalization(vocalized)
                        # ToDo:
                        # whether the stop word is inflected or not
                        is_inflected = u"مبني" if stop_tuple[
                            'is_inflected'] == 0 else u"معرب"
                        # add some tags from the dictionary entry,
                        # such as action and object_type
                        original_tags = u":".join([
                            stop_tuple['word_type'],
                            stop_tuple['word_class'],
                            is_inflected,
                            stop_tuple['action'],
                        ])
                        #~print "STOP_TUPEL[action]:", stop_tuple['action'].encode("utf8")
                        # generate the word case
                        detailed_result.append(
                            wordcase.WordCase({
                                'word': stop,
                                'affix': (procletic, '', vocalized_suffix,
                                          vocalized_encletic),
                                'stem': stem_conj,
                                'original': original,
                                'vocalized': vocalized,
                                'semivocalized': semi_vocalized,
                                'tags': u':'.join(affix_tags),
                                'type': u':'.join(
                                    ['STOPWORD', stop_tuple['word_type']]),
                                'freq': 'freqstopword',  # to note the frequency type
                                'originaltags': original_tags,
                                "action": stop_tuple['action'],
                                "object_type": stop_tuple['object_type'],
                                "need": stop_tuple['need'],
                                'syntax': '',
                            }))
    return detailed_result
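# Tag-assembly sketch: the *_LIST_TAGS tables are assumed to map each affix
# form to a dict whose 'tags' entry is a tuple, so the three lookups above
# concatenate into one tuple that is later joined with u':':
# >>> u':'.join((u'حرف جر',) + (u'ضمير متصل',))
# u'حرف جر:ضمير متصل'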
def stemming_noun(self, noun_in):
    """
    Analyze a word morphologically as a noun.
    @param noun_in: the input noun.
    @type noun_in: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    self.set_error_code('')
    if not noun_in:
        self.set_error_code('Empty word')
        return None
    debug = self.debug
    #~list_found = []
    detailed_result = []
    noun_list = [
        noun_in,
    ] + self.get_noun_variants(noun_in)

    word_segmented_list = []
    for noun in noun_list:
        list_seg_comp = self.comp_stemmer.segment(noun)
        # filter
        list_seg_comp = self.verify_affix(noun, list_seg_comp,
                                          SNC.COMP_NOUN_AFFIXES)
        # treat multi-vocalization enclitics
        for seg in list_seg_comp:
            proclitic_nm = noun[:seg[0]]
            stem = noun[seg[0]:seg[1]]
            enclitic_nm = noun[seg[1]:]
            # adjusting noun variants
            list_stem = [
                stem,
            ] + self.get_input_stem_variants(stem, enclitic_nm)
            # stem the reduced noun: level two
            for stem in list_stem:
                word_seg = {
                    'noun': noun,
                    'stem_comp': stem,
                    'pro': proclitic_nm,
                    'enc': enclitic_nm,
                }
                word_segmented_list.append(word_seg)
    if not word_segmented_list:
        self.set_error_code(" First level segmentation error")
    # level two
    tmp_list = []
    if debug:
        print("after first level")
    if debug:
        #~ print(repr(word_segmented_list).replace(
        #~     '},', '},\n').decode("unicode-escape"))
        print(arepr(noun_in))
        print(print_table(word_segmented_list))
    for word_seg in word_segmented_list:
        #~ detailed_result.extend(
        #~     self.steming_second_level(word_seg['noun'], word_seg['stem_comp'],
        #~                               word_seg['pro'], word_seg['enc']))
        #~ detailed_result_one = []
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
        # verify affix compatibility
        # filter
        list_seg_conj = self.verify_affix(word_seg['stem_comp'],
                                          list_seg_conj,
                                          SNC.NOMINAL_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        for seg_conj in list_seg_conj:
            stem_conj = word_seg['stem_comp'][:seg_conj[1]]
            suffix = word_seg['stem_comp'][seg_conj[1]:]
            stem_conj = ar.normalize_hamza(stem_conj)
            stem_conj_list = self.get_stem_variants(stem_conj, suffix)
            # generate possible stems:
            # add stripped letters to the stem to constitute the possible noun list
            for stem in stem_conj_list:
                word_seg_l2 = word_seg.copy()
                # normalize hamza before guessing different origins
                word_seg_l2['stem_conj'] = stem
                word_seg_l2['suffix'] = suffix
                # affix tags contain prefix and suffix tags
                word_seg_l2['affix_tags'] = list(
                    set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]['tags']
                        + SNC.COMP_SUFFIX_LIST_TAGS[word_seg_l2['enc']]['tags']
                        + SNC.CONJ_SUFFIX_LIST_TAGS[word_seg_l2['suffix']]['tags']))
                tmp_list.append(word_seg_l2)
    if debug:
        print("after second level")
    if debug:
        print(arepr(noun_in))
        print(print_table(tmp_list))
    # lookup in dictionary
    if not tmp_list:
        self.set_error_code(" Second level segmentation error")
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # search the noun in the dictionary
        # we can return the tashkeel
        inf_noun = word_seg['stem_conj']
        # get the noun and all its forms from the dict;
        # if the noun has a plural suffix, don't look up in the
        # broken plural dictionary
        if inf_noun in self.cache_dict_search:
            infnoun_foundlist = self.cache_dict_search[inf_noun]
        else:
            infnoun_foundlist = self.lookup_dict(inf_noun)
            self.cache_dict_search[inf_noun] = infnoun_foundlist
        for noun_tuple in infnoun_foundlist:
            word_seg_l3 = word_seg.copy()
            word_seg_l3["original"] = noun_tuple['vocalized']
            word_seg_l3["noun_tuple"] = dict(noun_tuple)
            tmp_list.append(word_seg_l3)
    if debug:
        print("after lookup dict")
    if debug:
        print(arepr(noun_in))
        noun_tuples = [item['noun_tuple'] for item in tmp_list]
        print(print_table(noun_tuples))
    # test compatibility of noun_tuple with affixes and proaffixes
    # and generate vocalized affixes and suffixes
    if not tmp_list:
        self.set_error_code("Not exists in dictionary")
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # test if the given word from the dictionary accepts the
        # tags given by the affixes
        # دراسة توافق الزوائد مع خصائص الاسم،
        # مثلا هل يقبل الاسم التأنيث.
        if self.validate_tags(word_seg['noun_tuple'],
                              word_seg['affix_tags'], word_seg['pro'],
                              word_seg['enc'], word_seg['suffix']):
            # get all vocalized forms of suffixes
            for pro_voc in SNC.COMP_PREFIX_LIST_TAGS[
                    word_seg['pro']]['vocalized']:
                for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                        word_seg['enc']]['vocalized']:
                    for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                            word_seg['suffix']]['vocalized']:
                        # verify compatibility between proclitics and affix
                        if self.__check_clitic_affix(
                                word_seg['noun_tuple'], pro_voc, enc_voc,
                                suf_voc):
                            # get affix tags
                            affix_tags_voc = SNC.COMP_PREFIX_LIST_TAGS[pro_voc]['tags'] \
                                + SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags'] \
                                + SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                            word_seg_l4 = word_seg.copy()
                            word_seg_l4['suf_voc'] = suf_voc
                            word_seg_l4['enc_voc'] = enc_voc
                            word_seg_l4['affix_tags'] = affix_tags_voc
                            tmp_list.append(word_seg_l4)
    if debug:
        print("after check compatibility")
    if debug:
        print(arepr(noun_in))
        noun_tuples = [item['noun_tuple'] for item in tmp_list]
        print(print_table(noun_tuples))
    # generate results
    if not tmp_list:
        self.set_error_code("Affixes not compatible")
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # get vocalized and vocalized-without-inflection forms
        #~ vocalized, semi_vocalized, _ = self.vocalize(
        voca_tuple_list = self.vocalize(
            word_seg['noun_tuple']['vocalized'], word_seg['pro'],
            word_seg['suf_voc'], word_seg['enc_voc'])
        for vocalized, semi_vocalized, _ in voca_tuple_list:
            # add some tags from the dictionary entry, such as
            # mamnou3 min sarf and broken plural
            original_tags = []
            if word_seg['noun_tuple']['mankous'] == u"Tk":
                original_tags.append(u"منقوص")
            # if there are many cases, like feminine plural with
            # mansoub and majrour
            if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
                list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                    word_seg['suf_voc']]['cases']
            else:
                list_cases = ('', )
            for case in list_cases:
                voc_affix_case = word_seg['affix_tags'] + (case, )
                # filter empty tags
                voc_affix_case = [vac for vac in voc_affix_case if vac]
                detailed_result.append(
                    wordcase.WordCase({
                        'word': noun_in,
                        'affix': (word_seg['pro'], '', word_seg['suf_voc'],
                                  word_seg['enc_voc']),
                        'stem': word_seg['stem_conj'],
                        'root': ar.normalize_hamza(
                            word_seg['noun_tuple'].get('root', '')),
                        'original': word_seg['noun_tuple']['vocalized'],  # original,
                        'vocalized': vocalized,
                        'semivocalized': semi_vocalized,
                        'tags': u':'.join(voc_affix_case),
                        'type': u':'.join(
                            ['Noun', word_seg['noun_tuple']['wordtype']]),
                        'number': word_seg['noun_tuple']['number'],
                        'gender': word_seg['noun_tuple']['gender'],
                        'freq': 'freqnoun',  # to note the frequency type
                        'originaltags': u':'.join(original_tags),
                        'syntax': '',
                    }))
    if not detailed_result:
        self.set_error_code("Forms are not generated")
    if debug:
        print("after generate result")
    if debug:
        print(len(detailed_result))
    #~ if debug: print repr(detailed_result).replace('},','},\n').decode("unicode-escape")
    return detailed_result
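# Usage sketch (illustrative): each filtering stage records why it emptied
# the candidate list, so a failed analysis can be inspected afterwards,
# assuming a get_error_code() accessor matching set_error_code() above:
# >>> results = analyzer.stemming_noun(u"بالكتاب")
# >>> if not results:
# ...     print(analyzer.get_error_code())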
def stemming_verb(self, verb_in):
    """
    Stem a verb.
    @param verb_in: the given verb
    @type verb_in: unicode
    @return: stemmed words
    @rtype: list
    """
    #~ list_found = []
    detailed_result = []
    verb_list = [
        verb_in,
    ] + self.get_verb_variants(verb_in)
    # list of segmented words
    word_segmented_list = []
    for verb in verb_list:
        list_seg_comp = self.comp_stemmer.segment(verb)
        for seg in list_seg_comp:
            proclitic = verb[:seg[0]]
            stem = verb[seg[0]:seg[1]]
            enclitic = verb[seg[1]:]
            #~ print "stem_verb affix 93", "-".join([proclitic, stem, enclitic]).encode('utf8')
            #~secondsuffix = u''
            # the case of a verb taking two objects
            # حالة الفعل المتعدي لمفعولين
            if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                enclitic = firstsuffix
            list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
            # if there is an enclitic, then transitivity is confirmed
            transitive_comp = bool(enclitic)
            for stm in list_stem:
                word_seg = {
                    "verb": verb,
                    "pro": proclitic,
                    "enc": enclitic,
                    'stem_comp': stm,
                    'trans_comp': transitive_comp,
                }
                word_segmented_list.append(word_seg)
    # second level for the segmented word
    tmp_list = []
    #~ print 'first level', verb_in, len(word_segmented_list)
    for word_seg in word_segmented_list:
        verb2 = word_seg['stem_comp']
        # stem the reduced verb: level two
        # segment the conjugated verb
        list_seg_conj = self.conj_stemmer.segment(verb2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                          SVC.VERBAL_CONJUGATION_AFFIX)
        # verify proclitics and enclitics
        # verify the length of the stem
        for seg_conj in list_seg_conj:
            if (seg_conj[1] - seg_conj[0]) <= 6:
                # word seg in level 2
                word_seg_l2 = word_seg.copy()
                word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                tmp_list.append(word_seg_l2)
    # verify compatibility between proclitic and affixes
    word_segmented_list = tmp_list
    #~ print 'compatibility', verb_in, len(tmp_list)
    tmp_list = []
    for word_seg in word_segmented_list:
        # verify compatibility between proclitics and affixes
        proclitic = word_seg['pro']
        enclitic = word_seg['enc']
        affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
        if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
            tmp_list.append(word_seg.copy())
    #~ print 'stamp', verb_in, len(tmp_list)
    # verify the existence of a candidate verb by stamp
    word_segmented_list = tmp_list
    #~ tmp_list = []
    #~ for word_seg in word_segmented_list:
    #~     # verify the existence of a candidate verb by stamp
    #~     if self.verb_dictionary.exists_as_stamp(word_seg['stem_conj']):
    #~         tmp_list.append(word_seg.copy())
    #print 'infinitive', verb_in, len(tmp_list)
    #~ # get infinitives of candidate verbs
    #~ word_segmented_list = tmp_list
    #~ tmp_list = []
    #~ for word_seg in word_segmented_list:
    #~     # get the infinitive of the candidate verb by stamp;
    #~     # search the verb in the dictionary by stamp;
    #~     # if the verb exists in the dictionary,
    #~     # the transitivity is considered;
    #~     # if it is trilateral, return its forms and tashkeel,
    #~     # if not, return forms without tashkeel,
    #~     # because the conjugator can vocalize it;
    #~     # we can return the tashkeel if we don't need the
    #~     # conjugation step
    #~     infverb_dict = self.__get_infinitive_verb_by_stem(
    #~         word_seg['stem_conj'], word_seg['trans_comp'])
    #print "list possible verbs", len(infverb_dict)
    #for item in infverb_dict:
    #    print item['verb']
    #~     # filter verbs
    #~     infverb_dict = self.__verify_infinitive_verbs(
    #~         word_seg['stem_conj'], infverb_dict)
    #~     for item in infverb_dict:
    #~         # the haraka form is given by the dict
    #~         word_seg_l3 = word_seg.copy()
    #~         word_seg_l3['inf'] = item['verb']
    #~         word_seg_l3['haraka'] = item['haraka']
    #~         word_seg_l3['root'] = item.get('root', '')
    #~         word_seg_l3['transitive'] = bool(item['transitive'] in ('y', 1))
    #~         tmp_list.append(word_seg_l3)
    #~ # conjugation step
    #~ print repr(tmp_list).replace('},','},\n').decode("unicode-escape")
    #~ print 'conj', verb_in, len(tmp_list)
    # get the conjugation for every infinitive verb
    #~ word_segmented_list = tmp_list
    #~ tmp_list = []
    #~ for word_seg in word_segmented_list:
    #~     # ToDo: conjugate the verb with affixes,
    #~     # if there exists one matching verb, return it
    #~     # تصريف الفعل مع الزوائد
    #~     # إذا توافق التصريف مع الكلمة الناتجة
    #~     # تعرض النتيجة
    #~     one_correct_conj = self.__generate_possible_conjug(
    #~         word_seg['inf'], word_seg['stem_comp'],
    #~         word_seg['prefix'] + '-' + word_seg['suffix'],
    #~         word_seg['haraka'], word_seg['pro'], word_seg['enc'],
    #~         word_seg['transitive'])
    ##~ print "len correct_conj", len(one_correct_conj)
    #~     for conj in one_correct_conj:
    #~         word_seg_l4 = word_seg.copy()
    #~         word_seg_l4['conj'] = conj.copy()
    #~         tmp_list.append(word_seg_l4)
    #~ print 'result', verb_in, len(tmp_list)
    # generate all resulting data
    #~ word_segmented_list = tmp_list
    # filter invalid verb stems, like the case of TEH MARBUTA
    word_segmented_list = [
        x for x in word_segmented_list
        if self.is_valid_verb_stem(x['stem_conj'])
    ]
    # add root and lemma
    for word_seg in word_segmented_list:
        # choose a root for each candidate stem
        word_seg['root'] = self.choose_wazn_root(word_seg['stem_conj'])
    # remove empty roots
    tmp_list = [x for x in word_segmented_list if x['root']]
    # if the tmp list is empty, there is no root;
    # in this case we should drop the candidate
    # (kept temporary for tests)
    if tmp_list:
        word_segmented_list = tmp_list
    # create results
    for word_seg in word_segmented_list:
        #~ conj = word_seg['conj']
        #~ vocalized, semivocalized = self.vocalize(
        #~     conj['vocalized'], word_seg['pro'], word_seg['enc'])
        tag_type = 'Verb'
        #~ original_tags = "y" if conj['transitive'] else "n"
        stem = word_seg['stem_conj']
        detailed_result.append(
            wordcase.WordCase({
                'word': word_seg['verb'],
                'affix': (word_seg['pro'], word_seg['prefix'],
                          word_seg['suffix'], word_seg['enc']),
                'stem': stem,
                #~ 'root': "VTODO",
                'root': self.choose_wazn_root(stem),
                "original": "VTODO",
                #~ 'root': ar.normalize_hamza(word_seg.get('root', '')),
                #~ 'original': conj['verb'],
                #~ 'vocalized': vocalized,
                #~ 'semivocalized': semivocalized,
                #~ 'tags': u':'.join((conj['tense'], conj['pronoun'])+\
                #~     SVC.COMP_PREFIX_LIST_TAGS[proclitic]['tags']+\
                #~     SVC.COMP_SUFFIX_LIST_TAGS[enclitic]['tags']),#\
                'type': tag_type,
                #~ 'number': conj['pronoun_tags'].get('number', ''),
                #~ 'gender': conj['pronoun_tags'].get('gender', ''),
                #~ 'person': conj['pronoun_tags'].get('person', ''),
                #~ 'tense2': conj['tense_tags'].get('tense', ''),
                #~ 'voice': conj['tense_tags'].get('voice', ''),
                #~ 'mood': conj['tense_tags'].get('mood', ''),
                #~ 'confirmed': conj['tense_tags'].get('confirmed', ''),
                #~ 'transitive': conj['transitive'],
                #~ 'tense': conj['tense'],
                #~ 'pronoun': conj['pronoun'],
                #~ 'freq': 'freqverb',
                #~ 'originaltags': original_tags,
                #~ 'syntax': '',
            }))
    return detailed_result
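# Filtering sketch for the root step above: candidates with an empty root
# are dropped, but the unfiltered list is kept when filtering would empty it:
# >>> cands = [{'root': u'كتب'}, {'root': u''}]
# >>> [x for x in cands if x['root']] or cands
# [{'root': u'كتب'}]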
def check_word(self, word, guessedtag=""):
    """
    Analyze one word morphologically.
    @param word: the input word.
    @type word: unicode.
    @param guessedtag: a guessed part-of-speech tag (optional).
    @type guessedtag: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    word = araby.strip_tatweel(word)
    word_vocalised = word
    word_nm = araby.strip_tashkeel(word)
    # get analyzed details from the cache if enabled
    if self.allow_cache_use and self.cache.is_already_checked(word_nm):
        #~ print (u"'%s'"%word).encode('utf8'), 'found'
        resulted_data = self.cache.get_checked(word_nm)
    else:
        resulted_data = []
        # if the word is a punctuation mark
        resulted_data += self.check_word_as_pounct(word_nm)
        # Done: if the word is a stop word we have some problems,
        # the stop word can also be another normal word (verb or noun),
        # we must consider it in future works
        # if the word is a stop word, allow stop word analysis
        if araby.is_arabicword(word_nm):
            resulted_data += self.check_word_as_stopword(word_nm)
            # if the word is a verb
            # مشكلة بعض الكلمات المستبعدة تعتبر أفعالا أو أسماء
            #~if self.tagger.has_verb_tag(guessedtag) or \
            #~        self.tagger.is_stopword_tag(guessedtag):
            #~    resulted_data += self.check_word_as_verb(word_nm)
            resulted_data += self.check_word_as_verb(word_nm)
            #print "is verb", rabti, len(resulted_data)
            # if the word is a noun
            #~if self.tagger.has_noun_tag(guessedtag) or \
            #~        self.tagger.is_stopword_tag(guessedtag):
            #~    resulted_data += self.check_word_as_noun(word_nm)
            resulted_data += self.check_word_as_noun(word_nm)
        if len(resulted_data) == 0:
            #print (u"1 _unknown %s-%s"%(word, word_nm)).encode('utf8')
            # check the word as unknown
            resulted_data += self.check_word_as_unknown(word_nm)
        # check if the word is normalized and solutions are equivalent
        resulted_data = self.check_normalized(word_vocalised,
                                              resulted_data)
        # check if the word is shadda-like
        resulted_data = self.check_shadda(word_vocalised, resulted_data,
                                          self.fully_vocalized_input)
        # add word frequency information to tags
        resulted_data = self.add_word_frequency(resulted_data)
        # add the stemmed word details to the cache
        data_list_to_serialize = [w.__dict__ for w in resulted_data]
        if self.allow_cache_use:
            self.cache.add_checked(word_nm, data_list_to_serialize)
    # check if the word is vocalized like the results
    if self.partial_vocalization_support:
        resulted_data = self.check_partial_vocalized(
            word_vocalised, resulted_data)
    if len(resulted_data) == 0:
        error_code = self.get_error_code()
        resulted_data.append(
            wordcase.WordCase({
                'word': word,
                'affix': ('', '', '', ''),
                'stem': word,
                'original': word,
                'vocalized': word,
                'semivocalized': word,
                'tags': u'%s' % error_code,
                'type': 'unknown',
                'root': '',
                'template': '',
                'freq': self.wordfreq.get_freq(word, 'unknown'),
                'syntax': '',
            }))
    return resulted_data
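# Usage sketch (illustrative): check_word() always returns at least one
# WordCase; when no analysis succeeds, the fallback entry carries the last
# error code in 'tags' and the type 'unknown'. Attribute access on WordCase
# is assumed here (its __dict__ is serialized for the cache above):
# >>> cases = analyzer.check_word(u"وكتابهم")
# >>> [(c.type, c.vocalized) for c in cases]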
def stemming_verb(self, verb_in):
    """
    Stem a verb.
    @param verb_in: the given verb
    @type verb_in: unicode
    @return: stemmed words
    @rtype: list
    """
    if not verb_in:
        return None
    #~ list_found = []
    detailed_result = []
    verb_list = [
        verb_in,
    ] + self.get_verb_variants(verb_in)
    debug = self.debug
    # list of segmented words
    word_segmented_list = []
    for verb in verb_list:
        list_seg_comp = self.comp_stemmer.segment(verb)
        for seg in list_seg_comp:
            proclitic = verb[:seg[0]]
            stem = verb[seg[0]:seg[1]]
            enclitic = verb[seg[1]:]
            #~ print "stem_verb affix 93", "-".join([proclitic, stem, enclitic]).encode('utf8')
            #~secondsuffix = u''
            # the case of a verb taking two objects
            # حالة الفعل المتعدي لمفعولين
            if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                enclitic = firstsuffix
            list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
            # if there is an enclitic, then transitivity is confirmed
            transitive_comp = bool(enclitic)
            for stm in list_stem:
                word_seg = {
                    "verb": verb,
                    "pro": proclitic,
                    "enc": enclitic,
                    'stem_comp': stm,
                    'trans_comp': transitive_comp,
                }
                word_segmented_list.append(word_seg)
    if debug:
        print("after first level")
    if debug:
        #~ print(repr(word_segmented_list).replace(
        #~     '},', '},\n').decode("unicode-escape"))
        print(arepr(verb_in))
        print(print_table(word_segmented_list))
    # second level for the segmented word
    tmp_list = []
    #~ print 'first level', verb_in, len(word_segmented_list)
    for word_seg in word_segmented_list:
        verb2 = word_seg['stem_comp']
        # stem the reduced verb: level two
        # segment the conjugated verb
        list_seg_conj = self.conj_stemmer.segment(verb2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                          SVC.VERBAL_CONJUGATION_AFFIX)
        # verify proclitics and enclitics
        # verify the length of the stem
        for seg_conj in list_seg_conj:
            if (seg_conj[1] - seg_conj[0]) <= 6:
                # word seg in level 2
                word_seg_l2 = word_seg.copy()
                word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                tmp_list.append(word_seg_l2)
    # verify compatibility between proclitic and affixes
    word_segmented_list = tmp_list
    #~ print 'compatibility', verb_in, len(tmp_list)
    tmp_list = []
    for word_seg in word_segmented_list:
        # verify compatibility between proclitics and affixes
        proclitic = word_seg['pro']
        enclitic = word_seg['enc']
        affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
        if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
            tmp_list.append(word_seg.copy())
    #~ print 'stamp', verb_in, len(tmp_list)
    # verify the existence of a candidate verb by stamp
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # verify the existence of a candidate verb by stamp
        if self.exists_as_stamp(word_seg['stem_conj']):
            tmp_list.append(word_seg.copy())
    if debug:
        print("after second level")
    if debug:
        print(arepr(verb_in))
        print(print_table(tmp_list))
    #~ print 'infinitive', verb_in, len(tmp_list)
    # get infinitives of candidate verbs
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # get the infinitive of the candidate verb by stamp;
        # search the verb in the dictionary by stamp;
        # if the verb exists in the dictionary,
        # the transitivity is considered;
        # if it is trilateral, return its forms and tashkeel,
        # if not, return forms without tashkeel,
        # because the conjugator can vocalize it;
        # we can return the tashkeel if we don't need the
        # conjugation step
        infverb_dict = self.__get_infinitive_verb_by_stem(
            word_seg['stem_conj'], word_seg['trans_comp'])
        if debug:
            print("infinitive candidate verbs")
        if debug:
            print(arepr(verb_in))
            print(print_table(infverb_dict))
        #~ print "list possible verbs", len(infverb_dict)
        #~ for item in infverb_dict:
        #~     print item['verb']
        # filter verbs
        infverb_dict = self.__verify_infinitive_verbs(
            word_seg['stem_conj'], infverb_dict)
        if debug:
            print("valid infinitive candidate verbs")
        if debug:
            print(arepr(verb_in))
            print(print_table(infverb_dict))
        for item in infverb_dict:
            # the haraka form is given by the dict
            word_seg_l3 = word_seg.copy()
            word_seg_l3['inf'] = item['verb']
            word_seg_l3['haraka'] = item['haraka']
            word_seg_l3['root'] = item.get('root', '')
            word_seg_l3['transitive'] = bool(item['transitive'] in ('y', 1))
            tmp_list.append(word_seg_l3)
    # conjugation step
    if debug:
        print("after lookup dict")
    if debug:
        print(arepr(verb_in))
        print(print_table(tmp_list))
    #~ print repr(tmp_list).replace('},','},\n').decode("unicode-escape")
    #~ print 'conj', verb_in, len(tmp_list)
    # get the conjugation for every infinitive verb
    word_segmented_list = tmp_list
    tmp_list = []
    for word_seg in word_segmented_list:
        # ToDo: conjugate the verb with affixes,
        # if there exists one matching verb, return it
        # تصريف الفعل مع الزوائد
        # إذا توافق التصريف مع الكلمة الناتجة
        # تعرض النتيجة
        one_correct_conj = self.__generate_possible_conjug(
            word_seg['inf'], word_seg['stem_comp'],
            word_seg['prefix'] + '-' + word_seg['suffix'],
            word_seg['haraka'], word_seg['pro'], word_seg['enc'],
            word_seg['transitive'])
        #~ print "len correct_conj", len(one_correct_conj)
        for conj in one_correct_conj:
            word_seg_l4 = word_seg.copy()
            word_seg_l4['conj'] = conj.copy()
            tmp_list.append(word_seg_l4)
    if debug:
        print("after generating conjugation")
    if debug:
        print(arepr(verb_in))
        conjs = [item['conj'] for item in tmp_list]
        print(print_table(conjs))
    #~ print 'result', verb_in, len(tmp_list)
    # generate all resulting data
    word_segmented_list = tmp_list
    #~ tmp_list = []
    for word_seg in word_segmented_list:
        conj = word_seg['conj']
        #~ vocalized, semivocalized = self.vocalize(
        vocal_tuple_list = self.vocalize(conj['vocalized'], word_seg['pro'],
                                         word_seg['enc'])
        tag_type = 'Verb'
        original_tags = "y" if conj['transitive'] else "n"
        for vocalized, semivocalized in vocal_tuple_list:
            # prepare tags
            tags = self.prepare_tags(conj, word_seg['pro'], word_seg['enc'])
            detailed_result.append(
                wordcase.WordCase({
                    'word': word_seg['verb'],
                    'affix': (word_seg['pro'], word_seg['prefix'],
                              word_seg['suffix'], word_seg['enc']),
                    'stem': word_seg['stem_conj'],
                    'root': ar.normalize_hamza(word_seg.get('root', '')),
                    'original': conj['verb'],
                    'vocalized': vocalized,
                    'semivocalized': semivocalized,
                    'tags': tags,
                    'type': tag_type,
                    'number': conj['pronoun_tags'].get('number', ''),
                    'gender': conj['pronoun_tags'].get('gender', ''),
                    'person': conj['pronoun_tags'].get('person', ''),
                    'tense2': conj['tense_tags'].get('tense', ''),
                    'voice': conj['tense_tags'].get('voice', ''),
                    'mood': conj['tense_tags'].get('mood', ''),
                    'confirmed': conj['tense_tags'].get('confirmed', ''),
                    'transitive': conj['transitive'],
                    'tense': conj['tense'],
                    'pronoun': conj['pronoun'],
                    'freq': 'freqverb',
                    'originaltags': original_tags,
                    'syntax': '',
                }))
    return detailed_result
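# Usage sketch (illustrative): each returned case pairs one dictionary verb
# with one compatible, fully vocalized conjugation; tense, pronoun and
# transitivity come from the conjugator's tags. Attribute access on WordCase
# is assumed, as above:
# >>> cases = analyzer.stemming_verb(u"سيكتبونها")
# >>> [(c.vocalized, c.tense, c.pronoun) for c in cases]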