def __init__(self):
    abstractStemmer.__init__(self)
    # use a custom set of infix letters for root extraction
    infixes_letters_custom = u"توطيدا"
    self.set_infix_letters(infixes_letters_custom)
    self.config["root_dict"] = "yes"
    #~ rootslib.create_stamped_roots()
    #~ rootslib.create_virtual_roots()
    self.rootdict = rootslibclass.rootDict()
def __init__(self):
    abstractStemmer.__init__(self)
    # use a custom set of infix letters and the reviewed affix lists
    infixes_letters_custom = u"توطيدا"
    self.set_infix_letters(infixes_letters_custom)
    self.set_prefix_list(REVIEWED_PREFIX_LIST)
    self.set_suffix_list(REVIEWED_SUFFIX_LIST)
    self.config["root_dict"] = "yes"
    self.rootdict = rootslibclass.rootDict()
def test2():
    """ test root extraction with tashaphyne """
    #~ rootslib.create_stamped_roots()
    #~ rootslib.create_virtual_roots()
    #~ print repr(rootslib.VIRTUAL_DICT).replace('],','],\n').decode('unicode-escape').encode('utf8')
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    asl_custom = abstractstemmer.customStemmer_roots()
    # test words with their expected roots
    words = [
        (u'أفتضاربانني', u'ضرب'), (u'بأبأ', u'بءبء'), (u'يريدون', u'ريد'),
        (u'يستطعن', u'طوع'), (u'يستطيعون', u'طوع'), (u'الصيام', u'صوم'),
        (u'يخاف', u'خوف'), (u'كتاب', u'كتب'), (u"بالميدان", u'ميد'),
        (u"بالأسيهم", u'سهم'), (u"آخرين", u'ءخر'), (u"بالآخرة", u'ءخر'),
        (u"لارتاب", u'ريب'), (u"وسائل", u'وسل'), (u"وصائل", u'وصل'),
        (u"أخاه", u'ءخو'), (u"أخوه", u'ءخو'), (u"أخاهم", u'ءخو'),
        (u"أخانا", u'ءخو'), (u"بإذن", u'ءذن'), (u"للأبرار", u"برر"),
        (u'واتبعوا', u'تبع'), (u'والكاظمين', u'كظم'), (u'عد', u'عود'),
    ]
    # load root dictionary with features
    rootdict = rootslibclass.rootDict()
    for word, root in words:
        print((u"**********%s*********" % word).encode('utf8'))
        # fold ALEF MADDA into HAMZA + ALEF before stemming
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        print(repr(affixa_list).replace('},', '},\n').decode('unicode-escape').encode('utf8'))
        #~ root_result = rootslib.choose_root(affixa_list, debug=True)
        root_result = rootdict.choose_root(affixa_list, debug=True)
        print(u" ".join([root_result, asl_custom.getroot(word),
            str(root_result == root)]).encode('utf8'))
    return 0
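# The loop in test2() folds ALEF MADDA into HAMZA + ALEF before stemming,
# presumably because the root dictionary stores hamza-based forms. A minimal
# standalone sketch of that normalization step, assuming only pyarabic is
# available (the helper name is illustrative, not part of the project):
import re
import pyarabic.araby as araby

def normalize_alef_madda(word):
    """ replace ALEF MADDA with HAMZA followed by ALEF, as done in the tests above """
    return re.sub(u"[%s]" % araby.ALEF_MADDA, araby.HAMZA + araby.ALEF, word)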
def test_rooter(dataframe_result):
    """ evaluate the rhyzome rooter against a reference dataframe of (word, root) pairs """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # debug in rhyzome rooter
    rooter.rhyzome_rooter.debug = True
    #~ rooter = rootslibclass.rootDict()
    df = dataframe_result
    # avoid null roots; count rows, not cells
    #~ total = df.size
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        # a word may have several acceptable roots, separated by ';'
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        # fold ALEF MADDA into HAMZA + ALEF
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # print stems
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] %s" % (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        print((u" ".join([u"Test root", root, u"found root",
            root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
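# A hedged example of how test_rooter() might be driven: the dataframe is expected
# to expose "word" and "root" columns, with multiple acceptable roots separated
# by ';'. The file name and delimiter below are illustrative assumptions.
import pandas as pd

def run_rooter_eval(csv_path="rootextraction-sample.csv"):
    """ load a word/root reference file and run the rhyzome rooter evaluation on it """
    df = pd.read_csv(csv_path, delimiter='\t', encoding='utf8')
    test_rooter(df)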
def test_matrix(dataframe_result):
    """ evaluate matrix-based root extraction against a reference dataframe of (word, root) pairs """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    rooter.debug = True
    df = dataframe_result
    # count rows, not cells (df.size counts every cell of the dataframe)
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # extract candidate roots from every stem with the matrix rooter
        stems = [d['stem'] for d in affixa_list]
        roots = []
        for stem in stems:
            temp_list = rooter.matrix_root(stem, u'توطيدا')
            tmp_roots = [d['root'] for d in temp_list]
            roots.extend(tmp_roots)
            #~ tmp_roots = [d['root'] for d in temp_list if rooter.is_root(d['root'])]
        print((u"Candidates " + u"\t".join(roots)).encode('utf8'))
        # look up each distinct candidate only once in the dictionary
        set_roots = [x for x in set(roots) if rooter.is_root(x)]
        # remove invalid roots but keep repetitions for frequency voting
        roots = [x for x in roots if x in set_roots]
        root_result = most_common(roots)
        print((u"Accepted " + u"\t".join(roots)).encode('utf8'))
        print((u"root " + root_result).encode('utf8'))
        print((u" ".join([u"Test root", root, u"found root",
            root_result, str(root_result == root)])).encode('utf8'))
        if root_result == root:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
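# test_matrix() above relies on a most_common() helper that is not shown in this
# section. A minimal sketch of such a helper, assuming plain frequency voting over
# the candidate roots (the real project helper may differ):
from collections import Counter

def most_common(items):
    """ return the most frequent item of a list, or an empty string for an empty list """
    if not items:
        return u""
    return Counter(items).most_common(1)[0][0]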
def __init__(self, debug=False):
    # create a stemmer object for stemming enclitics and proclitics
    self.comp_stemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.comp_stemmer.set_prefix_list(SVC.COMP_PREFIX_LIST)
    self.comp_stemmer.set_suffix_list(SVC.COMP_SUFFIX_LIST)
    # create a stemmer object for stemming conjugated verbs
    self.conj_stemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.conj_stemmer.set_prefix_list(SVC.CONJ_PREFIX_LIST)
    self.conj_stemmer.set_suffix_list(SVC.CONJ_SUFFIX_LIST)
    # create a stemmer object to extract roots from stems
    self.root_stemmer = tashaphyne.stemming.ArabicLightStemmer()
    # configure the stemmer object
    self.root_stemmer.set_prefix_list(SVC.ROOT_PREFIX_LIST)
    self.root_stemmer.set_suffix_list(SVC.ROOT_SUFFIX_LIST)
    # enable the last mark (Harakat Al-I3rab)
    self.allow_syntax_lastmark = True
    # to show statistics about verbs
    #~ statistics = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0,
    #~ 10:0, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0, 17:0, 18:0, 19:0, 20:0,
    #~ }
    # affixes compatibility cache
    self.compatibility_cache = {}
    #~ self.verb_dict_cache = {}
    self.debug = debug
    self.cache_verb = {'verb': {}}
    #~ self.verb_dictionary = arabicdictionary.ArabicDictionary("verbs")
    self.verb_stamp_pat = SVC.VERB_STAMP_PAT
    self.rooter = rootslibclass.rootDict()
def test_rooter_matrix(dataframe_result):
    """ evaluate matrix-based root choice against a reference dataframe of (word, root) pairs """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    df = dataframe_result
    # count rows, not cells
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        # a word may have several acceptable roots, separated by ';'
        root_list = root.split(';')
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # print stems
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] %s" % (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        print((u" ".join([u"Test root", root, u"found root",
            root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
def __init__(self):
    abstractStemmer.__init__(self)
    self.config["root_dict"] = "yes"
    #~ rootslib.create_stamped_roots()
    #~ rootslib.create_virtual_roots()
    self.rootdict = rootslibclass.rootDict()
    # tagger
    self.tagger = naftawayh.wordtag.WordTagger()
    # noun stemmer config
    # create stemmer
    self.noun_stemmer = abstractStemmer()
    # config prefix and suffix list
    self.noun_stemmer.set_prefix_list(NOUN_PREFIX_LIST)
    self.noun_stemmer.set_suffix_list(NOUN_SUFFIX_LIST)
    # verb stemmer config
    # create stemmer
    self.verb_stemmer = abstractStemmer()
    # config prefix and suffix list
    self.verb_stemmer.set_prefix_list(VERB_PREFIX_LIST)
    self.verb_stemmer.set_suffix_list(VERB_SUFFIX_LIST)
def __init__(self):
    snowballstemmer.arabic_stemmer.ArabicStemmer.__init__(self)
    self.rootdict = rootslibclass.rootDict()
def __init__(self):
    ISRIStemmer.__init__(self)
    self.rootdict = rootslibclass.rootDict()
def __init__(self):
    abstractStemmer.__init__(self)
    self.rootdict = rootslibclass.rootDict()
def __init__(self):
    abstractStemmer.__init__(self)
    # use a custom set of infix letters for root extraction
    infixes_letters_custom = u"توطيدا"
    self.set_infix_letters(infixes_letters_custom)
    self.config["root_dict"] = "yes"
    self.rootdict = rootslibclass.rootDict()
def test2():
    """ dump the stamped-roots dictionary """
    rooter = rootslibclass.rootDict()
    for k in rooter.STAMP_DICT:
        print((u"%s\t%s" % (k, u", ".join(rooter.STAMP_DICT[k]))).encode('utf8'))
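# A small companion check, assuming only methods already used in this section:
# is_root() (see test_matrix above) can verify that every entry listed under a
# stamp is a known root. This is an illustrative sketch, not project code.
def check_stamp_dict():
    rooter = rootslibclass.rootDict()
    for stamp, roots in rooter.STAMP_DICT.items():
        unknown = [r for r in roots if not rooter.is_root(r)]
        if unknown:
            print((u"%s\t%s" % (stamp, u", ".join(unknown))).encode('utf8'))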