def test_rooter(dataframe_result):
    """Measure how often the 'rhyzome' rooter recovers the expected root.

    For every (word, root) pair of ``dataframe_result`` the word is
    light-stemmed and segmented with Tashaphyne's ``ArabicLightStemmer``; the
    resulting affixation candidates are passed to ``rootDict.choose_root`` and
    the chosen root is compared with the expected one(s).  Intermediate
    results and the final success rate are printed; nothing is returned.

    Args:
        dataframe_result: table exposing a ``"word"`` and a ``"root"`` column
            plus an ``index`` (presumably a pandas DataFrame -- confirm with
            the caller).  A ``root`` cell may hold several accepted roots
            separated by ``';'``.
    """
    # Test with tashaphyne.
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass

    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # Debug in the rhyzome rooter.
    rooter.rhyzome_rooter.debug = True

    df = dataframe_result
    # Count rows, not cells (``df.size`` was the previous, discarded choice).
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        # Replace every ALEF_MADDA by HAMZA + ALEF before segmenting.
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                      araby.HAMZA + araby.ALEF, word)

        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()

        # Stems prints.  Encode the whole message, like every other print
        # here: concatenating str with the encoded bytes fails on Python 3.
        stems = [d['stem'] for d in affixa_list]
        print((u"Stems: " + u' '.join(stems)).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s"
               % (asl.get_root(), u' '.join(roots))).encode('utf8'))

        root_result = rooter.choose_root(word, affixa_list, debug=True)
        found = root_result in root_list
        print((u" ".join([
            u"Test root", root, u"found root", root_result, str(found),
        ])).encode('utf8'))
        if found:
            cpt += 1

    # An empty table would otherwise raise ZeroDivisionError.
    percent = cpt * 100.0 / total if total else 0.0
    print("***** Percent %.2f%% [%d/%d]" % (percent, cpt, total))
def test_matrix(dataframe_result):
    """Measure how often the matrix-based root extraction finds the root.

    For every (word, root) pair of ``dataframe_result`` the word is
    light-stemmed and segmented with Tashaphyne's ``ArabicLightStemmer``.
    Each candidate stem is passed to ``rootDict.matrix_root``; the candidate
    roots rejected by ``rootDict.is_root`` are dropped (repetitions are kept)
    and ``most_common`` picks the result, which is compared with the expected
    root(s).  Intermediate results and the final success rate are printed;
    nothing is returned.

    Args:
        dataframe_result: table exposing a ``"word"`` and a ``"root"`` column
            plus an ``index`` (presumably a pandas DataFrame -- confirm with
            the caller).  A ``root`` cell may hold several accepted roots
            separated by ``';'``, as in the sibling rooter tests.
    """
    # Test with tashaphyne.
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass

    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    rooter.debug = True

    df = dataframe_result
    # Count rows, not cells: ``df.size`` is rows * columns on a DataFrame and
    # would deflate the reported percentage.
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()

        stems = [d['stem'] for d in affixa_list]
        roots = []
        for stem in stems:
            # NOTE(review): the second argument is a fixed letter string
            # taken verbatim from the original code; its exact meaning for
            # matrix_root is not visible here -- confirm.
            temp_list = rooter.matrix_root(stem, u'توطيدا')
            roots.extend(d['root'] for d in temp_list)
        print((u"Candidats " + u"\t".join(roots)).encode('utf8'))

        # Look each distinct root up only once in the dictionary.
        set_roots = set(x for x in set(roots) if rooter.is_root(x))
        # Remove invalid roots and keep repetition.
        roots = [x for x in roots if x in set_roots]
        # NOTE(review): most_common is defined elsewhere in this file; its
        # behaviour on an empty list is not visible here -- confirm.
        root_result = most_common(roots)
        print((u"Accepted " + u"\t".join(roots)).encode('utf8'))
        print((u"root " + root_result).encode('utf8'))

        # Accept any of the ';'-separated expected roots; for a single-root
        # cell this is the same as an equality test.
        found = root_result in root_list
        print((u" ".join([
            u"Test root", root, u"found root", root_result, str(found),
        ])).encode('utf8'))
        if found:
            cpt += 1

    # Float arithmetic (no Python 2 integer truncation) and no division by
    # zero on an empty table.
    percent = cpt * 100.0 / total if total else 0.0
    print("***** Percent %.2f%%" % percent)
def test_rooter_matrix(dataframe_result):
    """Measure how often ``choose_root_matrix`` recovers the expected root.

    For every (word, root) pair of ``dataframe_result`` the word is
    light-stemmed and segmented with Tashaphyne's ``ArabicLightStemmer``; the
    resulting affixation candidates are passed to
    ``rootDict.choose_root_matrix`` and the chosen root is compared with the
    expected one(s).  Intermediate results and the final success rate are
    printed; nothing is returned.

    Args:
        dataframe_result: table exposing a ``"word"`` and a ``"root"`` column
            plus an ``index`` (presumably a pandas DataFrame -- confirm with
            the caller).  A ``root`` cell may hold several accepted roots
            separated by ``';'``.
    """
    # Test with tashaphyne.
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass

    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()

    df = dataframe_result
    # Count rows, not cells: ``df.size`` is rows * columns on a DataFrame and
    # would deflate the reported percentage.
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()

        # Stems prints.  Encode the whole message, like every other print
        # here: concatenating str with the encoded bytes fails on Python 3.
        stems = [d['stem'] for d in affixa_list]
        print((u"Stems: " + u' '.join(stems)).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s"
               % (asl.get_root(), u' '.join(roots))).encode('utf8'))

        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        found = root_result in root_list
        print((u" ".join([
            u"Test root", root, u"found root", root_result, str(found),
        ])).encode('utf8'))
        if found:
            cpt += 1

    # An empty table would otherwise raise ZeroDivisionError.
    percent = cpt * 100.0 / total if total else 0.0
    print("***** Percent %.2f%% [%d/%d]" % (percent, cpt, total))