Ejemplo n.º 1
0
def test_rooter(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # debug in rhyzome rooter
    rooter.rhyzome_rooter.debug = True
    #~ rooter = rootslibclass.rootDict()
    df = dataframe_result
    # avoid null roots

    #~ total = df.size
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)

        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # stems prints
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        #~ print(u"Test root",root_result.encode('utf8'), u"found root",root_result.encode('utf8'), root_result == root)
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
Ejemplo n.º 2
0
def test_matrix(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict() 
    rooter.debug = True 
    #test with tashaphyne
    df = dataframe_result
    total = df.size
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********"%word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s"%asl.get_starword()).encode('utf8'))        
        
        asl.segment(word)
        print(asl.get_segment_list()  )
        seg_list = asl.get_segment_list()  
        starstem_list =[]
        affixa_list = asl.get_affix_list()
        # stems prints 
        stems = [ d['stem'] for d in affixa_list]
        roots = []
        for stem in stems:
            temp_list = rooter.matrix_root(stem,u'توطيدا')
            tmp_roots = [d['root'] for d in temp_list]
            roots.extend(tmp_roots)
            #~ tmp_roots = [d['root'] for d in temp_list if rooter.is_root(d['root'])]
        print((u"Candidats " + u"\t".join(roots)).encode('utf8'))
        # lookup only one time by root in dictionary
        set_roots = [x for x in set(roots) if rooter.is_root(x)]
        # remove invalid roots and keep repetition
        roots = [x for x in roots if x in set_roots]
        root_result = most_common(roots)
        print((u"Accepted " + u"\t".join(roots)).encode('utf8'))
        print((u"root " + root_result).encode('utf8'))
        print((u" ".join([u"Test root", root, u"found root",
        root_result, str(root_result == root)])).encode('utf8'))
        if root_result == root:
            cpt += 1
    print("***** Percent %.2f%%"%(cpt*100/total))        
Ejemplo n.º 3
0
def test_rooter_matrix(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer() 
    rooter = rootslibclass.rootDict()       
    df = dataframe_result
    total = df.size
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********"%word).encode('utf8'))
        asl.light_stem(word)
        root_list = root.split(';')        
        print((u"Start Word : %s"%asl.get_starword()).encode('utf8'))        
        
        asl.segment(word)
        print(asl.get_segment_list()  )
        seg_list = asl.get_segment_list()  
        starstem_list =[]
        affixa_list = asl.get_affix_list()
        # stems prints 
        stems = [ d['stem'] for d in affixa_list]
        print("Stems: "+u' '.join(stems).encode('utf8'))        
        roots = [ d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s"%(asl.get_root(),u' '.join(roots))).encode('utf8'))        
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        #~ print(u"Test root",root_result.encode('utf8'), u"found root",root_result.encode('utf8'), root_result == root)
        #~ print((u" ".join([u"Test root", root, u"found root",
        #~ root_result, str(root_result == root)])).encode('utf8'))
        #~ if root_result == root:
            #~ cpt += 1
        print((u" ".join([u"Test root", root, u"found root",
        root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in  root_list:
            cpt += 1            
    #~ print("***** Percent %.2f%%"%(cpt*100/total)) 
    print("***** Percent %.2f%% [%d/%d]"%(cpt*100.0/total, cpt, total))