def __init__(self, ):
     """Build a stemmer restricted to a custom infix-letter set and
     backed by a root dictionary."""
     abstractStemmer.__init__(self)
     # restrict infix detection to this custom letter set
     self.set_infix_letters(u"توطيدا")
     # switch on root-dictionary based root selection
     self.config["root_dict"] = "yes"
     self.rootdict = rootslibclass.rootDict()
 def __init__(self, ):
     """Stemmer using custom infix letters and the manually reviewed
     affix inventories, backed by a root dictionary."""
     abstractStemmer.__init__(self)
     # restrict infix detection to this custom letter set
     self.set_infix_letters(u"توطيدا")
     # replace the default affix lists with the reviewed ones
     self.set_prefix_list(REVIEWED_PREFIX_LIST)
     self.set_suffix_list(REVIEWED_SUFFIX_LIST)
     # switch on root-dictionary based root selection
     self.config["root_dict"] = "yes"
     self.rootdict = rootslibclass.rootDict()
def test2():
    """Regression check for root extraction (Python 2 style).

    Stems a fixed list of Arabic words with Tashaphyne, asks rootDict to
    choose a root from the affix decompositions, and prints the chosen
    root next to the expected one.  Always returns 0; results are only
    printed, not asserted.
    """
    #test with tashaphyne
    #~ rootslib.create_stamped_roots()
    #~ rootslib.create_virtual_roots()
    #~ print repr(rootslib.VIRTUAL_DICT).replace('],','],\n').decode('unicode-escape').encode('utf8')
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    asl_custom = abstractstemmer.customStemmer_roots()
    # (word, expected root) pairs used as the test fixture
    words = [(u'أفتضاربانني',u'ضرب'),
    (u'بأبأ',u'بءبء'),
    (u'يريدون',u'ريد'),
    (u'يستطعن', u'طوع'),
    (u'يستطيعون', u'طوع'),
    (u'الصيام', u'صوم'),
    (u'يخاف', u'خوف'),
    (u'كتاب',u'كتب'),
    (u"بالميدان",u'ميد'),
    (u"بالأسيهم",u'سهم'),
    (u"آخرين",u'ءخر'),
    (u"بالآخرة",u'ءخر'),
    (u"لارتاب",u'ريب'),
    (u"وسائل",u'وسل'),
    (u"وصائل",u'وصل'),
    (u"أخاه",u'ءخو'),
    (u"أخوه",u'ءخو'),
    (u"أخاهم",u'ءخو'),
    (u"أخانا",u'ءخو'),
    (u"بإذن",u'ءذن'),
    (u"للأبرار",u"برر"),
    (u'واتبعوا', u'تبع'),
    (u'والكاظمين', u'كظم'),
    (u'عد', u'عود'),




    ]
    # load root dictionary with features
    rootdict = rootslibclass.rootDict()
    for word, root in words:
        print(u"**********%s*********"%word).encode('utf8')
        # normalize ALEF MADDA into HAMZA + ALEF before stemming
        word = re.sub(u"[%s]"%(araby.ALEF_MADDA), araby.HAMZA+araby.ALEF, word)

        asl.light_stem(word)
        asl.segment(word)
        print asl.get_segment_list()  
        seg_list = asl.get_segment_list()  
        starstem_list =[]
        affixa_list = asl.get_affix_list()
        # dump the affix decompositions, one dict per line, as UTF-8
        print repr(affixa_list).replace('},','},\n').decode('unicode-escape').encode('utf8')
        #~ root_result = rootslib.choose_root(affixa_list, debug=True)
        root_result = rootdict.choose_root(affixa_list, debug=True)
        #~ root_result2 = rootdict.choose_root(affixa_list, debug=True)
        #~ print root_result.encode('utf8'),root_result2.encode('utf8'), asl_custom.getroot(word).encode('utf8'), root_result == root, root_result == root_result2
        # chosen root, custom stemmer's root, and whether the choice matches
        print root_result.encode('utf8'), asl_custom.getroot(word).encode('utf8'), root_result == root
        
    return 0
Ejemplo n.º 4
0
def test_rooter(dataframe_result):
    """Evaluate the rhyzome-based rooter against a labelled dataframe.

    For every (word, root) pair in *dataframe_result* (columns "word"
    and "root", where "root" may hold several ';'-separated acceptable
    answers), stem the word with Tashaphyne, let the rooter choose a
    root from the affix decompositions, and count a hit when the choice
    is among the expected alternatives.  Prints per-word diagnostics and
    a final accuracy percentage.  (Python 2: unicode text is encoded to
    UTF-8 before printing.)
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # debug in rhyzome rooter
    rooter.rhyzome_rooter.debug = True
    df = dataframe_result
    # count rows, not cells: df.size would be rows*columns
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        # several acceptable roots may be listed, separated by ';'
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        # normalize ALEF MADDA into HAMZA + ALEF before segmentation
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)

        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        # show the candidate stems and Tashaphyne's default roots
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
Ejemplo n.º 5
0
def test_matrix(dataframe_result):
    """Evaluate matrix-based root extraction against a labelled dataframe.

    For each (word, root) row in *dataframe_result* (columns "word" and
    "root"), stem the word, generate candidate roots from every stem via
    ``rooter.matrix_root``, keep only candidates that are valid roots
    (preserving repetitions), and take the most frequent candidate as
    the answer.  Prints per-word diagnostics and a final accuracy
    percentage.
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    rooter.debug = True
    df = dataframe_result
    # count rows, not cells: df.size is rows*columns and deflates the score
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        # collect every root the matrix method proposes for each stem
        stems = [d['stem'] for d in affixa_list]
        roots = []
        for stem in stems:
            temp_list = rooter.matrix_root(stem, u'توطيدا')
            roots.extend([d['root'] for d in temp_list])
        print((u"Candidats " + u"\t".join(roots)).encode('utf8'))
        # lookup only one time by root in dictionary
        set_roots = [x for x in set(roots) if rooter.is_root(x)]
        # remove invalid roots and keep repetition
        roots = [x for x in roots if x in set_roots]
        root_result = most_common(roots)
        print((u"Accepted " + u"\t".join(roots)).encode('utf8'))
        print((u"root " + root_result).encode('utf8'))
        print((u" ".join([u"Test root", root, u"found root",
        root_result, str(root_result == root)])).encode('utf8'))
        if root_result == root:
            cpt += 1
    # 100.0 avoids Python 2 integer division truncating the percentage
    print("***** Percent %.2f%%" % (cpt * 100.0 / total))
Ejemplo n.º 6
0
    def __init__(self, debug=False):
        """Set up the three Tashaphyne stemmers (clitics, conjugation,
        root extraction), syntax options and the internal caches.

        :param debug: enable verbose debugging output.
        """
        def _affix_stemmer(prefixes, suffixes):
            # build an ArabicLightStemmer configured with the given affix lists
            stemmer = tashaphyne.stemming.ArabicLightStemmer()
            stemmer.set_prefix_list(prefixes)
            stemmer.set_suffix_list(suffixes)
            return stemmer

        # stemmer for enclitics and proclitics
        self.comp_stemmer = _affix_stemmer(SVC.COMP_PREFIX_LIST,
                                           SVC.COMP_SUFFIX_LIST)
        # stemmer for conjugated verbs
        self.conj_stemmer = _affix_stemmer(SVC.CONJ_PREFIX_LIST,
                                           SVC.CONJ_SUFFIX_LIST)
        # stemmer for extracting roots from stems
        self.root_stemmer = _affix_stemmer(SVC.ROOT_PREFIX_LIST,
                                           SVC.ROOT_SUFFIX_LIST)

        # enable the last mark (Harakat Al-I3rab)
        self.allow_syntax_lastmark = True

        # memoized affix-compatibility results
        self.compatibility_cache = {}

        self.debug = debug
        self.cache_verb = {'verb': {}}

        self.verb_stamp_pat = SVC.VERB_STAMP_PAT

        self.rooter = rootslibclass.rootDict()
Ejemplo n.º 7
0
def test_rooter_matrix(dataframe_result):
    """Evaluate ``choose_root_matrix`` against a labelled dataframe.

    For each (word, root) row in *dataframe_result* — "root" may list
    several acceptable answers separated by ';' — stem the word, let the
    rooter pick a root with the matrix algorithm, and count a hit when
    the choice is among the expected alternatives.  Prints per-word
    diagnostics and a final accuracy percentage.
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    df = dataframe_result
    # count rows, not cells: df.size is rows*columns and deflates the score
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        # several acceptable roots may be listed, separated by ';'
        root_list = root.split(';')
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        # show the candidate stems and Tashaphyne's default roots
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        print((u" ".join([u"Test root", root, u"found root",
                          root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
    def __init__(self, ):
        """Initialize the stemmer with a root dictionary, a word tagger,
        and dedicated noun/verb sub-stemmers."""
        abstractStemmer.__init__(self)
        # switch on root-dictionary based root selection
        self.config["root_dict"] = "yes"
        self.rootdict = rootslibclass.rootDict()

        # part-of-speech tagger used to route words to the right stemmer
        self.tagger = naftawayh.wordtag.WordTagger()

        # noun stemmer configured with noun-specific affix lists
        self.noun_stemmer = abstractStemmer()
        self.noun_stemmer.set_prefix_list(NOUN_PREFIX_LIST)
        self.noun_stemmer.set_suffix_list(NOUN_SUFFIX_LIST)

        # verb stemmer configured with verb-specific affix lists
        self.verb_stemmer = abstractStemmer()
        self.verb_stemmer.set_prefix_list(VERB_PREFIX_LIST)
        self.verb_stemmer.set_suffix_list(VERB_SUFFIX_LIST)
 def __init__(self, ):
     """Initialize the Snowball Arabic stemmer and attach a root dictionary."""
     snowballstemmer.arabic_stemmer.ArabicStemmer.__init__(self, )
     self.rootdict = rootslibclass.rootDict()
    def __init__(self, ):
        """Initialize the ISRI stemmer and attach a root dictionary."""
        ISRIStemmer.__init__(self)
        self.rootdict = rootslibclass.rootDict()
 def __init__(self, ):
     """Initialize the base stemmer and attach a root dictionary."""
     abstractStemmer.__init__(self, )
     self.rootdict = rootslibclass.rootDict()
 def __init__(self, ):
     """Stemmer restricted to a custom infix-letter set with
     root-dictionary lookup enabled."""
     abstractStemmer.__init__(self)
     # restrict infix detection to this custom letter set
     self.set_infix_letters(u"توطيدا")
     # switch on root-dictionary based root selection
     self.config["root_dict"] = "yes"
     self.rootdict = rootslibclass.rootDict()
Ejemplo n.º 13
0
def test2():
    """Dump the stamp table: print every stamp key followed by the
    comma-separated roots that share it, encoded as UTF-8."""
    rooter = rootslibclass.rootDict()
    for stamp, stamp_roots in rooter.STAMP_DICT.items():
        print((u"%s\t%s" % (stamp, u", ".join(stamp_roots))).encode('utf8'))