Example n. 1
def test2():
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    words = [
        (u'أفتضاربانني', u'ضرب'),
        (u'بأبأ', u'بءبء'),
        (u'يريدون', u'ريد'),
        (u'يستطعن', u'ريد'),
        (u'كتاب', u'كتب'),
        (u"بالميدان", u'ميد'),
        (u"بالأسيهم", u'سهم'),
        (u"آخرين", u'ءخر'),
        (u"بالأخرة", u'ءخر'),
    ]
    for word, root in words:
        print(u"**********%s*********" % word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        #~ root_result = choose_root(affixa_list, debug=True)
        root_result = choose_root(word, affixa_list, debug=True)
        print(root_result, root_result == root)
    return 0
def test2():

    #test with tashaphyne
    #~ rootslib.create_stamped_roots()
    #~ rootslib.create_virtual_roots()
    #~ print repr(rootslib.VIRTUAL_DICT).replace('],','],\n').decode('unicode-escape').encode('utf8')
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    asl_custom = abstractstemmer.customStemmer_roots()
    words = [
        (u'أفتضاربانني', u'ضرب'),
        (u'بأبأ', u'بءبء'),
        (u'يريدون', u'ريد'),
        (u'يستطعن', u'طوع'),
        (u'يستطيعون', u'طوع'),
        (u'الصيام', u'صوم'),
        (u'يخاف', u'خوف'),
        (u'كتاب', u'كتب'),
        (u"بالميدان", u'ميد'),
        (u"بالأسيهم", u'سهم'),
        (u"آخرين", u'ءخر'),
        (u"بالآخرة", u'ءخر'),
        (u"لارتاب", u'ريب'),
        (u"وسائل", u'وسل'),
        (u"وصائل", u'وصل'),
        (u"أخاه", u'ءخو'),
        (u"أخوه", u'ءخو'),
        (u"أخاهم", u'ءخو'),
        (u"أخانا", u'ءخو'),
        (u"بإذن", u'ءذن'),
        (u"للأبرار", u"برر"),
        (u'واتبعوا', u'تبع'),
        (u'والكاظمين', u'كظم'),
        (u'عد', u'عود'),
    ]
    # load root dictionary with features
    rootdict = rootslibclass.rootDict()
    for word, root in words:
        print(u"**********%s*********"%word).encode('utf8')
        word = re.sub(u"[%s]"%(araby.ALEF_MADDA), araby.HAMZA+araby.ALEF, word)

        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        print(repr(affixa_list).replace('},', '},\n').decode('unicode-escape').encode('utf8'))
        #~ root_result = rootslib.choose_root(affixa_list, debug=True)
        root_result = rootdict.choose_root(affixa_list, debug=True)
        #~ root_result2 = rootdict.choose_root(affixa_list, debug=True)
        #~ print root_result.encode('utf8'),root_result2.encode('utf8'), asl_custom.getroot(word).encode('utf8'), root_result == root, root_result == root_result2
        print(root_result.encode('utf8'), asl_custom.getroot(word).encode('utf8'), root_result == root)
        
    return 0
Example n. 3
def stemming(pos_tag):
    Ar_Listem = ArabicLightStemmer()
    adjective_tags = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS']
    stemmed_text = []

    for word in pos_tag:
        p = word[1].split('/')
        # NOTE: both branches currently apply the same light stemming, so the
        # POS-tag check has no effect on the output.
        if p[-1] in adjective_tags:
            stemmed_text.append(str(Ar_Listem.light_stem(p[0])))
        else:
            stemmed_text.append(str(Ar_Listem.light_stem(p[0])))
    # print("Text tokens after lemmatization of adjectives and nouns: \n")
    return stemmed_text
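If the intent, suggested by the commented print about "lemmatization of adjectives and nouns", is to stem only noun/adjective tokens and keep everything else unchanged, a minimal sketch could look like the following; the tag list and the (word, 'token/TAG') input format are assumptions carried over from the example above, not part of the original code.

from tashaphyne.stemming import ArabicLightStemmer

def stemming_tagged_only(pos_tag):
    # Sketch (hypothetical variant): stem only adjective/noun tokens,
    # leave other tokens untouched.
    stemmer = ArabicLightStemmer()
    adjective_tags = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS']
    stemmed_text = []
    for word in pos_tag:
        p = word[1].split('/')
        if p[-1] in adjective_tags:
            # light_stem() returns the light stem of the token
            stemmed_text.append(str(stemmer.light_stem(p[0])))
        else:
            # keep tokens with other tags as-is
            stemmed_text.append(p[0])
    return stemmed_text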
Example n. 4
def test_rooter(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # debug in rhyzome rooter
    rooter.rhyzome_rooter.debug = True
    #~ rooter = rootslibclass.rootDict()
    df = dataframe_result
    # avoid null roots

    #~ total = df.size
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)

        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # stems prints
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        #~ print(u"Test root",root_result.encode('utf8'), u"found root",root_result.encode('utf8'), root_result == root)
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
Example n. 5
def test_matrix(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict() 
    rooter.debug = True 
    #test with tashaphyne
    df = dataframe_result
    total = df.size
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********"%word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s"%asl.get_starword()).encode('utf8'))        
        
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # stems prints
        stems = [d['stem'] for d in affixa_list]
        roots = []
        for stem in stems:
            temp_list = rooter.matrix_root(stem,u'توطيدا')
            tmp_roots = [d['root'] for d in temp_list]
            roots.extend(tmp_roots)
            #~ tmp_roots = [d['root'] for d in temp_list if rooter.is_root(d['root'])]
        print((u"Candidats " + u"\t".join(roots)).encode('utf8'))
        # lookup only one time by root in dictionary
        set_roots = [x for x in set(roots) if rooter.is_root(x)]
        # remove invalid roots and keep repetition
        roots = [x for x in roots if x in set_roots]
        root_result = most_common(roots)
        print((u"Accepted " + u"\t".join(roots)).encode('utf8'))
        print((u"root " + root_result).encode('utf8'))
        print((u" ".join([u"Test root", root, u"found root",
        root_result, str(root_result == root)])).encode('utf8'))
        if root_result == root:
            cpt += 1
    print("***** Percent %.2f%%"%(cpt*100/total))        
    def get(self, text):
        ArListem = ArabicLightStemmer()
        list_Stemming = []

        tokens = nltk.word_tokenize(text)
        for word in tokens:
            stem = ArListem.light_stem(word)
            list_Stemming.append(ArListem.get_stem())
        return {"Stemming": list_Stemming}
    def Get_root_word(self, body):
        ArListem = ArabicLightStemmer()
        word = body.split(u" ")
        word_stem = list()
        for w in word:
            w_stem = ArListem.light_stem(w)
            word_stem.append(ArListem.get_root())
        body = " ".join(word_stem)
        return body
def one_string_Lemmatizing(sentence, language):
    '''
    Argument:
        String of words
    return:
        list of words with Lemmatizing
    '''
    sentence = one_string_tokenization(sentence)
    stemmer = ArabicLightStemmer()
    sentence = [stemmer.light_stem(word) for word in sentence]
    return sentence
Example n. 9
def test_rooter_matrix(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer() 
    rooter = rootslibclass.rootDict()       
    df = dataframe_result
    total = df.size
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********"%word).encode('utf8'))
        asl.light_stem(word)
        root_list = root.split(';')        
        print((u"Start Word : %s"%asl.get_starword()).encode('utf8'))        
        
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # stems prints
        stems = [d['stem'] for d in affixa_list]
        print((u"Stems: " + u' '.join(stems)).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] a %s" % (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        #~ print(u"Test root",root_result.encode('utf8'), u"found root",root_result.encode('utf8'), root_result == root)
        #~ print((u" ".join([u"Test root", root, u"found root",
        #~ root_result, str(root_result == root)])).encode('utf8'))
        #~ if root_result == root:
            #~ cpt += 1
        print((u" ".join([u"Test root", root, u"found root",
        root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1            
    #~ print("***** Percent %.2f%%"%(cpt*100/total)) 
    print("***** Percent %.2f%% [%d/%d]"%(cpt*100.0/total, cpt, total))
Example n. 10
def prediction(clean_post,
               model_file='/19_classes_7869.h5',
               w2idx_dict_file='/1367_roots_w2idx.npy',
               max_rec=3):
    labels = [
        'تبرع بالدم', 'توظيف', 'دعوات', 'خدمات ولاد العم', 'احتياجات طبية',
        'أدوية', 'مفقودات أشخاص وأشياء', 'ملابس', 'الرفق بالحيوان',
        'قصص ولاد العم', 'استفسارات عن أي موضوع', 'استشارات طبية', 'أعطال طرق',
        'طلبات مساعدة', 'احتياجات منزلية', 'مساعدة كبار السن',
        'مساعدات تعليمية', 'توصيل', 'كتب'
    ]
    root_w2idx = np.load(os.path.dirname(os.path.realpath(__file__)) +
                         w2idx_dict_file,
                         allow_pickle=True).item()
    vocab_size = len(root_w2idx)
    model = load_model(
        os.path.dirname(os.path.realpath(__file__)) + model_file)
    features = np.zeros((1, vocab_size))
    ArListem = ArabicLightStemmer()
    for word in clean_post.split():
        root_flag = 0
        ArListem.light_stem(word)
        roots = [dic['root'] for dic in ArListem.get_affix_list()]
        for root in roots:
            if (root in root_w2idx.keys()
                    and features[0, root_w2idx[root]] < max_rec):
                features[0, root_w2idx[root]] += 1
                root_flag = 1
                break
        if (not root_flag and features[0, root_w2idx['<unk>']] < max_rec):
            features[0, root_w2idx['<unk>']] += 1

    prediction = model.predict(features)[0].argsort()[-2:][::-1]

    if (prediction[0] == 3 and prediction[1] != 8):
        prediction = [labels[i] for i in prediction]
    else:
        prediction = labels[prediction[0]]

    return prediction
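For context, here is a hedged sketch of how a root-to-index dictionary such as the one loaded above might be built and saved; the corpus argument, the output file name, and the '<unk>' entry are assumptions inferred from how the dictionary is used in the function, not taken from the original project.

import numpy as np
from tashaphyne.stemming import ArabicLightStemmer

def build_root_w2idx(posts, out_file='roots_w2idx.npy'):
    # Sketch (assumption): collect candidate roots from a corpus of cleaned
    # posts and give each one a feature index, plus an '<unk>' slot.
    stemmer = ArabicLightStemmer()
    roots = set()
    for post in posts:
        for word in post.split():
            stemmer.light_stem(word)
            roots.update(d['root'] for d in stemmer.get_affix_list())
    root_w2idx = {root: i for i, root in enumerate(sorted(roots))}
    root_w2idx['<unk>'] = len(root_w2idx)
    # np.save pickles the dict; load it back with allow_pickle=True and .item()
    np.save(out_file, root_w2idx)
    return root_w2idx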
Example n. 11
    def search_engine(search_id):  
        print("Input query: ", search_id)
        hasilQuery = preprocessing_query(search_id)
        print("Preprocessing query: ", hasilQuery[-1])
        query = query_translation(hasilQuery)
        print("Query translation: ", query)
        
        ArListem = ArabicLightStemmer()
        stem = ArListem.light_stem(query) 
        hasil = ArListem.get_root()
        print("Stem: ", hasil)

        exquery = request.POST.get('exquery', None)
        print(exquery)
        
        # Query Expansion
        if(exquery=='Iya'):
            print("Pakai Ekspansi Query")
            # pass
            token = wordpunct_tokenize(hasil)
            query = []
            for word in token:
                pq = PredictorConfig.modelFT.wv.most_similar(word)
                print(pq)
                words = []
                for i in range(4):
                    words.append(pq[i][0])
                words.append(word)
                print(words)

                query.append(' '.join(words))
                queries = []
                queries.append(' '.join(query))
                print("Query Expansion: ", queries)
                hasil = queries[0]

        query_vec = PredictorConfig.tfidf_vectorizer.transform([hasil])
        
        print(query_vec)

        results = cosine_similarity(PredictorConfig.tfidf_matrix,query_vec).reshape((-1,))

        list_object = []
        list_id = results.argsort()[-10:][::-1]
        list_id = [x+1 for x in list_id]
        for x in list_id:
            list_object.append(Kitabs.objects.filter(id=x))
        
        return list_object
def one_string_Lemmatizing(sentence, language):
    '''
    Argument:
        String of words
    return:
        list of words with Lemmatizing
    '''
    sentence = one_string_tokenization(sentence)
    if language == 'English':
        lemmatizer = WordNetLemmatizer()
        sentence = [lemmatizer.lemmatize(word) for word in sentence]
    elif language == 'Arabic':
        stemmer = ArabicLightStemmer()
        sentence = [stemmer.light_stem(word) for word in sentence]
    return sentence
def stem(string):

    # split given string into words
    words = string.split()
    stems_list = []

    arabic_light_stemmer = ArabicLightStemmer()

    for word in words:

        # stem word
        stem_word = arabic_light_stemmer.light_stem(word)
        # add new stem to dict
        stems_list.append(stem_word)

    return stems_list
Example n. 14
    def getStemmedText(self, text):
        stemmedText = []
        if self.lang == 1:
            stemmer = nltk.stem.snowball.FrenchStemmer()
            stemmedText = [
                stemmer.stem(word) for word in text if word.isalpha()
            ]
        else:
            from tashaphyne.stemming import ArabicLightStemmer
            ArListem = ArabicLightStemmer()
            for word in text:
                if word.isalpha():
                    stem = ArListem.light_stem(word)
                    root = ArListem.get_root()
                    stemmedText.append(root)
        return stemmedText
Example n. 15
def stemming_process(word):
    # Initialize Arabic stemmer
    arepr = pyarabic.arabrepr.ArabicRepr()
    repr = arepr.repr
    ArListem = ArabicLightStemmer()

    if word in stem_not:
        wordRoot = word
    elif len(word) <= 3:
        wordRoot = word
    else:
        # Stemming word
        stem = ArListem.light_stem(word)
        # Extract root
        wordRoot = ArListem.get_root()
    return wordRoot
Example n. 16
    def Lemmatisation(self):
        tagger = naftawayh.wordtag.WordTagger()
        ws = self.Pretraitement()
        ArListem = ArabicLightStemmer()
        words_root = []
        words_all = {}
        words_all['words'] = []
        for w in ws:
            #if not tagger.is_noun(w):
            stem = ArListem.light_stem(w)
            ww = ArListem.get_prefix() + " + " + ArListem.get_stem() + " + " + ArListem.get_suffix()
            words_all['words'].append(ww)
            words_root.append(ArListem.get_stem())

        self.aff(words_all)

        result = json.dumps(words_all, ensure_ascii=False,
                            indent=4).encode('utf-8')
        return words_root
Example n. 17
def _stem_light(word):

    from tashaphyne.stemming import ArabicLightStemmer

    stemmer = ArabicLightStemmer()
    return stemmer.light_stem(word)
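A minimal usage sketch (not part of the original snippet) showing the wrapper above together with the accessors used elsewhere on this page; it only assumes that tashaphyne is installed:

# Hypothetical usage of _stem_light and of the get_stem()/get_root() accessors
from tashaphyne.stemming import ArabicLightStemmer

word = u'يستطيعون'
print(_stem_light(word))       # value returned by light_stem()

stemmer = ArabicLightStemmer()
stemmer.light_stem(word)
print(stemmer.get_stem())      # stem after stripping prefixes/suffixes
print(stemmer.get_root())      # extracted root candidate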
Example n. 18
#tag words
for l in corps:
    ps=nlp.pos_tag(l)
    if ps[0][0]==u'\ufeff': #ZERO WIDTH NO-BREAK SPACE
        ps=ps[1:]
    dp=nlp.dependency_parse(l)
    dp2=[]
    if len(dp)==len(ps):
        i = dp[0][2]
        for ind,w in enumerate(dp):
            if ind+1==i:
                dp2.append(w)
                dp2.append(("NONE",i,i))
            else:
                dp2.append(w)
    else:
        dp2=dp
    dp2 = dp2[1:]
    
    for ind,w in enumerate(ps) :
        stem = ArListem.light_stem(w[0])
        pre = ArListem.get_prefix()
        suf = ArListem.get_suffix()
        ls.append(w[0]+"|"+w[1]+"|"+dp2[ind][0]+"|"+str(dp2[ind][1]-1)+"|"+func([w[0],w[1]],classifier)+"p="+pre+"|s="+suf+"\n")
    ls.append(". PUNC\n")

corpw.writelines(ls)

corp.close()
corpw.close()
Example n. 19
'''
Created on 15 June 2019

@author: KHALID-RAMI
'''
# coding=utf8
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
ArListem = ArabicLightStemmer()
word = u'قال'
stem = ArListem.light_stem(word)
print(ArListem.get_stem())
print(ArListem.get_root())
print(ArListem.get_left())
print(ArListem.get_prefix(2))
print(ArListem.get_right())
print(ArListem.get_unvocalized())
Example n. 20
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())

#print(data1[:10])
#tashaphyne
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        stem = ArListem.light_stem(a)
        #tweet = tweet + ArListem.get_stem()+ " "
        tweet = tweet + ArListem.get_root() + " "
    data2.append(tweet.strip())
#print(data2[:10])

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['tweet'] = texts
trainDF['class'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)

# create a count vectorizer object
    def segmenteur_phrases(self):
        tagger = naftawayh.wordtag.WordTagger()
        ArListem = ArabicLightStemmer()

        stop_words1 = [
            u"كما", u"أيضا", u"كذالك", u"مثلا", u"وكما", u"شبيه", u"نضير",
            u"ماعدا", u"باستثناء", u"إلا", u"بسبب", u"لأن", u"لكي",
            u"والنتيجة", u"والخلاصة", u"أولا", u"ثانيا", u"يليه", u"لذالك",
            u"إذا", u"نستنتج", u"أم", u"أي", u"فقد", u"لكن", u"بينما", u"فإذا",
            u"إذا", u"حيث", u"بسبب", u"لذالك", u"لما", u"حينما", u"وذلك",
            u"حيث"
        ]
        stop_words2 = [[u"بالإضافة", u"إلى"], [u"ومن",
                                               u"ذالك"], [u"من", u"هنا"],
                       [u"ونخلص", u"إلى"], [u"وفي", u"البداية"],
                       [u"إلى", u"جانب"], [u"علاوة", u"على"], [u"غير", u"أنه"]]

        # function: return element x from each pair in the stop_words2 list
        def prem_ele(u, x):
            h = []
            for d in u:
                h.append(d[x])
            return h

        # strip the punctuation mark (Arabic comma) and return the bare token
        def ele_sign(s):
            if re.split(u'،', s):
                lt = re.split(u'،', s)
                if len(lt) > 0:
                    for u in lt:
                        if u != '':
                            return u

        liste1 = [
            ch
            for ch in re.split(r"[.!؟:()\[\]\n]+", unicode(self.text, "utf-8"))
            if ch != ''
        ]

        liste3 = []

        i = 0
        while i < len(liste1):
            liste2 = [ch for ch in re.split(r"[ ]+", liste1[i]) if ch != '']

            k = 0
            s = ''
            while k < len(liste2):
                if ele_sign(liste2[k]) == u'و':
                    stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                    if tagger.is_verb(stem) == True and tagger.is_noun(
                            stem) == False:
                        if s != '':
                            liste3.append(s)
                            s = ''
                    else:
                        s += liste2[k]
                        s += ' '
                elif ele_sign(liste2[k]) in stop_words1:
                    liste3.append(s)
                    s = ''
                elif ele_sign(liste2[k]) == u'ثم':
                    stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                    if tagger.is_verb(stem) == True and tagger.is_noun(
                            stem) == False:
                        if s != '':
                            liste3.append(s)
                            s = ''
                        else:
                            s += liste2[k]
                            s += ' '
                elif ele_sign(liste2[k][0]) == u'ف':
                    stem = ArListem.light_stem(ele_sign(liste2[k][1::]))
                    if tagger.is_verb(
                            ArListem.get_stem()) == True and tagger.is_noun(
                                ArListem.get_stem()) == False:
                        liste3.append(s)
                        s = ''
                    else:
                        s += liste2[k]
                        s += ' '
                elif ele_sign(liste2[k]) in prem_ele(stop_words2, 0):
                    if ele_sign(liste2[k + 1]) in prem_ele(stop_words2, 1):
                        liste3.append(s)
                        s = ''
                        k += 1
                    else:
                        s += liste2[k]
                        s += ' '
                else:
                    s += liste2[k]
                    s += ' '
                k += 1
            if len(s) != 0:
                liste3.append(s)
                s = ''
            i += 1

        liste3 = [ch for ch in liste3 if ch != '']

        with io.open('output.txt', 'a', encoding="utf-8") as file:
            file.write(
                unicode("\n\n" + "il y a " + str(len(liste3)) + " phrases\n",
                        "utf-8"))
            file.write(unicode("la liste des phrases : \n\n ", "utf-8"))
            file.write(unicode(" [ "))
            for ch in liste3:
                file.write(" ' " + ch + " ' \n\n")
            file.write(unicode(" ] "))
Example n. 22
def test3():
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    rooter = rootDict()
    words = [
        (u'أفتضاربانني', u'ضرب'),
        #~ (u'بأبأ',u'بءبء'),
        #~ (u'يسعى',u'سعى'),
        #~ (u'يريدون',u'ريد'),
        #~ (u'يستطعن', u'ريد'),
        #~ (u'كتاب',u'كتب'),
        #~ (u"بالميدان",u'ميد'),
        #~ (u"بالأسيهم",u'سهم'),
        #~ (u"آخرين",u'ءخر'),
        #~ (u"بالأخرة",u'ءخر'),
        #~ ('ويرمي',u'رمي'),
        #~ (u'ويرمي',u'رمي'),
        #~ (u'يرمون',u'رمي'),
        #~ (u'راميات',u'رمي'),
        #~ (u'وترمون',u'رمي'),
        #~ (u'ويرمين',u'رمي'),
        #~ (u'وترميان',u'رمي'),
        #~ (u'ورامون',u'رمي'),
        #~ (u'وليرميان',u'رمي'),
        #~ (u'لترميان',u'رمي'),
        #~ (u'لترمين',u'رمي'),
        #~ (u'رامي',u'رمي'),
        #~ (u'ورامي',u'رمي'),
        #~ (u'رماية',u'رمي'),
        #~ (u'رمايه',u'رمي'),
        #~ (u'الراميات',u'رمي'),
        #~ (u'المرميات',u'رمي'),
        #~ (u'المتراميات',u'رمي'),
        #~ (u'مترامية',u'رمي'),
        #~ (u'مترامي',u'رمي'),
        #~ (u'الرامون',u'رمي'),
        #~ (u'والراميات',u'رمي'),
        #~ (u'وسيقولون',u'قول'),
        #~ (u'وسيقال',u'قول'),
        #~ (u'وسيقيلوهم',u'قول'),
        #~ (u'وتقال',u'قول'),
        #~ (u'وتقولوا',u'قول'),
        #~ (u'وتقول',u'قول'),
        #~ (u'ومقاول',u'قول'),
        #~ (u'وقالوا',u'قول'),
        #~ (u'ومقال',u'قول'),
        (u'وتقل', u'قول'),
        (u'وتقلن', u'قول'),
        (u'وليقل', u'قول'),
        (u'ولتقلنا', u'قول'),
        (u'لتقل', u'قول'),
        (u'تقل', u'قول'),
        (u'ونقل', u'قول'),
        (u'ولنقل', u'قول'),
        (u'فتقل', u'قول'),
        (u'ستقل', u'قول'),
        (u'ستقلن', u'قول'),
        (u'وستقلن', u'قول'),
        (u'فستقل', u'قول'),
        (u'وقالوا', u'قول'),
        (u'قالوا', u'قول'),
        (u'وقالا', u'قول'),
        (u'قالا', u'قول'),
        (u'وقالت', u'قول'),
        (u'قالت', u'قول'),
        (u'ويقال', u'قول'),
        (u'يقال', u'قول'),
        (u'وسيقال', u'قول'),
        (u'سيقال', u'قول'),
        (u'ويقلن', u'قول'),
        (u'يقلن', u'قول'),
        (u'ويقلنا', u'قول'),
        (u'يقلنا', u'قول'),
        (u'وتقال', u'قول'),
        (u'تقال', u'قول'),
        (u'وقال', u'قول'),
        (u'قال', u'قول'),
        (u'وسأقول', u'قول'),
        (u'سأقول', u'قول'),
        (u'وقائل', u'قول'),
        (u'قائل', u'قول'),
        (u'وقائلان', u'قول'),
        (u'قائلان', u'قول'),
        (u'وقائلون', u'قول'),
        (u'قائلون', u'قول'),
        (u'وقائلا', u'قول'),
        (u'قائلا', u'قول'),
        (u'ومقال', u'قول'),
        (u'مقال', u'قول'),
        (u'وقائلتان', u'قول'),
        (u'قائلتان', u'قول'),
        (u'يعد', u'وعد'),
        (u'تعد', u'عدد'),
        (u'نعدهم', u'عدد'),
        (u'وتعدهم', u'وعد'),
        (u'تعدهم', u'وعد'),
        (u'وستعدهم', u'وعد'),
        (u'ستعدهم', u'وعد'),
        (u'وتعدهما', u'وعد'),
        (u'تعدهما', u'وعد'),
        (u'ويعدهم', u'وعد'),
        (u'يعدهم', u'وعد'),
        (u'ويعدهما', u'وعد'),
        (u'يعدهما', u'وعد'),
        (u'وسيعدهم', u'وعد'),
        (u'سيعدهم', u'وعد'),
        (u'وسيعدهما', u'وعد'),
        (u'سيعدهما', u'وعد'),
        (u'ولنعدهم', u'وعد'),
        (u'لنعدهم', u'وعد'),
        (u'ولنعدهما', u'وعد'),
        (u'لنعدهما', u'وعد'),
        (u'ولتعدهم', u'وعد'),
        (u'لتعدهم', u'وعد'),
        (u'ولتعدهما', u'وعد'),
        (u'لتعدهما', u'وعد'),
        (u'ولتعدها', u'وعد'),
        (u'لتعدها', u'وعد'),
        (u'وستعدها', u'وعد'),
        (u'ستعدها', u'وعد'),
        (u'ووعدها', u'وعد'),
        (u'وعدها', u'وعد'),
        (u'ووعدهم', u'وعد'),
        (u'وعدهم', u'وعد'),
        (u'ووعدهما', u'وعد'),
        (u'وعدهما', u'وعد'),
        (u'وتعد', u'وعد'),
        (u'تعد', u'وعد'),
        (u'وتعدني', u'وعد'),
        (u'تعدني', u'وعد'),
        (u'وتعدنا', u'وعد'),
        (u'تعدنا', u'وعد'),
        (u'وتعده', u'وعد'),
        (u'تعده', u'وعد'),
        (u'وواعدناهم', u'وعد'),
        (u'واعدناهم', u'وعد'),
        (u'ووعدناهم', u'وعد'),
        (u'وعدناهم', u'وعد'),
        (u'وتعدوهم', u'وعد'),
        (u'تعدوهم', u'وعد'),
        (u'يعتاد', u'عود'),
        (u'أحست', u'حسس'),
        (u'يحسون', u'حسس'),
        (u'ثقة', u'وثق'),
        (u'ثقات', u'وثق'),
        (u'بثقات', u'وثق'),
        (u'صفات', u'وصف'),
        (u'صلاته', u'وصل'),
    ]
    for word, root in words:
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixa_list]
        print(u' '.join(stems).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        print("Test root", root_result.encode('utf8'), "found root",
              root_result.encode('utf8'), root_result == root)
    # test root_extension
    roots = [
        u"قل",
        u"دع",
    ]
    for rt in roots:
        extended = rooter.extend_root(rt)
        print(u"\t".join([rt, u";".join(extended)]).encode('utf8'))
    print('stamped roots', len(rooter.STAMP_DICT))
    print('stamped roots diff new',
          len(diff(rooter.STAMP_DICT, roots_const.ROOTS)))
    print('stamped roots removed',
          len(diff(roots_const.ROOTS, rooter.STAMP_DICT)))
    print('stamped roots max length',
          max((len(v), k, v) for k, v in rooter.STAMP_DICT.items()))
    print('virtual roots', len(rooter.VIRTUAL_DICT))
    print('virtual roots diff',
          len(diff(rooter.VIRTUAL_DICT, roots_const.ROOTS)))
    print('virtual roots removed ',
          len(diff(roots_const.ROOTS, rooter.VIRTUAL_DICT)))
    print('virtual roots max length',
          max((len(v), k, v) for k, v in rooter.VIRTUAL_DICT.items()))

    print('all roots', len(roots_const.ROOTS))

    return 0
Example n. 23
from tinydb import TinyDB, where
from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()
db = TinyDB('/json.json')
while True:
    x = input('Input to search or "q" to quit:\n>>> ')
    if x == 'q':
        break
    ArListem.light_stem(x)
    x = ArListem.get_root()
    data = db.search(where('name').matches('.*%s.*' % x))
    for line in data:
        print(line['name'] + ': ', end='')
        print(line['value'])
        print()
    if not data:
        print('Not found result')
Example n. 24
def test1(args):
    word = u"لعلهم"
    print(is_root(word))
    word = u"علم"
    print(is_root(word))

    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    words = [
        u'أفتضاربانني',
        u'بأبأ',
        u'يريدون',
        u'يستطعن',
        u'كتاب',
        u"بالميدان",
        u"بالأسيهم",
    ]
    ext = extend_root(u"رم")
    print("extende")
    print(repr(ext).decode('unicode-escape').encode('utf8'))

    for word in words:
        print(u"**********%s*********" % word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        for seg in seg_list:
            left, right = seg
            starstem_list.append(asl.get_starstem(left, right))
        print("star stems")

        print(u"\t".join(starstem_list)).encode('utf8')
        filtered_starstem_list = filter(valid_starstem, starstem_list)
        print("filtred star stem")
        print(u"\t".join(filtered_starstem_list)).encode('utf8')
        for st in starstem_list:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        affixation_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixation_list]
        print("Candidats stems%s" % u'\t'.join(stems))
        for st in stems:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))

        print(
            repr(affixation_list).replace(
                '},', '},\n').decode('unicode-escape').encode('utf8'))
        print("reduce")
        #~ affixation_list = filter(verify_affix, affixation_list)
        print(
            repr(affixation_list).replace(
                '},', '},\n').decode('unicode-escape').encode('utf8'))

        roots = [normalize_root(d['root']) for d in affixation_list]
        print("Candidats %s" % u'\t'.join(roots))
        # get uniq root
        accepted = set(filter(is_root, roots))
        print("accepted %s" % u'\t'.join(accepted))
        if not accepted:
            # try to extend roots

            extended_roots = []
            for x in roots:
                extended_roots.extend(extend_root(x))
            print("Candidats extended %s" % u'\t'.join(extended_roots))
            accepted = set(filter(is_root, extended_roots))
            print("accepted level2 %s" % u'\t'.join(accepted))
        print('root %s' % asl.get_root())
    #~ print repr(STAMP_DICT).replace('},','},\n').decode('unicode-escape').encode('utf8')
    return 0