# requires: import nltk; from tashaphyne.stemming import ArabicLightStemmer
def get(self, text):
    """Tokenize the text and return the light stem of every token."""
    ArListem = ArabicLightStemmer()
    list_Stemming = []

    tokens = nltk.word_tokenize(text)
    for word in tokens:
        # light_stem() analyzes the word; get_stem() retrieves the stem.
        ArListem.light_stem(word)
        list_Stemming.append(ArListem.get_stem())
    return {"Stemming": list_Stemming}
Example #2
    # requires: import json; import naftawayh.wordtag
    # and: from tashaphyne.stemming import ArabicLightStemmer
    def Lemmatisation(self):
        tagger = naftawayh.wordtag.WordTagger()
        ws = self.Pretraitement()
        ArListem = ArabicLightStemmer()
        words_root = []
        words_all = {'words': []}
        for w in ws:
            # if not tagger.is_noun(w):
            ArListem.light_stem(w)
            # Keep the full segmentation for display and collect the bare
            # stem as the "root".
            ww = (ArListem.get_prefix() + u" + " + ArListem.get_stem()
                  + u" + " + ArListem.get_suffix())
            words_all['words'].append(ww)
            words_root.append(ArListem.get_stem())

        self.aff(words_all)

        # Serialized form, currently unused:
        # json.dumps(words_all, ensure_ascii=False, indent=4).encode('utf-8')
        return words_root
    def Light_Stem_word(self, body):
        ArListem = ArabicLightStemmer()
        words = body.split(u" ")
        word_stem = []
        for w in words:
            ArListem.light_stem(w)
            word_stem.append(ArListem.get_stem())
            # ArListem.get_root() would extract the root instead.
        return u" ".join(word_stem)
    def segmenteur_phrases(self):
        tagger = naftawayh.wordtag.WordTagger()
        ArListem = ArabicLightStemmer()

        stop_words1 = [
            u"كما", u"أيضا", u"كذالك", u"مثلا", u"وكما", u"شبيه", u"نضير",
            u"ماعدا", u"باستثناء", u"إلا", u"بسبب", u"لأن", u"لكي",
            u"والنتيجة", u"والخلاصة", u"أولا", u"ثانيا", u"يليه", u"لذالك",
            u"إذا", u"نستنتج", u"أم", u"أي", u"فقد", u"لكن", u"بينما", u"فإذا",
            u"إذا", u"حيث", u"بسبب", u"لذالك", u"لما", u"حينما", u"وذلك",
            u"حيث"
        ]
        stop_words2 = [[u"بالإضافة", u"إلى"], [u"ومن",
                                               u"ذالك"], [u"من", u"هنا"],
                       [u"ونخلص", u"إلى"], [u"وفي", u"البداية"],
                       [u"إلى", u"جانب"], [u"علاوة", u"على"], [u"غير", u"أنه"]]

        # Return column x (the x-th word) of every pair in stop_words2.
        def prem_ele(u, x):
            return [d[x] for d in u]

        # Strip the Arabic comma from a token and return its first
        # non-empty part.
        def ele_sign(s):
            for u in re.split(u'،', s):
                if u != '':
                    return u

        # Split the raw text into candidate sentences on punctuation.
        # The brackets must be escaped inside the character class.
        liste1 = [
            ch
            for ch in re.split(u"[.!؟:()\\[\\]\n]+", unicode(self.text, "utf-8"))
            if ch != ''
        ]

        liste3 = []

        i = 0
        while i < len(liste1):
            liste2 = [ch for ch in re.split(r"[ ]+", liste1[i]) if ch != '']

            k = 0
            s = ''
            while k < len(liste2):
                # "و" followed by a verb starts a new sentence.
                if ele_sign(liste2[k]) == u'و' and k + 1 < len(liste2):
                    stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                    if tagger.is_verb(stem) and not tagger.is_noun(stem):
                        if s != '':
                            liste3.append(s)
                            s = ''
                    else:
                        s += liste2[k] + ' '
                # A single-word connector closes the current sentence.
                elif ele_sign(liste2[k]) in stop_words1:
                    liste3.append(s)
                    s = ''
                # "ثم" followed by a verb also closes the sentence.
                elif ele_sign(liste2[k]) == u'ثم' and k + 1 < len(liste2):
                    stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                    if tagger.is_verb(stem) and not tagger.is_noun(stem):
                        if s != '':
                            liste3.append(s)
                            s = ''
                    else:
                        s += liste2[k] + ' '
                # A word starting with "ف" whose remainder stems to a verb.
                elif len(liste2[k]) > 1 and ele_sign(liste2[k][0]) == u'ف':
                    ArListem.light_stem(ele_sign(liste2[k][1:]))
                    stem = ArListem.get_stem()
                    if tagger.is_verb(stem) and not tagger.is_noun(stem):
                        liste3.append(s)
                        s = ''
                    else:
                        s += liste2[k] + ' '
                # Two-word connectors from stop_words2 close the sentence
                # and skip their second word.
                elif ele_sign(liste2[k]) in prem_ele(stop_words2, 0):
                    if (k + 1 < len(liste2)
                            and ele_sign(liste2[k + 1]) in prem_ele(stop_words2, 1)):
                        liste3.append(s)
                        s = ''
                        k += 1
                    else:
                        s += liste2[k] + ' '
                else:
                    s += liste2[k] + ' '
                k += 1
            if len(s) != 0:
                liste3.append(s)
                s = ''
            i += 1

        liste3 = [ch for ch in liste3 if ch != '']

        with io.open('output.txt', 'a', encoding="utf-8") as file:
            file.write(u"\n\nThere are %d sentences\n" % len(liste3))
            file.write(u"The list of sentences: \n\n ")
            file.write(u" [ ")
            for ch in liste3:
                file.write(u" ' " + ch + u" ' \n\n")
            file.write(u" ] ")
Example #5
# coding=utf8
'''
Created on 15 June 2019

@author: KHALID-RAMI
'''
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer

arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
ArListem = ArabicLightStemmer()
word = u'قال'
ArListem.light_stem(word)
print(ArListem.get_stem())          # light stem
print(ArListem.get_root())          # extracted root
print(ArListem.get_left())          # left segmentation position
print(ArListem.get_prefix(2))       # prefix at the given index
print(ArListem.get_right())         # right segmentation position
print(ArListem.get_unvocalized())   # word without vocalization marks
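Since get_stem() and get_root() usually differ, a short loop makes the contrast visible; a sketch reusing only the calls shown above (the word list is arbitrary):

from tashaphyne.stemming import ArabicLightStemmer

stemmer = ArabicLightStemmer()
for w in [u'قال', u'يكتبون', u'المدرسة']:
    stemmer.light_stem(w)
    # stem = word minus affixes; root = consonantal skeleton
    print(w + u" -> " + stemmer.get_stem() + u" / " + stemmer.get_root())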
Example #6
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())

# print(data1[:10])

# Tashaphyne light stemming
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        ArListem.light_stem(a)
        tweet = tweet + ArListem.get_stem() + " "
        # tweet = tweet + ArListem.get_root() + " "
    data2.append(tweet.strip())
# print(data2[:10])



# create a dataframe using texts and labels
trainDF = pandas.DataFrame()
trainDF['tweet'] = data2
trainDF['class'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)
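A natural continuation is to vectorize the stemmed tweets and fit a baseline classifier. This sketch assumes scikit-learn's TfidfVectorizer and LinearSVC; the choice of model is an assumption, not part of the original pipeline:

# Assumed continuation, not from the original script.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

vectorizer = TfidfVectorizer()              # word-level TF-IDF features
xtrain = vectorizer.fit_transform(train_x)  # fit on training tweets only
xvalid = vectorizer.transform(valid_x)

clf = LinearSVC()
clf.fit(xtrain, train_y)
predictions = clf.predict(xvalid)
print(metrics.accuracy_score(valid_y, predictions))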