def Lemmatisation(self):
    """Light-stem every preprocessed word and return the list of stems.

    Each word produced by ``self.Pretraitement()`` is run through an
    ``ArabicLightStemmer``. Its segmentation is rendered as the string
    ``"prefix + stem + suffix"``; all such strings are collected under the
    ``'words'`` key of a dict that is handed to ``self.aff``.

    Returns:
        list: the stem reported by the stemmer for each word, in the order
        the words were produced by ``self.Pretraitement()``.
    """
    words = self.Pretraitement()
    stemmer = ArabicLightStemmer()

    segmented = []
    stems = []
    for word in words:
        # The return value is not needed here; the segmentation is read back
        # through the get_*() accessors right after the call.
        stemmer.light_stem(word)
        prefix = stemmer.get_prefix()
        stem = stemmer.get_stem()
        suffix = stemmer.get_suffix()
        segmented.append(prefix + " + " + stem + " + " + suffix)
        stems.append(stem)

    self.aff({'words': segmented})
    return stems
# Script fragment: tag every entry of `corps` and write one record per token.
#
# NOTE(review): the line breaks and indentation of this fragment were lost.
# Because the line starts with `#`, Python currently treats the whole
# statement run below as a single comment, so none of it executes. The
# original nesting must be restored before it can run.
#
# What the statements do, as far as the text shows:
#   * for each `l` in `corps`, call `nlp.pos_tag(l)`; when the first tagged
#     token is U+FEFF (ZERO WIDTH NO-BREAK SPACE), drop that first entry;
#   * call `nlp.dependency_parse(l)`; when it has as many entries as the tag
#     list, copy it into `dp2` and insert ("NONE", i, i) right after the entry
#     at position i - 1, where i = dp[0][2]; otherwise use it unchanged;
#   * drop the first entry of `dp2`;
#   * for each (token, tag) pair, light-stem the token with `ArListem` and
#     append to `ls` the text
#       token|tag|dp2[ind][0]|dp2[ind][1]-1|func([token, tag], classifier)
#     immediately followed by "p=<prefix>|s=<suffix>" and a newline;
#   * append the literal ". PUNC\n" to `ls`;
#   * write `ls` to `corpw` with writelines(), then close `corp` and `corpw`.
#
# TODO confirm: whether `dp2 = dp2[1:]` belongs to the `else` branch or runs
# for both branches, and whether `ls.append(". PUNC\n")` runs once per entry
# of `corps` or once overall -- the flattened text does not say.
# `corps`, `nlp`, `ArListem`, `ls`, `func`, `classifier`, `corp` and `corpw`
# are all defined outside this fragment.
#tag words for l in corps: ps=nlp.pos_tag(l) if ps[0][0]==u'\ufeff': #ZERO WIDTH NO-BREAK SPACE ps=ps[1:] dp=nlp.dependency_parse(l) dp2=[] if len(dp)==len(ps): i = dp[0][2] for ind,w in enumerate(dp): if ind+1==i: dp2.append(w) dp2.append(("NONE",i,i)) else: dp2.append(w) else: dp2=dp dp2 = dp2[1:] for ind,w in enumerate(ps) : stem = ArListem.light_stem(w[0]) pre = ArListem.get_prefix() suf = ArListem.get_suffix() ls.append(w[0]+"|"+w[1]+"|"+dp2[ind][0]+"|"+str(dp2[ind][1]-1)+"|"+func([w[0],w[1]],classifier)+"p="+pre+"|s="+suf+"\n") ls.append(". PUNC\n") corpw.writelines(ls) corp.close() corpw.close()
# coding=utf8
'''
Created on 15 June 2019

@author: KHALID-RAMI

Small demonstration script: light-stems one Arabic word with Tashaphyne's
ArabicLightStemmer and prints several of the stemmer's accessors.
'''
# The encoding declaration has to sit on the first or second physical line of
# the file to be honoured; it used to follow the docstring, where it was
# ignored even though this file contains a non-ASCII literal.
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer

arepr = pyarabic.arabrepr.ArabicRepr()
# NOTE(review): this rebinding shadows the builtin repr() for the rest of the
# module. It is kept so the module-level names stay as they were; nothing in
# the visible script calls it.
repr = arepr.repr

ArListem = ArabicLightStemmer()
word = u'قال'

# Run the stemmer on the word first; the accessors below are queried
# afterwards on the same stemmer instance.
stem = ArListem.light_stem(word)

print(ArListem.get_stem())
print(ArListem.get_root())
print(ArListem.get_left())
print(ArListem.get_prefix(2))
print(ArListem.get_right())
print(ArListem.get_unvocalized())