from nltk.stem.isri import ISRIStemmer


def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:    # exclude stop words from being processed
            word = arstemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = arstemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)
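# A minimal usage sketch for lightStemAr; the sample tokens below are
# illustrative Arabic words, not taken from any original dataset.
sample_tokens = ['المدرسة', 'والكتاب', 'بالقلم']
print(lightStemAr(sample_tokens))  # prefixes such as 'ال' and 'وال' are stripped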
def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)
        if word not in stemmer.stop_words:
            word = stemmer.pre32(word)
            word = stemmer.suf32(word)
            word = stemmer.waw(word)
            word = stemmer.norm(word, num=2)
        result.append(word)
    return ' '.join(result)
def read_data_from_file(filename, number_of_classes):
    """Read and prepare a training or testing dataset by extracting and separating tweets from labels."""
    tweets = []  # list of text samples
    labels = []  # list of binary label vectors
    istemmer = ISRIStemmer()
    read_file = open(filename, "r")
    for line in read_file:
        filtered_line = line.split()
        # the last number_of_classes tokens are the binary label columns
        label = list(map(int, filtered_line[-number_of_classes:]))
        # the tokens between the leading ID and the label columns form the tweet
        tweet = " ".join(filtered_line[1:-number_of_classes])
        tweet = clean_str(tweet)
        tweet = istemmer.norm(tweet)
        tweets.append(tweet)
        labels.append(label)
    read_file.close()
    return [tweets, labels]
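# Usage sketch for the multi-label reader above. clean_str comes from elsewhere
# in the original project, so a pass-through stub stands in for it here; the
# line layout (an ID token, the tweet words, then one binary column per class)
# is inferred from the slicing logic, and the sample file is invented.
def clean_str(s):  # hypothetical stand-in for the project's cleaner
    return s


with open("sample_multilabel.txt", "w") as f:
    f.write("id1 كلمة جميلة 0 1 0\n")  # three label columns in this toy file

tweets, labels = read_data_from_file("sample_multilabel.txt", number_of_classes=3)
print(tweets, labels)  # -> ['كلمة جميلة'] [[0, 1, 0]]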
def read_data_from_file(filename):
    """Read and prepare a training or testing dataset by extracting and separating tweets from labels."""
    tweets = []        # list of text samples
    labels = []        # list of label ids
    labels_index = {}  # dictionary mapping label name to numeric id
    istemmer = ISRIStemmer()
    read_file = open(filename, "r")
    index = 0
    for line in read_file:
        line = line.strip().split('\t')  # tab-separated: the label, then the tweet itself
        label = line[0]
        tweet = line[1].strip(" \"")
        tweet = clean_str(tweet)
        tweet = istemmer.norm(tweet)
        if label not in labels_index:
            labels_index[label] = index
            index += 1
        tweets.append(tweet)
        labels.append(labels_index[label])
    read_file.close()
    return [tweets, labels]
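# Usage sketch for the tab-separated reader above, reusing the same
# pass-through stand-in for clean_str. Each line is assumed to look like
# label<TAB>"tweet text", which is what the split and strip above expect;
# the sample file is invented.
with open("sample_labeled.tsv", "w") as f:
    f.write('pos\t"تغريدة جميلة"\n')
    f.write('neg\t"تغريدة سيئة"\n')

tweets, labels = read_data_from_file("sample_labeled.tsv")
print(tweets, labels)  # labels are numeric ids assigned in order of first appearance: [0, 1]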
def arabic_social_media_text_filter(txt, debug=0):
    """
    Filter Arabic text coming from social media.

    :param txt: utf-8 text, unicode
    :param debug: any value greater than 0 prints messages comparing the normalized and original text
    :return: the filtered text
    """
    txt = social_media_text_filter(txt, debug=debug)
    # Remove diacritics
    st = ISRIStemmer()
    txt = st.norm(txt)
    return txt
def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # exclude stop words from being processed
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = stemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            # word = stemmer.pro_w4(word)     # process length-four patterns and extract length-three roots
            # word = stemmer.pro_w53(word)    # process length-five patterns and extract length-three roots
            # word = stemmer.pro_w54(word)    # process length-five patterns and extract length-four roots
            # word = stemmer.end_w5(word)     # ending step (words of length five)
            # word = stemmer.pro_w6(word)     # process length-six patterns and extract length-three roots
            # word = stemmer.pro_w64(word)    # process length-six patterns and extract length-four roots
            # word = stemmer.end_w6(word)     # ending step (words of length six)
            # word = stemmer.suf1(word)       # normalize short suffix
            # word = stemmer.pre1(word)       # normalize short prefix
        result.append(word)
    return ' '.join(result)
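# For contrast with the light stemmer above: ISRIStemmer.stem() runs the full
# pipeline, including the pattern-processing steps commented out there, and
# reduces a word to its root. The sample word is illustrative.
stemmer = ISRIStemmer()
print(light_stem('يكتبون'))    # light stemming keeps the verb pattern: 'يكتب'
print(stemmer.stem('يكتبون'))  # full ISRI stemming extracts the root: 'كتب'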
import re
from collections import defaultdict


# Stemmer and ArabicStopWordsIndex are defined elsewhere in the project.
class BasicStemmer(Stemmer):
    def __init__(self):
        self.stemmer = ISRIStemmer()
        self.stopWordsIndex = ArabicStopWordsIndex(self)
        self.stopWordsIndex.buildIndex()

    def getStems(self, tokens, flag=False):
        rootList = []
        for token in tokens:
            # token = self.stemmer.norm(token)
            root = self.stemmer.pre32(token)  # strip length-three and length-two prefixes
            rootList.append(root)
            print(token, " : ", root)
        return rootList

    def stem(self, word):
        root = self.stemmer.pre32(word)
        root = self.stemmer.norm(root, 3)  # remove diacritics and normalize the initial hamza
        return root

    def loadStemsDictionnary(self, filePath="dictStems.txt"):
        lines = open(filePath, "r", encoding="windows-1256").readlines()
        dictionary = defaultdict(list)
        for line in lines:
            if not re.match("^;.*", line):  # lines starting with ';' are comments
                parts = line.split('\t')
                if len(parts) != 4:  # stop at the first malformed line
                    break
                [rootStem, stem, tag, enGloss] = parts
                dictionary[rootStem].append([stem, tag, ' '.join(enGloss.split(';'))])
        return dictionary

    def verify(self, word):
        if self.stopWordsIndex.access(word):
            return True

    def setStopWordsIndex(self, index: ArabicStopWordsIndex):
        self.stopWordsIndex = index
        self.stopWordsIndex.buildIndex()
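# Sketch of the dictionary layout loadStemsDictionnary expects, inferred from
# its parsing code: each non-comment line carries four tab-separated fields
# (root stem, stem, tag, ';'-separated English glosses), and lines starting
# with ';' are comments. The entries below are invented for illustration;
# instantiating BasicStemmer itself requires the project's Stemmer and
# ArabicStopWordsIndex classes.
with open("sample_dict.txt", "w", encoding="windows-1256") as f:
    f.write("; comment lines are skipped by the loader\n")
    f.write("ktb\tkatab\tPV\twrite;record\n")
# Parsing this file yields {'ktb': [['katab', 'PV', 'write record\n']]} -- the
# trailing newline survives because the loader does not strip lines.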
from gensim import corpora

# bigrams_model and no_stopwords_data come from the earlier preprocessing steps
bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])

############# (5) Lemmatizing the data #############
# produces a list of lists of lemmatized tokens; the lemmatizer does not handle
# suffixed forms well, so ISRI stemming steps are used to strip affixes instead
stemmer = ISRIStemmer()
lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(token)        # removes the three-letter and two-letter prefixes
        token = stemmer.suf32(token)        # removes the three-letter and two-letter suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])

############# (6) Preparing the data using gensim for the model #############
# the gensim preprocessing involves building the dictionary, the corpus and the bigrams
# the input (lemmatized_data) is a list of lists of tokens

# the dictionary
dictionary = corpora.Dictionary(lemmatized_data)
# the corpus
corpus = [dictionary.doc2bow(d) for d in lemmatized_data]
print(corpus[0:2])
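# The dictionary and corpus built above are the standard inputs to a gensim
# topic model; a minimal LDA sketch follows (num_topics is an illustrative
# choice, not a value from the original pipeline):
from gensim import models

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
print(lda.print_topics(num_topics=3, num_words=5))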
def remove_diacritics(text):
    arstemmer = ISRIStemmer()
    result = arstemmer.norm(text, num=1)  # remove diacritics representing Arabic short vowels
    return result
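# Usage sketch: stripping the short-vowel diacritics from a fully vocalized
# word (the sample is illustrative):
print(remove_diacritics('كَتَبَ'))  # -> 'كتب'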