import re
import string

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

# Arabic stopword list (assumed here to come from NLTK; the original snippet
# does not show how stop_words is defined)
stop_words = set(stopwords.words('arabic'))


def data_preprocessing(article):
    article = re.sub('\n', ' ', article)        # Replace newlines with spaces
    article = re.sub('الـ', '', article)        # Remove this prefix
    article = re.sub('لـ', '', article)         # Remove this prefix
    article = re.sub('بـ', '', article)         # Remove this prefix
    article = re.sub('ال', '', article)         # Remove the definite article
    article = re.sub('عربية نت ', '', article)  # Remove the source name
    # Tokenize the article into words
    tokens = word_tokenize(str(article))
    # Build a translation table that strips punctuation
    remove_pun = str.maketrans('', '', string.punctuation)
    # Remove punctuation from each token
    words = [w.translate(remove_pun) for w in tokens]
    # Keep alphabetic tokens only
    alphabetic_words = [word for word in words if word.isalpha()]
    # Remove Arabic stopwords
    alphabetic_words = [word for word in alphabetic_words if word not in stop_words]
    # Initialize the Arabic (ISRI) stemmer
    stemmer = ISRIStemmer()
    # Strip length-three and length-two suffixes from each word
    stemmed_words = [stemmer.suf32(word) for word in alphabetic_words]
    # Join and return the stemmed words
    return " ".join(stemmed_words)
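# A minimal usage sketch for data_preprocessing. The sample text below is
# illustrative, and the call assumes the NLTK 'punkt' and 'stopwords' data
# have been downloaded, e.g. nltk.download('punkt'); nltk.download('stopwords').
sample_article = 'عربية نت الطلابُ يدرسونَ في الجامعةِ\nكل يوم'
print(data_preprocessing(sample_article))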
def lightStemAr(word_list):
    # Light-stem a list of Arabic words with NLTK's ISRIStemmer
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:    # exclude stop words from being processed
            word = arstemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = arstemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)
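# A minimal usage sketch for lightStemAr (illustrative input): unlike
# light_stem below, this function expects an already-tokenized word list.
print(lightStemAr('والطلاب يدرسون بالجامعة'.split()))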
def light_stem(text):
    # Split the raw text into words; iterating over a string directly
    # would loop over characters, not words
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics
        if word not in stemmer.stop_words:
            word = stemmer.pre32(word)        # remove prefixes
            word = stemmer.suf32(word)        # remove suffixes
            word = stemmer.waw(word)          # remove connective waw
            word = stemmer.norm(word, num=2)  # normalize initial hamza
        result.append(word)
    return ' '.join(result)
def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # exclude stop words from being processed
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = stemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            # Further ISRI root-extraction steps, disabled to keep the stemming light:
            # word = stemmer.pro_w4(word)     # process length-four patterns and extract length-three roots
            # word = stemmer.pro_w53(word)    # process length-five patterns and extract length-three roots
            # word = stemmer.pro_w54(word)    # process length-five patterns and extract length-four roots
            # word = stemmer.end_w5(word)     # ending step (word of length five)
            # word = stemmer.pro_w6(word)     # process length-six patterns and extract length-three roots
            # word = stemmer.pro_w64(word)    # process length-six patterns and extract length-four roots
            # word = stemmer.end_w6(word)     # ending step (word of length six)
            # word = stemmer.suf1(word)       # normalize short suffix
            # word = stemmer.pre1(word)       # normalize short prefix
        result.append(word)
    return ' '.join(result)
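# A minimal comparison sketch (the sample sentence is illustrative): light
# stemming via light_stem versus full root extraction via ISRIStemmer.stem,
# which also runs the pattern-matching steps commented out above.
sentence = 'والطلاب يدرسون بالجامعة'
print(light_stem(sentence))
print(' '.join(ISRIStemmer().stem(w) for w in sentence.split()))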
"=", output_word) correct = 1 break elif len(current_word) <= 3 and ed == 1: suggestions.append((ed, current_word, output_word)) elif len(current_word) > 3 and ed <= 2: suggestions.append((ed, current_word, output_word)) else: continue if len(suggestions) > 0: for suggest in suggestions: lemmas_cw = [] lemmas_cw.append(suggest[1]) lemmas_cw.append(st.suf1(suggest[1])) lemmas_cw.append(st.suf32(suggest[1])) lemmas_cw.append(st.pre1(suggest[1])) lemmas_cw.append(st.pre32(suggest[1])) lemmas_ow = [] lemmas_ow.append(suggest[2]) lemmas_ow.append(st.suf1(suggest[2])) lemmas_ow.append(st.suf32(suggest[2])) lemmas_ow.append(st.pre1(suggest[2])) lemmas_ow.append(st.pre32(suggest[2])) if correct != 1 and len(suggest[1]) > 7: for l in lemmas_cw: if l in lemmas_ow: correct = 2 print("I got the lemma; it seems correct -->",
# Fragment of a gensim topic-modeling pipeline; the top of the script is
# truncated and is assumed to define bigrams_model and no_stopwords_data and
# to import gensim's corpora module and NLTK's ISRIStemmer.
print(bigrams_model)
bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])

############# (5) Lemmatizing the data #############
# Produces a list of lists of lemmatized data; the lemmatizer does not handle
# suffixes well, so the ISRI stemmer's affix stripping is used instead
stemmer = ISRIStemmer()
lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(token)        # removes the three-letter and two-letter prefixes
        token = stemmer.suf32(token)        # removes the three-letter and two-letter suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])

############# (6) Preparing the data for the model using gensim #############
# Preprocessing with gensim involves building the dictionary, the corpus and
# the bigrams; the input (lemmatized_data) is a list of lists
# The dictionary
dictionary = corpora.Dictionary(lemmatized_data)
# The corpus
corpus = [dictionary.doc2bow(d) for d in lemmatized_data]
print(corpus[0:2])
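# A minimal sketch of the step this preparation leads up to (assumed, not
# shown in the original): training a gensim LDA topic model on the corpus.
from gensim.models import LdaModel

lda = LdaModel(corpus=corpus, id2word=dictionary,
               num_topics=10,   # illustrative topic count
               passes=5,        # illustrative number of training passes
               random_state=42)
print(lda.print_topics(num_topics=3))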