Example #1
import re
import string

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

# Assumed setup (not shown in the original snippet): NLTK's Arabic stop-word list
stop_words = set(stopwords.words('arabic'))


def data_preprocessing(article):

    article = re.sub('\n', ' ', article)  # Replace newlines with spaces
    article = re.sub('الـ', '', article)  # Remove the definite article written with a tatweel
    article = re.sub('لـ', '', article)  # Remove the prefix لـ
    article = re.sub('بـ', '', article)  # Remove the prefix بـ
    article = re.sub('ال', '', article)  # Remove the definite article ال (this also strips it inside words)
    article = re.sub('عربية نت ', '', article)  # Remove the source phrase "عربية نت"

    # Tokenize the article into words
    tokens = word_tokenize(str(article))
    # Build a translation table that strips punctuation
    remove_pun = str.maketrans('', '', string.punctuation)
    # Remove punctuation from each word
    words = [w.translate(remove_pun) for w in tokens]
    # Keep only purely alphabetic tokens
    alphabetic_words = [word for word in words if word.isalpha()]
    # Remove Arabic stopwords
    alphabetic_words = [
        word for word in alphabetic_words if word not in stop_words
    ]
    # Initialize the Arabic (ISRI) stemmer
    stemmer = ISRIStemmer()
    # Light-stem each word by stripping length-three and length-two suffixes
    stemmed_words = [stemmer.suf32(word) for word in alphabetic_words]
    # Join and return the stemmed words
    return " ".join(stemmed_words)
Example #2
from nltk.stem.isri import ISRIStemmer


def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:    # exclude stop words from further processing
            word = arstemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = arstemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)
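A short usage sketch for lightStemAr, assuming the input has already been split into tokens (the sample tokens are illustrative):

tokens = "ذهب الولد إلى المدرسة الكبيرة".split()
print(lightStemAr(tokens))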
Example #4
from nltk.stem.isri import ISRIStemmer


def light_stem(text):
    words = text  # note: `text` is iterated directly, so it must already be a list of tokens
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing short vowels
        if word not in stemmer.stop_words:
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes
            word = stemmer.waw(word)          # remove the leading 'و' from words beginning with 'وو'
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            result.append(word)               # appending inside the if-block drops stop words from the output
    return ' '.join(result)
Example #5
from nltk.stem.isri import ISRIStemmer


def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # exclude stop words from being processed
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = stemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            # Full root-extraction steps, left disabled to keep this a light stemmer:
            # word = stemmer.pro_w4(word)     # process length-four patterns and extract length-three roots
            # word = stemmer.pro_w53(word)    # process length-five patterns and extract length-three roots
            # word = stemmer.pro_w54(word)    # process length-five patterns and extract length-four roots
            # word = stemmer.end_w5(word)     # ending step (word of length five)
            # word = stemmer.pro_w6(word)     # process length-six patterns and extract length-three roots
            # word = stemmer.pro_w64(word)    # process length-six patterns and extract length-four roots
            # word = stemmer.end_w6(word)     # ending step (word of length six)
            # word = stemmer.suf1(word)       # normalize short suffix
            # word = stemmer.pre1(word)       # normalize short prefix
        result.append(word)
    return ' '.join(result)
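By contrast, NLTK's ISRIStemmer.stem() runs the complete ISRI algorithm, including the pattern-based root-extraction steps that are commented out above; a minimal sketch (the sample word is illustrative):

from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
word = "المدرسة"  # illustrative word
print(stemmer.stem(word))  # full ISRI stemming: normalization, affix stripping and pattern-based root extraction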
Example #6
                      "=", output_word)
                correct = 1
                break
            elif len(current_word) <= 3 and ed == 1:
                suggestions.append((ed, current_word, output_word))
            elif len(current_word) > 3 and ed <= 2:
                suggestions.append((ed, current_word, output_word))
            else:
                continue

        if len(suggestions) > 0:
            for suggest in suggestions:
                lemmas_cw = []
                lemmas_cw.append(suggest[1])
                lemmas_cw.append(st.suf1(suggest[1]))
                lemmas_cw.append(st.suf32(suggest[1]))
                lemmas_cw.append(st.pre1(suggest[1]))
                lemmas_cw.append(st.pre32(suggest[1]))

                lemmas_ow = []
                lemmas_ow.append(suggest[2])
                lemmas_ow.append(st.suf1(suggest[2]))
                lemmas_ow.append(st.suf32(suggest[2]))
                lemmas_ow.append(st.pre1(suggest[2]))
                lemmas_ow.append(st.pre32(suggest[2]))

                if correct != 1 and len(suggest[1]) > 7:
                    for l in lemmas_cw:
                        if l in lemmas_ow:
                            correct = 2
                            print("I got the lemma; it seems correct -->",
print(bigrams_model)

bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])

############# (5) lemmatizing the data #############
# produces a list of lists of lemmatized tokens ... note: this stemmer-based lemmatizer does not handle suffixes well
stemmer = ISRIStemmer()

lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(token)  # removes the three-letter and two-letter prefixes
        token = stemmer.suf32(token)  # removes the three-letter and two-letter suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])

############# (5) Preparing the data using gensim for the Model #############
# the preprocessing with gensim involves building the dictionary, the corpus and the bigrams
# the data (lemmatized_data) is a list of lists

# The Dictionary
dictionary = corpora.Dictionary(lemmatized_data)

# the corpus
corpus = [dictionary.doc2bow(d) for d in lemmatized_data]
print(corpus[0:2])
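bigrams_model and no_stopwords_data are assumed to be defined earlier in the original file; a minimal, self-contained sketch of how such a bigram model is typically built with gensim, and of feeding the resulting dictionary and corpus to an LDA topic model (all data and parameters below are illustrative), might look like:

from gensim import corpora
from gensim.models import LdaModel
from gensim.models.phrases import Phrases, Phraser

# stand-in for no_stopwords_data: token lists after stop-word removal
no_stopwords_data = [["نص", "عربي", "قصير"], ["جملة", "عربية", "أخرى"]]

# train a bigram detector and wrap it for fast transformation (the role of bigrams_model above)
bigrams_model = Phraser(Phrases(no_stopwords_data, min_count=1, threshold=1))
bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]

# dictionary + bag-of-words corpus, as in the snippet, then a small LDA model on top
dictionary = corpora.Dictionary(bigram_data)
corpus = [dictionary.doc2bow(doc) for doc in bigram_data]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5, random_state=42)
print(lda.print_topics())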