import re
import string

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize

# Arabic stopword list (assumed here to come from NLTK; the original snippet
# does not show how stop_words is defined)
stop_words = set(stopwords.words('arabic'))


def data_preprocessing(article):
    article = re.sub('\n', ' ', article)        # Replace newlines with spaces
    article = re.sub('الـ', '', article)        # Remove this prefix
    article = re.sub('لـ', '', article)         # Remove this prefix
    article = re.sub('بـ', '', article)         # Remove this prefix
    article = re.sub('ال', '', article)         # Remove the definite article
    article = re.sub('عربية نت ', '', article)  # Remove the source name
    # Tokenize the article into words
    tokens = word_tokenize(str(article))
    # Build a translation table that strips punctuation
    remove_pun = str.maketrans('', '', string.punctuation)
    # Remove punctuation from each token
    words = [w.translate(remove_pun) for w in tokens]
    # Keep alphabetic tokens only
    alphabetic_words = [word for word in words if word.isalpha()]
    # Remove Arabic stopwords
    alphabetic_words = [word for word in alphabetic_words if word not in stop_words]
    # Initialize the Arabic (ISRI) stemmer
    stemmer = ISRIStemmer()
    # Strip length-three and length-two suffixes from each word
    stemmed_words = [stemmer.suf32(word) for word in alphabetic_words]
    # Join and return the stemmed words
    return " ".join(stemmed_words)
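# A minimal usage sketch for data_preprocessing. The sample text below is
# illustrative, and the call assumes the NLTK 'punkt' and 'stopwords' data
# have been downloaded, e.g. nltk.download('punkt'); nltk.download('stopwords').
sample_article = 'عربية نت الطلابُ يدرسونَ في الجامعةِ\nكل يوم'
print(data_preprocessing(sample_article))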
def lightStemAr(word_list):
    # Light-stem a list of Arabic words with NLTK's ISRIStemmer
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:    # exclude stop words from being processed
            word = arstemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = arstemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = arstemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = arstemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)
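# A minimal usage sketch for lightStemAr (illustrative input): unlike
# light_stem below, this function expects an already-tokenized word list.
print(lightStemAr('والطلاب يدرسون بالجامعة'.split()))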
def light_stem(text):
    # Split the raw text into words; iterating over a string directly
    # would loop over characters, not words
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics
        if word not in stemmer.stop_words:
            word = stemmer.pre32(word)        # remove prefixes
            word = stemmer.suf32(word)        # remove suffixes
            word = stemmer.waw(word)          # remove connective waw
            word = stemmer.norm(word, num=2)  # normalize initial hamza
        result.append(word)
    return ' '.join(result)
def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # exclude stop words from being processed
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = stemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            # Further ISRI root-extraction steps, disabled to keep the stemming light:
            # word = stemmer.pro_w4(word)     # process length-four patterns and extract length-three roots
            # word = stemmer.pro_w53(word)    # process length-five patterns and extract length-three roots
            # word = stemmer.pro_w54(word)    # process length-five patterns and extract length-four roots
            # word = stemmer.end_w5(word)     # ending step (word of length five)
            # word = stemmer.pro_w6(word)     # process length-six patterns and extract length-three roots
            # word = stemmer.pro_w64(word)    # process length-six patterns and extract length-four roots
            # word = stemmer.end_w6(word)     # ending step (word of length six)
            # word = stemmer.suf1(word)       # normalize short suffix
            # word = stemmer.pre1(word)       # normalize short prefix
        result.append(word)
    return ' '.join(result)
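# A minimal comparison sketch (the sample sentence is illustrative): light
# stemming via light_stem versus full root extraction via ISRIStemmer.stem,
# which also runs the pattern-matching steps commented out above.
sentence = 'والطلاب يدرسون بالجامعة'
print(light_stem(sentence))
print(' '.join(ISRIStemmer().stem(w) for w in sentence.split()))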
"=", output_word) correct = 1 break elif len(current_word) <= 3 and ed == 1: suggestions.append((ed, current_word, output_word)) elif len(current_word) > 3 and ed <= 2: suggestions.append((ed, current_word, output_word)) else: continue if len(suggestions) > 0: for suggest in suggestions: lemmas_cw = [] lemmas_cw.append(suggest[1]) lemmas_cw.append(st.suf1(suggest[1])) lemmas_cw.append(st.suf32(suggest[1])) lemmas_cw.append(st.pre1(suggest[1])) lemmas_cw.append(st.pre32(suggest[1])) lemmas_ow = [] lemmas_ow.append(suggest[2]) lemmas_ow.append(st.suf1(suggest[2])) lemmas_ow.append(st.suf32(suggest[2])) lemmas_ow.append(st.pre1(suggest[2])) lemmas_ow.append(st.pre32(suggest[2])) if correct != 1 and len(suggest[1]) > 7: for l in lemmas_cw: if l in lemmas_ow: correct = 2 print("I got the lemma; it seems correct -->",
# Fragment of a gensim topic-modeling pipeline; the top of the script is
# truncated and is assumed to define bigrams_model and no_stopwords_data and
# to import gensim's corpora module and NLTK's ISRIStemmer.
print(bigrams_model)
bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])

############# (5) Lemmatizing the data #############
# Produces a list of lists of lemmatized data; the lemmatizer does not handle
# suffixes well, so the ISRI stemmer's affix stripping is used instead
stemmer = ISRIStemmer()
lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(token)        # removes the three-letter and two-letter prefixes
        token = stemmer.suf32(token)        # removes the three-letter and two-letter suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])

############# (6) Preparing the data for the model using gensim #############
# Preprocessing with gensim involves building the dictionary, the corpus and
# the bigrams; the input (lemmatized_data) is a list of lists
# The dictionary
dictionary = corpora.Dictionary(lemmatized_data)
# The corpus
corpus = [dictionary.doc2bow(d) for d in lemmatized_data]
print(corpus[0:2])
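# A minimal sketch of the step this preparation leads up to (assumed, not
# shown in the original): training a gensim LDA topic model on the corpus.
from gensim.models import LdaModel

lda = LdaModel(corpus=corpus, id2word=dictionary,
               num_topics=10,   # illustrative topic count
               passes=5,        # illustrative number of training passes
               random_state=42)
print(lda.print_topics(num_topics=3))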