Example 1
import re

import nltk
from nltk.stem.isri import ISRIStemmer

# Stemmer and ArabicStopWordsIndex are project-local classes (not shown here).
class BasicStemmer(Stemmer):
    def __init__(self):
        self.stemmer = ISRIStemmer()
        self.stopWordsIndex = ArabicStopWordsIndex(self)
        self.stopWordsIndex.buildIndex()

    def getStems(self, tokens, flag=False):
        # Strip length-three and length-two prefixes from each token.
        rootList = []
        for token in tokens:
            # token = self.stemmer.norm(token)
            root = self.stemmer.pre32(token)
            rootList.append(root)
            print(token, "  :  ", root)
        return rootList

    def stem(self, word):
        # Remove prefixes, then normalize (num=3: diacritics and hamza).
        root = self.stemmer.pre32(word)
        root = self.stemmer.norm(root, 3)
        return root

    def loadStemsDictionnary(self, filePath="dictStems.txt"):
        # Parse a tab-separated stem dictionary; lines starting with ';' are comments.
        lines = open(filePath, "r", encoding="windows-1256").readlines()
        dictionary = nltk.defaultdict(list)
        for line in lines:
            if not re.match("^;.*", line):
                parts = line.split('\t')
                if len(parts) != 4:
                    break  # stop at the first malformed line
                [rootStem, stem, tag, enGloss] = parts
                dictionary[rootStem].append(
                    [stem, tag, ' '.join(enGloss.split(';'))])
        return dictionary

    def verify(self, word):
        # True if the word is indexed as a stop word.
        if self.stopWordsIndex.access(word):
            return True
        return False

    def setStopWordsIndex(self, index: ArabicStopWordsIndex):
        self.stopWordsIndex = index
        self.stopWordsIndex.buildIndex()
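
The ISRI calls used above can be tried directly, without the project-local Stemmer and ArabicStopWordsIndex dependencies; a minimal sketch with an illustrative word:

from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
word = "والكتاب"              # illustrative token: "and the book"
print(stemmer.pre32(word))    # strips the length-three prefix "وال"
print(stemmer.norm(word, 3))  # num=3: remove diacritics and normalize hamza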
Example 2
from nltk.stem.isri import ISRIStemmer

def lightStemAr(word_list):
    result = []
    arstemmer = ISRIStemmer()
    for word in word_list:
        word = arstemmer.norm(word, num=1)     # remove diacritics representing Arabic short vowels
        if word not in arstemmer.stop_words:   # exclude stop words from further processing
            word = arstemmer.pre32(word)       # remove length-three and length-two prefixes, in that order
            word = arstemmer.suf32(word)       # remove length-three and length-two suffixes, in that order
            word = arstemmer.waw(word)         # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = arstemmer.norm(word, num=2) # normalize initial hamza to bare alif
        result.append(word)
    return ' '.join(result)
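
A usage sketch; the input tokens are illustrative and the function expects a pre-tokenized list:

tokens = ["والطلاب", "في", "المدرسة"]  # illustrative, pre-tokenized input
print(lightStemAr(tokens))             # returns the light-stemmed tokens joined by spaces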
Example 4
from nltk.stem.isri import ISRIStemmer

def light_stem(text):
    words = text  # `text` is expected to be an iterable of tokens here (cf. Example 5, which splits a string)
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics
        if word not in stemmer.stop_words:
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes
            word = stemmer.waw(word)          # remove connective ‘و’
            word = stemmer.norm(word, num=2)  # normalize initial hamza
            result.append(word)               # note: stop words are dropped entirely here
    return ' '.join(result)
Example 5
from nltk.stem.isri import ISRIStemmer

def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stemmer.stop_words:    # exclude stop words from further processing
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in that order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in that order
            word = stemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            # The remaining ISRI root-extraction steps are skipped for light stemming:
            # word = stemmer.pro_w4(word)     # process length-four patterns, extract length-three roots
            # word = stemmer.pro_w53(word)    # process length-five patterns, extract length-three roots
            # word = stemmer.pro_w54(word)    # process length-five patterns, extract length-four roots
            # word = stemmer.end_w5(word)     # ending step for words of length five
            # word = stemmer.pro_w6(word)     # process length-six patterns, extract length-three roots
            # word = stemmer.pro_w64(word)    # process length-six patterns, extract length-four roots
            # word = stemmer.end_w6(word)     # ending step for words of length six
            # word = stemmer.suf1(word)       # normalize short suffix
            # word = stemmer.pre1(word)       # normalize short prefix
        result.append(word)
    return ' '.join(result)
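
For full root extraction rather than light stemming, NLTK's ISRIStemmer.stem() already chains all of the steps above, including the commented-out pattern steps; a minimal sketch with an illustrative word:

from nltk.stem.isri import ISRIStemmer

stemmer = ISRIStemmer()
# stem() runs normalization, the stop-word check, affix stripping, and the
# length-4/5/6 pattern steps in sequence, returning the extracted root.
print(stemmer.stem("يكتبون"))  # illustrative; expected root: كتب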
Example 6
                # Fragment: inside a loop comparing `current_word` to each
                # candidate `output_word`; `ed` is their edit distance.
                break
            elif len(current_word) <= 3 and ed == 1:
                # short words: accept candidates at edit distance 1
                suggestions.append((ed, current_word, output_word))
            elif len(current_word) > 3 and ed <= 2:
                # longer words: accept candidates up to edit distance 2
                suggestions.append((ed, current_word, output_word))
            else:
                continue

        if len(suggestions) > 0:
            for suggest in suggestions:
                # Candidate lemma forms of the current word: the word itself
                # plus its ISRI affix-stripped variants.
                lemmas_cw = [
                    suggest[1],
                    st.suf1(suggest[1]),
                    st.suf32(suggest[1]),
                    st.pre1(suggest[1]),
                    st.pre32(suggest[1]),
                ]

                # The same lemma forms for the suggested output word.
                lemmas_ow = [
                    suggest[2],
                    st.suf1(suggest[2]),
                    st.suf32(suggest[2]),
                    st.pre1(suggest[2]),
                    st.pre32(suggest[2]),
                ]

                # If the two words share any lemma form, treat the suggestion
                # as a correct match.
                if correct != 1 and len(suggest[1]) > 7:
                    for lemma in lemmas_cw:
                        if lemma in lemmas_ow:
                            correct = 2
                            print("I got the lemma; it seems correct -->",
                                  current_word, "~", suggest[2])
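
A minimal sketch of the filtering rule the fragment assumes, using nltk.edit_distance; the function name and candidate list are illustrative:

import nltk

def filter_suggestions(current_word, candidates):
    # Collect (distance, word, candidate) triples that pass the same
    # length-dependent thresholds as the fragment above.
    suggestions = []
    for output_word in candidates:
        ed = nltk.edit_distance(current_word, output_word)
        if ed == 0:
            break  # exact match found; stop searching
        elif len(current_word) <= 3 and ed == 1:
            suggestions.append((ed, current_word, output_word))
        elif len(current_word) > 3 and ed <= 2:
            suggestions.append((ed, current_word, output_word))
    return suggestions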
# Fragment: `bigrams` is a gensim Phrases model and `no_stopwords_data`
# is the list of token lists built in the earlier steps.
bigrams_model = gensim.models.phrases.Phraser(bigrams)
print(bigrams_model)

bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])

############# (5) Lemmatizing the data #############
# Produces a list of lists of lemmatized tokens. The ISRI stemmer is used as
# a light lemmatizer here; it does not handle all suffixes well.
stemmer = ISRIStemmer()

lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(token)        # removes the three-letter and two-letter prefixes
        token = stemmer.suf32(token)        # removes the three-letter and two-letter suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])

############# (6) Preparing the data with gensim for the model #############
# preprocessing with gensim involves building the dictionary, the corpus, and the bigrams
# the input is lemmatized_data, a list of token lists

# The Dictionary
dictionary = corpora.Dictionary(lemmatized_data)

# the corpus
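# The snippet ends mid-step; the usual gensim continuation builds the
# bag-of-words corpus from the dictionary above (a sketch, assuming the
# same names):
corpus = [dictionary.doc2bow(doc) for doc in lemmatized_data]
print(corpus[0:2])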