def get_root(text):
    """Return the ISRI root of every word in *text*.

    Parameters
    ----------
    text : str | list[str]
        Either a whitespace-separated string or an already-tokenized list
        of words.

    Returns
    -------
    list[str]
        The stemmed root of each word, in input order.
    """
    arabic_stemmer = ISRIStemmer()
    # Accept both a raw string and a pre-tokenized list.
    if not isinstance(text, list):
        word_list = text.split()
    else:
        word_list = text
    # BUG FIX: the original iterated over `text` instead of `word_list`,
    # so a string input was stemmed character-by-character rather than
    # word-by-word.
    return [arabic_stemmer.stem(word) for word in word_list]
def lookUpWordScore(self, word, lexicon, use_lemma):
    """Return the sentiment score for *word* from *lexicon*.

    Lookup strategy, in decreasing confidence:
      1. exact key match            -> full score
      2. shared ISRI stem           -> 25% of the key's score
      3. edit-distance similarity   -> 25% of the key's score
         (normalized match ratio > 0.7)
    Returns 0 when nothing matches.

    Note: *use_lemma* is currently unused; it is kept for interface
    compatibility with callers.
    """
    # FIX: replaced Python-2-only constructs — bare `print word` statements
    # (a syntax error on Python 3, and debug noise regardless) and
    # `dict.iterkeys()` (removed in Python 3) — with direct iteration.
    # Exact match: O(1) membership test instead of a full key scan.
    if word in lexicon:
        return lexicon[word]
    stemmer = ISRIStemmer()
    word_stem = stemmer.stem(word)  # hoisted: invariant across the loop
    for key in lexicon:
        if stemmer.stem(key) == word_stem:
            return lexicon[key] * 0.25
    # Fuzzy match via normalized edit distance; guard against an empty
    # word to avoid ZeroDivisionError.
    if word:
        for key in lexicon:
            med = nltk.metrics.edit_distance(word, key)
            match = 1 - (float(med) / len(word))
            if match > 0.7:
                return lexicon[key] * 0.25
    return 0
def lookUpWordScore(self, word, lexicon, use_lemma):
    """Return the sentiment score for *word* from *lexicon*.

    Falls through three matching tiers:
      1. exact key match            -> full score
      2. shared ISRI stem           -> 25% of the key's score
      3. edit-distance similarity   -> 25% of the key's score
         (normalized match ratio > 0.7)
    Returns 0 when no tier matches.

    Note: *use_lemma* is currently unused; it is kept for interface
    compatibility with callers.
    """
    # FIX: removed Python-2-only constructs — bare `print` statements
    # (a syntax error on Python 3, and debug noise regardless) and
    # `dict.iterkeys()` (removed in Python 3).
    # Exact match: O(1) membership test instead of scanning every key.
    if word in lexicon:
        return lexicon[word]
    stemmer = ISRIStemmer()
    word_stem = stemmer.stem(word)  # hoisted: invariant across the loop
    for key in lexicon:
        if stemmer.stem(key) == word_stem:
            return lexicon[key] * 0.25
    # Fuzzy match via normalized edit distance; guard against an empty
    # word to avoid ZeroDivisionError.
    if word:
        for key in lexicon:
            med = nltk.metrics.edit_distance(word, key)
            match = 1 - (float(med) / len(word))
            if match > 0.7:
                return lexicon[key] * 0.25
    return 0
def preprocessing_test_data(user_string):
    """Tokenize *user_string* and light-stem each token with the ISRI rules.

    Tokens that are Arabic stop words (after diacritic removal) are dropped
    from the result entirely.

    Returns a list of processed tokens.
    """
    stemmer = ISRIStemmer()
    arabic_stop_words = stopwords.words('arabic')
    tokens = n.tokenize._treebank_word_tokenizer.tokenize(user_string)
    processed = []
    for token in tokens:
        # Strip diacritics (Arabic short-vowel marks) before any other step.
        token = stemmer.norm(token, num=1)
        # Stop words are excluded from further processing and from the output.
        if token in arabic_stop_words:
            continue
        token = stemmer.pre32(token)        # strip 3- then 2-letter prefixes
        token = stemmer.suf32(token)        # strip 3- then 2-letter suffixes
        token = stemmer.waw(token)          # drop leading connective waw
        token = stemmer.norm(token, num=2)  # normalize initial hamza to bare alif
        processed.append(token)
    return processed
def light_stem_word(word):
    """Light-stem a single Arabic *word* using the ISRI affix rules.

    The input is returned unchanged when the stemmed form is not a known
    spelling (i.e. not present in the `ar_spell` lexicon); otherwise the
    stemmed form is returned. Stop words skip the affix-stripping steps.
    """
    stemmer = ISRIStemmer()
    # Remove diacritics (short-vowel marks) first.
    stemmed = stemmer.norm(word, num=1)
    # Stop words bypass affix stripping but still pass the spell check below.
    if stemmed not in stemmer.stop_words:
        stemmed = stemmer.pre32(stemmed)        # 3- then 2-letter prefixes
        stemmed = stemmer.suf32(stemmed)        # 3- then 2-letter suffixes
        stemmed = stemmer.waw(stemmed)          # leading connective 'و'
        stemmed = stemmer.norm(stemmed, num=2)  # initial hamza -> bare alif
    # Fall back to the untouched input when stemming produced an unknown word.
    return stemmed if stemmed in ar_spell else word
def Word_Steamer(self, arr):
    """Return a new list holding the ISRI stem of every word in *arr*."""
    stemmer = ISRIStemmer()
    return [stemmer.stem(word) for word in arr]
def get_root_word(word):
    """Return the ISRI root of a single Arabic *word*."""
    return ISRIStemmer().stem(word)