Example #1
from nltk.stem.isri import ISRIStemmer

def get_root(text):
    # ``text`` may be a raw string or an already tokenised list of words
    arabic_stemmer = ISRIStemmer()
    if not isinstance(text, list):
        word_list = text.split()
    else:
        word_list = text
    result = list()
    for word in word_list:  # iterate over the tokenised list, not the raw input
        root = arabic_stemmer.stem(word)
        result.append(root)
    return result
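A minimal usage sketch; the sample sentence is illustrative only, and the exact roots depend on the ISRI algorithm:

roots = get_root("الطلاب يكتبون الدروس")
print(roots)  # one ISRI root per whitespace-separated token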
Example #2
def lookUpWordScore(self, word, lexicon, use_lemma):
    stemmer = ISRIStemmer()
    # 1) exact match against the lexicon
    for key in lexicon:
        if key == word:
            return lexicon[key]
    # 2) match on the ISRI stem, at a quarter of the lexicon weight
    for key in lexicon:
        if stemmer.stem(key) == stemmer.stem(word):
            print(word)
            print(key)
            return lexicon[key] * 0.25
    # 3) fuzzy match by normalised edit distance, also at a quarter weight
    for key in lexicon:
        med = nltk.metrics.edit_distance(word, key)
        match = 1 - (float(med) / len(word))
        if match > 0.7:
            return lexicon[key] * 0.25
    return 0
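Because the method never reads self, it can be exercised directly for illustration. The sentiment lexicon and the query word below are hypothetical stand-ins; nltk and the stemmer just need to be importable in the same module:

import nltk
from nltk.stem.isri import ISRIStemmer

lexicon = {"جميل": 1.0, "سيء": -1.0}  # hypothetical word -> polarity lexicon
score = lookUpWordScore(None, "جميلة", lexicon, use_lemma=False)
print(score)  # full weight on an exact hit, 0.25 weight on a stem or fuzzy hit, else 0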
Example #4
import nltk as n
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

def preprocessing_test_data(user_string):
    # tokenise via a private NLTK attribute, as in the original snippet;
    # nltk.word_tokenize is the public route
    tokens_array = n.tokenize._treebank_word_tokenizer.tokenize(user_string)
    stemmer = ISRIStemmer()
    stop_words = stopwords.words('arabic')
    preprocessing_result_question = list()
    for word in tokens_array:
        word = stemmer.norm(word, num=1)      # remove diacritics representing Arabic short vowels
        if word not in stop_words:            # exclude stop words from being processed
            word = stemmer.pre32(word)        # remove length-three and length-two prefixes, in this order
            word = stemmer.suf32(word)        # remove length-three and length-two suffixes, in this order
            word = stemmer.waw(word)          # remove connective 'و' if it precedes a word beginning with 'و'
            word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
            preprocessing_result_question.append(word)
    return preprocessing_result_question
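A short usage sketch; the Arabic question is a made-up input, and the Arabic stop-word list has to be fetched once with nltk.download:

import nltk
nltk.download('stopwords')  # one-off download of the stop-word corpus

tokens = preprocessing_test_data("ما هي عاصمة مصر؟")
print(tokens)  # normalised, affix-stripped tokens with stop words removed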
Example #5
from nltk.stem.isri import ISRIStemmer

def light_stem_word(word):
    original_word = word
    arabic_stemmer = ISRIStemmer()
    # remove diacritics representing Arabic short vowels
    word = arabic_stemmer.norm(word, num=1)
    # exclude stop words from being processed
    if word not in arabic_stemmer.stop_words:
        # remove length three and length two prefixes in this order
        word = arabic_stemmer.pre32(word)
        # remove length three and length two suffixes in this order
        word = arabic_stemmer.suf32(word)
        # remove connective ‘و’ if it precedes a word beginning with ‘و’
        word = arabic_stemmer.waw(word)
        # normalize initial hamza to bare alif
        word = arabic_stemmer.norm(word, num=2)
    # ar_spell is assumed to be a set of valid Arabic word forms defined
    # elsewhere in the original project; fall back to the unstemmed word
    # when the light stem is not a recognised form
    if word not in ar_spell:
        return original_word
    else:
        return word
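To try the function in isolation, ar_spell can be stubbed with a tiny hypothetical vocabulary; in the original project it is presumably a much larger spell-check word list:

ar_spell = {"كتاب", "درس"}  # hypothetical stand-in for the real vocabulary
print([light_stem_word(w) for w in "الكتاب المدرسي".split()])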
Example #6
def Word_Steamer(self, arr):
    # stem every word in ``arr`` with the ISRI stemmer
    array = []
    stemmer = ISRIStemmer()
    for words in arr:
        array.append(stemmer.stem(words))
    return array
Example #7
from nltk.stem.isri import ISRIStemmer

def get_root_word(word):
    arabic_stemmer = ISRIStemmer()
    root = arabic_stemmer.stem(word)
    return root
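Finally, a one-line usage sketch for the single-word variant (the sample word is illustrative):

print(get_root_word("مكتبة"))  # prints the ISRI root of the sample word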