def getLemmatization(example_sent):
    word_tokens = Tokenization.tokenizationProcess(example_sent)
    filtered_sentence = []
    for w in word_tokens:
        # suf32() only strips common 2- and 3-letter Arabic suffixes;
        # it is light suffix removal rather than true lemmatization.
        filtered_sentence.append(nltk.ISRIStemmer().suf32(w))
    return filtered_sentence
def defineStemming(example_sent):
    word_tokens = Tokenization.tokenizationProcess(example_sent)
    filtered_sentence = []
    ps = nltk.ISRIStemmer()
    for w in word_tokens:
        filtered_sentence.append(ps.stem(w))
    return filtered_sentence
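A minimal sketch contrasting the two ISRIStemmer calls used in the snippets above: stem() runs the full ISRI root-extraction algorithm, while suf32() only removes 2- and 3-letter suffixes. The sample word is an arbitrary illustration.

import nltk

isri = nltk.ISRIStemmer()
word = 'يكتبون'
print(isri.stem(word))   # full root extraction
print(isri.suf32(word))  # suffix stripping only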
import re
import nltk
from nltk.corpus import wordnet as wn


class Normalizer:
    # Stop words are stored one per line; split them into a list so
    # membership tests match whole tokens rather than substrings.
    stopword_list = open('res/stop_words.txt', 'r').read().split('\n')
    stemmer = nltk.ISRIStemmer()

    def tokenize_text(self, text):
        tokens = nltk.word_tokenize(text)
        tokens = [token.strip() for token in tokens]
        return tokens

    def stemming_text(self, text):
        tokens = self.tokenize_text(text)
        filtered_tokens = [self.stemmer.stem(token) for token in tokens]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_special_characters(self, text):
        def remove_conjunction(token):
            # drop a leading waw conjunction from longer tokens
            if token[0] == 'و' and len(token) > 4:
                return token[1:]
            else:
                return token

        tokens = self.tokenize_text(text)
        pattern = re.compile(r'[?|.|:|;|,|"|\d|$|&|*|%|@|(|)|~]')
        filtered_tokens = filter(None, [pattern.sub('', remove_conjunction(token)) for token in tokens])
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_stopwords(self, text):
        tokens = self.tokenize_text(text)
        filtered_tokens = [token for token in tokens if token not in self.stopword_list]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def remove_repeated_characters(self, text):
        tokens = self.tokenize_text(text)
        repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
        match_substitution = r'\1\2\3'

        def replace(old_word):
            # keep the word as soon as WordNet recognises it,
            # otherwise keep collapsing repeated characters
            if wn.synsets(old_word):
                return old_word
            new_word = repeat_pattern.sub(match_substitution, old_word)
            return replace(new_word) if new_word != old_word else new_word

        correct_tokens = [replace(word) for word in tokens]
        filtered_text = ' '.join(correct_tokens)
        return filtered_text

    def normalize_corpus(self, corpus):
        normalized_corpus = []
        for text in corpus:
            text = self.remove_special_characters(text)
            text = self.remove_stopwords(text)
            text = self.remove_repeated_characters(text)
            text = self.stemming_text(text)
            text = self.remove_stopwords(text)
            normalized_corpus.append(text)
        return normalized_corpus
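A hedged usage sketch for the Normalizer class above. The sample corpus strings are made up, and the sketch assumes res/stop_words.txt exists, since the file is read when the class is defined; note that remove_repeated_characters only short-circuits on words WordNet recognises, so Arabic tokens simply have their repeated characters collapsed.

normalizer = Normalizer()
corpus = ['هذا مثال بسيط للتجربة', 'نص آخر للتطبيع']
print(normalizer.normalize_corpus(corpus))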
def getLemmatization(example_sent):
    lemmatizer = WordNetLemmatizer()  # instantiated but never used below
    word_tokens = Tokenization.tokenizationProcess(example_sent)
    lemmatized = []
    for word in word_tokens:
        lemmatized.append(nltk.ISRIStemmer().suf32(word))
    return lemmatized
def resolve_txt(root, info):
    text = info.context.get('txt')  # .encode('utf8')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”«»…]', '', text)
    # character class so each of these marks is removed individually
    text = re.sub('[-_،؟]', '', text)
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized = []
    for word in word_tokens:
        lemmatized.append(nltk.ISRIStemmer().suf32(word))
    return {"result": lemmatized}
def __init__(self, raw_data=None):
    self.analyzer = pyaramorph.Analyzer()
    self.stemmer = nltk.ISRIStemmer()
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.segmenter = nltk.data.load("tokenizers/punkt/english.pickle")
    if raw_data is not None:
        self.raw_data = raw_data
        self.org_data = raw_data
        self.analyze_text()
        self.ambig()
        self.load_corpus('Tashkeela')
        self.select_cand()
        self.print_result()
def root(word):
    st = nltk.ISRIStemmer()
    tokens = nltk.word_tokenize(word)
    result = [st.stem(w) for w in tokens]
    return result
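An illustrative call for root() above; because the input is tokenized first, multi-word strings work as well. The phrase is a made-up example, and the exact stems returned depend on the ISRI prefix/suffix tables.

print(root('المدرسة الكبيرة'))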
import re      # for pre-processing text
import string  # for pre-processing text
import nltk    # for processing texts
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# define arabic punctuations
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

st = nltk.ISRIStemmer()


def clean_text(text):
    """
    DESCRIPTION: clean the input text
    INPUT:  text: string
    OUTPUT: text: string after cleaning
    """
    # remove english letters
    text = re.sub("[a-zA-Z]", " ", str(text))
    # remove newlines
    text = re.sub('\n', ' ', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
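The clean_text snippet above is truncated. As a separate, hedged sketch (not the original author's continuation), this shows how punctuations_list, the NLTK stopword corpus, and the ISRI stemmer defined earlier are often combined into a full normalization step; the name normalize_tokens is illustrative.

def normalize_tokens(text):
    # strip both Arabic and English punctuation collected in punctuations_list
    text = text.translate(str.maketrans('', '', punctuations_list))
    tokens = nltk.word_tokenize(text)
    arabic_stopwords = set(stopwords.words('arabic'))
    # drop stop words, then reduce each remaining token to its ISRI stem
    return [st.stem(tok) for tok in tokens if tok not in arabic_stopwords]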
def __init__(self):
    self.analyzer = pyaramorph.Analyzer()
    self.stemmer = nltk.ISRIStemmer()
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.segmenter = nltk.data.load("tokenizers/punkt/english.pickle")