def stem_data(dat):
    """Normalize, POS-tag, and stem Persian text, returning content-word stems.

    Relies on module-level ``tagger`` (a hazm.POSTagger) and ``stop_words``.
    """
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    lemmatizer = hazm.Lemmatizer()
    stemmer = hazm.Stemmer()
    words = []
    for s in hazm.sent_tokenize(dat):
        tagged = tagger.tag(hazm.word_tokenize(s))
        # Drop stop words before lemmatization.
        filtered = [token for token in tagged if token[0] not in stop_words]
        for word, pos in filtered:
            stemmed = stemmer.stem(lemmatizer.lemmatize(word, pos=pos))
            # Skip empty stems and verb lemmas of the form 'past#present'.
            if stemmed and '#' not in stemmed:
                words.append(stemmed)
    return words
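
# Usage sketch for stem_data above. The function assumes module-level
# ``tagger`` and ``stop_words``; the model path below is an assumption,
# not something the snippet specifies.
import hazm

tagger = hazm.POSTagger(model='resources/postagger.model')  # hypothetical path
stop_words = set(hazm.stopwords_list())
print(stem_data('ما بسیار کتاب می‌خوانیم'))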
def prepareText(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    # Stem every token; hazm.Stemmer strips common Persian suffixes.
    words = [stemmer.stem(token) for token in tokens]
    return words
def countTextWords(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return len(words)
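
# countTextWords repeats the prepareText pipeline, and stemming does not
# change the token count, so the same result is available by reuse:
assert countTextWords('کتاب‌ها را خواندم') == len(prepareText('کتاب‌ها را خواندم'))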
def __init__(self):
    self.preprocessed_docs = []
    self.normalizer = hazm.Normalizer()
    self.word_tokenizer = hazm.WordTokenizer()
    self.stemmer = hazm.Stemmer()
    self.stop_words = hazm.stopwords_list()
    # Map Arabic-presentation characters, diacritics, and stray symbols to
    # their Persian equivalents, or drop them.
    self.persian_garbage = {
        u'÷': u'', u'ٰ': u'', u'،': ' ', u'؟': ' ', u'؛': '',
        u'َ': '', u'ُ': '', u'ِ': '', u'ّ': '', u'ٌ': '', u'ٍ': '',
        u'ئ': u'ی', u'ي': u'ی', u'ة': u'ه', u'ء': u'', u'ك': u'ک',
        u'ْ': u'', u'أ': u'ا', u'إ': u'ا', u'ؤ': u'و',
        u'×': u'', u'٪': u'', u'٬': u'', u'آ': u'ا', u'●': u'',
    }
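
# A minimal sketch of applying the persian_garbage table above. The method
# name `scrub` is hypothetical; every key is a single character, so
# str.translate with an ordinal-keyed table works.
def scrub(self, text):
    table = {ord(src): dst for src, dst in self.persian_garbage.items()}
    return text.translate(table)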
def textNormalizer(lousyCollection):
    normalizer = hz.Normalizer()
    lemmatizer = hz.Lemmatizer()
    stemmer = hz.Stemmer()
    # delete_Punc is a helper supplied elsewhere in the source.
    docs = [delete_Punc(normalizer.normalize(doc)) for doc in lousyCollection]
    # Replace each document with its list of stemmed, then lemmatized, tokens.
    # (Index-based in-place mutation would misfire on duplicate tokens.)
    return [
        [lemmatizer.lemmatize(stemmer.stem(token))
         for token in hz.word_tokenize(doc)]
        for doc in docs
    ]
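
# textNormalizer above calls delete_Punc, which the snippet does not define.
# A minimal sketch, assuming it maps ASCII and Persian punctuation to spaces:
import string

def delete_Punc(text):
    table = str.maketrans({p: ' ' for p in string.punctuation + '،؛؟«»'})
    return text.translate(table)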
def clean_fa(self, data):
    data.text = self.fa_normalize(data.text)
    data.text = self.tokenizer(data.text)
    stemmer = hazm.Stemmer()
    lemmatizer = hazm.Lemmatizer()
    stopwords = hazm.stopwords_list()
    # Tokens are filtered against the Persian alphabet downstream.
    alphabet = set("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی")
    data.text = data.apply(
        lambda row: self.stemLemmaStopWord(
            stemmer, lemmatizer, stopwords, alphabet, row.text
        ),
        axis=1,
    )
    return data
def similar(s1, s2):
    """Count the stems shared by two Persian sentences, ignoring stop words."""
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)
    stop_words = set(hazm.stopwords_list())
    list_s1 = [word for word in s1.split(' ') if word not in stop_words]
    list_s2 = [word for word in s2.split(' ') if word not in stop_words]
    # Compare stems on both sides so inflected forms still match.
    stemmer = hazm.Stemmer()
    stems_s1 = {stemmer.stem(word) for word in list_s1}
    stems_s2 = {stemmer.stem(word) for word in list_s2}
    same_words = stems_s1 & stems_s2
    return len(same_words)
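
# Usage sketch for similar above, on two made-up sentences that should
# share the stem 'کتاب':
print(similar('کتاب‌های زیادی خواندم', 'او کتاب می‌خواند'))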
def stemming(self):
    if self.persian:
        stemmer = hazm.Stemmer()
        lemmatizer = hazm.Lemmatizer()
        # Stem first, then lemmatize the stemmed form.
        for i in range(len(self.tokens)):
            self.tokens[i] = lemmatizer.lemmatize(stemmer.stem(self.tokens[i]))
    else:
        porter = nltk.PorterStemmer()
        self.tokens = [porter.stem(word) for word in self.tokens]
        lemma = nltk.WordNetLemmatizer()
        # Lemmatize as verbs first, then as nouns.
        self.tokens = [lemma.lemmatize(word, pos="v") for word in self.tokens]
        self.tokens = [lemma.lemmatize(word, pos="n") for word in self.tokens]
def __init__(self, mask=None, size=900,
             stop_words_addr=default_stop_words_path, mask_addr=None):
    self.hazm_normalizer = hazm.Normalizer()
    self.parsivar_normalizer = parsivar.Normalizer()
    self.stemmer = hazm.Stemmer()
    self.lemmatizer = hazm.Lemmatizer()
    self.stop_words = set(hazm.stopwords_list(stop_words_addr))
    # The mask argument is recomputed from mask_addr; without an image the
    # cloud is a plain square.
    mask = np.array(Image.open(mask_addr)) if mask_addr is not None else None
    self.generator = WordCloud(width=size,
                               height=size,
                               include_numbers=False,
                               persian_normalize=False,
                               collocations=True,
                               mask=mask,
                               background_color='white')
def word_counter(self, text: str) -> (float, dict):
    text = text.lower()
    # Replace separators and punctuation with spaces before tokenizing.
    text = text.translate(str.maketrans({
        '#': ' ', '$': ' ', '/': ' ', '+': ' ', '=': ' ', ':': ' ',
        ',': ' ', ';': ' ', '؛': ' ', '،': ' ', '.': ' ', '!': ' ',
        '؟': ' ', '?': ' ', '«': ' ', '»': ' ', '(': ' ', ')': ' ',
        '_': ' ', '-': ' ', '@': ' ',
    }))
    text = hazm.Normalizer().normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    keywords_dic = {word: 0 for word in self.keywords.keys()}
    value = 0.0
    for token in tokens:
        stemmed_word = stemmer.stem(token)
        if stemmed_word in keywords_dic:
            keywords_dic[stemmed_word] += 1
            if keywords_dic[stemmed_word] == 1:  # count each word only once
                value += self.keywords[stemmed_word]
        if stemmed_word in self.filter_words:
            # A filter word disqualifies the whole text.
            return 0, {}
    return value, keywords_dic
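
# Hypothetical harness for word_counter above: its class is not shown, so a
# stand-in with the two attributes the method reads is assumed.
class KeywordScorer:
    keywords = {'کتاب': 1.0, 'فیلم': 0.5}  # made-up stems and weights
    filter_words = {'اسپم'}                 # made-up blocklist
    word_counter = word_counter

score, counts = KeywordScorer().word_counter('کتاب‌ها را خواندم!')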
def preprocess_farsi(text):
    # Strip wiki-markup tokens before anything else.
    prohibitedWords = ['[[', ']]', '{{', '}}', '{|', '|', '*', '==', '=',
                       "'''", '_']
    big_regex = re.compile('|'.join(map(re.escape, prohibitedWords)))
    new_text = big_regex.sub(" ", text)
    # print(new_text)
    ### Remove English characters
    new_text = re.sub(r'[a-zA-Z]', '', new_text)
    ### Remove punctuation
    new_text = re.sub(r'[^\w\s]', ' ', new_text)
    normalizer = hazm.Normalizer(remove_extra_spaces=True,
                                 persian_style=True,
                                 persian_numbers=True,
                                 remove_diacritics=True,
                                 affix_spacing=True,
                                 token_based=False,
                                 punctuation_spacing=True)
    new_text = normalizer.normalize(new_text)
    ### Remove numbers (the normalizer converted Western digits to Persian)
    new_text = re.sub(r'[۱۲۳۴۵۶۷۸۹۰]', ' ', new_text)
    ### Not in HAZM
    # new_text = new_text.replace('گی','ه')
    tokens = hazm.word_tokenize(new_text)
    stemmer = hazm.Stemmer()
    tokens = [word.replace('\u200c', '') for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [word for word in tokens if word != '']
    return tokens
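
# Usage sketch for preprocess_farsi above, on a made-up bit of wiki markup:
print(preprocess_farsi("'''کتاب‌ها''' در [[سال]] ۱۴۰۰ چاپ شدند."))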
def get_corrected_word(word):
    # Pick a stemmer based on the script of the first character.
    if word[0] in string.ascii_letters:
        stemmer = nltk.PorterStemmer()
    else:
        stemmer = hazm.Stemmer()
    word = stemmer.stem(word)
    if word in positional_indexer.inverted_index:
        return word
    # Shortlist vocabulary words whose bigram sets are close to the query's.
    good_words = []
    for w in positional_indexer.get_all_words():
        if nltk.jaccard_distance(set(nltk.ngrams(w, n=2)),
                                 set(nltk.ngrams(word, n=2))) < 0.3:
            good_words.append(w)
    # The sentinel is far from any real word, so the first candidate wins.
    best_word = '###########################'
    for w in good_words:
        if nltk.edit_distance(w, word) < nltk.edit_distance(best_word, word):
            best_word = w
    return best_word
def __init__(self, rouge_types, use_stemmer=False, lang='en'):
    """Initializes a new RougeScorer.

    Valid rouge types that can be computed are:
      rougeN (e.g. rouge1, rouge2): n-gram based scoring.
      rougeL: Longest common subsequence based scoring.

    Args:
      rouge_types: A list of rouge types to calculate.
      use_stemmer: Bool indicating whether a stemmer should be used to strip
        word suffixes to improve matching; hazm's stemmer is used for Persian
        and the Porter stemmer otherwise.
      lang: Language of the input ('fa' for Persian).
    """
    self.rouge_types = rouge_types
    self.lang = lang
    if lang == 'fa':
        self._stemmer = hazm.Stemmer() if use_stemmer else None
    else:
        self._stemmer = porter.PorterStemmer() if use_stemmer else None
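
# Instantiation sketch for the Persian branch above, assuming the class is
# named RougeScorer as the docstring suggests; score() is defined elsewhere
# in the class, so only construction is shown:
scorer = RougeScorer(['rouge1', 'rougeL'], use_stemmer=True, lang='fa')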
normalizer = hazm.Normalizer()
# normalizer.normalize('اصلاح نويسه ها و استفاده از نیم‌فاصله پردازش را آسان مي كند')
# 'اصلاح نویسه‌ها و استفاده از نیم‌فاصله پردازش را آسان می‌کند'

# sentence tokenizer
# hazm.sent_tokenize('ما هم برای وصل کردن آمدیم! ولی برای پردازش، جدا بهتر نیست؟')
# ['ما هم برای وصل کردن آمدیم!', 'ولی برای پردازش، جدا بهتر نیست؟']

# word tokenizer
# hazm.word_tokenize('ولی برای پردازش، جدا بهتر نیست؟')
# ['ولی', 'برای', 'پردازش', '،', 'جدا', 'بهتر', 'نیست', '؟']

# Stemmer
stemmer = hazm.Stemmer()
# stemmer.stem('کتاب‌ها')
# 'کتاب'

# Lemmatizer
lemmatizer = hazm.Lemmatizer()
# lemmatizer.lemmatize('می‌روم')
# 'رفت#رو'

# Tagger
# tagger = hazm.POSTagger(model='resources/postagger.model')
# tagger.tag(hazm.word_tokenize('ما بسیار کتاب می‌خوانیم'))
# [('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')]
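
# A runnable end-to-end sketch of the walkthrough above: normalize, split
# into sentences, tokenize, then stem and lemmatize each token. The
# POSTagger is omitted because it needs a downloaded model file.
import hazm

normalizer = hazm.Normalizer()
stemmer = hazm.Stemmer()
lemmatizer = hazm.Lemmatizer()

text = normalizer.normalize('ما هم برای وصل کردن آمدیم! ولی برای پردازش، جدا بهتر نیست؟')
for sentence in hazm.sent_tokenize(text):
    for token in hazm.word_tokenize(sentence):
        print(token, stemmer.stem(token), lemmatizer.lemmatize(token))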
class TextNormalizer:
    # Latin letters, Arabic punctuation, and diacritics to delete from
    # Persian text.
    to_remove_fa = ("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
                    + "،؛,|:»«َُِّ<>؟÷×"
                    + "٬ًٌٍؘَؙُؚِّْٰٕٖٜٟؐؑؒؓؔؕؖؗٓٔٗ٘ٙٚٛٝٞ")
    regex_fa_space = re.compile('[%s]' % re.escape(string.punctuation))
    regex_fa_none = re.compile('[%s]' % re.escape(to_remove_fa))
    # Character replacements: unify Arabic forms with their Persian
    # equivalents and map Persian digits to ASCII.
    to_replace_fa = [
        ("\u200c", " "), ("ـ", ""), ("آ", "ا"), ("ۀ", "ه"), ("ة", "ه"),
        ("ي", "ی"), ("ئ", "ی"), ("ء", ""), ("أ", "ا"), ("إ", "ا"),
        ("ؤ", "و"), ("ك", "ک"),
        ("۰", "0"), ("۱", "1"), ("۲", "2"), ("۳", "3"), ("۴", "4"),
        ("۵", "5"), ("۶", "6"), ("۷", "7"), ("۸", "8"), ("۹", "9"),
    ]
    stemmer = hazm.Stemmer()
    regex_en_space = re.compile('[%s]' % re.escape(string.punctuation))
    regex_en_unwanted = re.compile(r'[^A-Za-z0-9\s]+')
    porter_stemmer = nltk.stem.PorterStemmer()

    @staticmethod
    def prepare_text(text, lang="fa", tokenize=True):
        if lang == "fa":
            return TextNormalizer.prepare_persian_text(text, tokenize)
        return TextNormalizer.prepare_english_text(text, tokenize)

    @staticmethod
    def get_word_language(text):
        # Inspect at most the first ten characters for an Arabic-script name.
        farsi = False
        i = 0
        for ch in text:
            i += 1
            name = unicodedata.name(ch).lower()
            if 'arabic' in name or 'farsi' in name or 'persian' in name:
                farsi = True
                break
            if i == 10:
                break
        return "fa" if farsi else "en"

    @staticmethod
    def prepare_english_text(text, tokenize):
        t = text.casefold()
        t = TextNormalizer.regex_en_space.sub(' ', t)
        t = TextNormalizer.regex_en_unwanted.sub(' ', t)
        tokens = nltk.tokenize.word_tokenize(t)
        stemmed_tokens = []
        for x in tokens:
            word = TextNormalizer.porter_stemmer.stem(x)
            if word != "":
                stemmed_tokens.append(word)
        if tokenize:
            return stemmed_tokens
        return " ".join(stemmed_tokens)

    @staticmethod
    def prepare_persian_text(text, tokenize):
        t = text
        for tup in TextNormalizer.to_replace_fa:
            for ch in tup[0]:
                t = t.replace(ch, tup[1])
        t = TextNormalizer.regex_fa_space.sub(' ', t)
        t = TextNormalizer.regex_fa_none.sub('', t)
        # hazm joins multiword expressions with '_'; split those back apart.
        tokens2 = hazm.word_tokenize(t)
        tokens = []
        for x in tokens2:
            tokens.extend(x.split("_"))
        stemmed_tokens = []
        for x in tokens:
            word = TextNormalizer.stemmer.stem(x)
            if word != "":
                stemmed_tokens.append(word)
        if tokenize:
            return stemmed_tokens
        return " ".join(stemmed_tokens)
# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()
lexicon_file_name = 'final_lexi'
data_path = './data/'
lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
    # Minimal sketch (an assumption, not the original body): count
    # lemmatized stems into a bag of words.
    counts = defaultdict(int)
    for word in hz.word_tokenize(normalizer.normalize(text)):
        counts[lemmatizer.lemmatize(stemmer.stem(word))] += 1
    return counts
import hazm as Hazm
import sys
from StopWords import stop_words
import re
import json
from wordfreq import zipf_frequency

if len(sys.argv) < 2:
    print('error')
    sys.exit()

raw_text = str(sys.argv[1])
normalizer_instance = Hazm.Normalizer()
lemmatizer_instance = Hazm.Lemmatizer()
stem_finder_instance = Hazm.Stemmer()

# Replace all non-Persian characters with spaces.
remove_non_persian_regex = re.compile('[^آ-ی]')
raw_text = remove_non_persian_regex.sub(' ', raw_text)

normalized_text = normalizer_instance.normalize(raw_text)
sentences = Hazm.sent_tokenize(normalized_text)
result_tokens = list()
less_accurate_tokens = list()


def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
            exists = True
            break
    if not exists:
        result_tokens.append(parsed_token)
zoomitComments.columns
zoomitComments.head()
zoomitComments = zoomitComments.drop(
    ["ParentCommentid", "UpdateDate2", "CreateDate2", "UpdatedByUserId",
     "Name", "Email"], axis=1)
zoomitComments['Message'] = zoomitComments['Message'].astype(str)
# Strip HTML line breaks from the messages.
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: reg.sub(r'<br\s*/?>', ' ', x))
zoomitComments['wordCount'] = zoomitComments['Message'].agg(
    lambda x: len(x.split(' ')))
zoomitComments['charCount'] = zoomitComments['Message'].agg(lambda x: len(x))
# Collapse runs of whitespace.
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: reg.sub(r'\s+', ' ', x))
# zoomitComments['Message']=zoomitComments['Message'].agg(lambda x: (' ').join(reg.sub('.','',[w for w in x.split() if reg.match('([\w]+\.)+[\w]+(?=[\s]|$)',w)]))

stopWords = hm.stopwords_list()
zoomitComments['#_of_StopWords'] = zoomitComments['Message'].agg(
    lambda x: len([w for w in x.split() if w in stopWords]))
stemWords = hm.Stemmer()
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: ' '.join([stemWords.stem(w) for w in x.split()]))

pubComment = zoomitComments.loc[zoomitComments['Status'] == 1, :].loc[:, ['Message']]
unpubComment = zoomitComments.loc[zoomitComments['Status'] == 0, :].loc[:, ['Message']]
len(unpubComment)
zoomitComments['Status'].unique()

import matplotlib.pyplot as pPlot
from PIL import Image

commentsWord = ""
def persian_stemmer(tokens):
    hazm_stemmer = hazm.Stemmer()
    return [hazm_stemmer.stem(token) for token in tokens]
def stem(self, word):
    # Note: constructs a fresh Stemmer on every call.
    return hazm.Stemmer().stem(word)
def stem(self, token_list):
    stemmer = hazm.Stemmer()
    return [stemmer.stem(token) for token in token_list]
def __init__(self):
    self.Normalizer = hazm.Normalizer()
    self.stopwords_list = hazm.stopwords_list()
    self.Stemmer = hazm.Stemmer()
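
# A minimal sketch of how the components initialized above might compose
# into a preprocessing step; the method name `preprocess` is an assumption:
def preprocess(self, text):
    text = self.Normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    return [self.Stemmer.stem(t) for t in tokens if t not in self.stopwords_list]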