def __init__(self):
    self.preprocessed_docs = []
    self.normalizer = hazm.Normalizer()
    self.word_tokenizer = hazm.WordTokenizer()
    self.stemmer = hazm.Stemmer()
    self.stop_words = hazm.stopwords_list()
    self.persian_garbage = {
        u'÷': u'', u'ٰ': u'', u'،': ' ', u'؟': ' ', u'؛': '',
        u'َ': '', u'ُ': '', u'ِ': '', u'ّ': '', u'ٌ': '', u'ٍ': '',
        u'ئ': u'ی', u'ي': u'ی', u'ة': u'ه', u'ء': u'', u'ك': u'ک',
        u'ْ': u'', u'أ': u'ا', u'إ': u'ا', u'ؤ': u'و',
        u'×': u'', u'٪': u'', u'٬': u'', u'آ': u'ا', u'●': u''
    }
def similar(s1, s2):
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)
    stop_words = hazm.stopwords_list()
    list_s1 = [word for word in s1.split(" ") if word not in stop_words]
    list_s2 = [word for word in s2.split(" ") if word not in stop_words]
    stemmer = hazm.Stemmer()
    stem_s1 = [stemmer.stem(word) for word in list_s1]
    stem_s2 = [stemmer.stem(word) for word in list_s2]
    # Count the stems the two sentences have in common.
    same_words = set.intersection(set(stem_s1), set(stem_s2))
    return len(same_words)
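# --- Usage sketch (illustrative, not part of the original snippet) ---
# `similar` scores two Persian sentences by the number of shared stemmed
# content words; the example strings below are assumptions for demonstration.
print(similar("این کتاب بسیار خوب است", "کتاب خوبی خواندم"))  # prints the overlap count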
def prepare_text(text):
    # Relies on module-level `punctuations`, `normalizer` (hazm.Normalizer),
    # `stemmer` (hazm.Stemmer) and hazm's word_tokenize / stopwords_list.
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))
    # Drop every character that is not a Persian letter or whitespace.
    text = ' '.join(
        re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', text).split())
    text = text.strip()
    normalized_text = normalizer.normalize(text)
    words = word_tokenize(normalized_text)
    words = [w for w in words if w != '.']
    words = [w for w in words if w not in stopwords_list()]
    words = [stemmer.stem(w) for w in words]
    return words
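# --- Usage sketch (illustrative, not part of the original snippet) ---
# prepare_text expects the module to define `punctuations` (a string),
# `normalizer` and `stemmer`; the definitions below are assumptions so the
# example can run on its own.
import re
import hazm
from hazm import word_tokenize, stopwords_list

punctuations = '.,!?;:«»،؛؟'
normalizer = hazm.Normalizer()
stemmer = hazm.Stemmer()

print(prepare_text("کتاب‌های خوبی را در سال 2020 خواندم!"))  # list of stemmed content words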
def __init__(self):
    self.punctuations = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
        '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
        '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
        '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
        '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
        '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
        '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
        '•', '●'
    ]
    # Combining diacritics and related marks to strip from the text.
    self.diacritics_pattern = re.compile(
        "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
    self.emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+",
        flags=re.UNICODE)
    self.latin_characters_pattern = re.compile(
        "["
        "\u0041-\u007a"
        "\u00c0-\u036f"
        "\u0400-\u050f"
        "\u0342-\u03ff"
        "]")
    self.numbers_pattern = re.compile("[0-9]")
    # (pattern, replacement) pairs mapping exotic spaces and control
    # characters to a plain space, a zero-width non-joiner, or nothing.
    self.space_patterns = [
        (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
        (re.compile("[\f\r\t\n]"), ' '),
        (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
         '\u200c'),
        (re.compile(
            "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
            "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
    ]
    # First 200 hazm stopwords plus common clitics, copulas and number words.
    self.stopwords = hazm.stopwords_list()[:200] + [
        'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
        'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
        'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
        'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست',
        'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
        'نیستند'
    ]
    self.normalizer = parsivar.Normalizer()
    self.stemmer = parsivar.FindStems()
    self.lemmatizer = hazm.Lemmatizer()
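# --- Hypothetical helper (not part of the original snippet) ---
# Sketch of how the (pattern, replacement) pairs in `space_patterns` are
# typically used: each compiled regex is applied, in order, to the input text.
def apply_space_patterns(space_patterns, text):
    for pattern, replacement in space_patterns:
        text = pattern.sub(replacement, text)
    return text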
def __init__(self, feature_set, orientations=None, language='english'):
    self.language = language
    # Per-language lookup tables (assumed to be instance-level dicts).
    self.normalizer = {}
    self.stopwords = {}
    self.regex_words = {}
    self.hash_dictionary = {}
    self.normalizer[language] = hazm.Normalizer()
    if language == 'persian':
        self.stopwords[language] = hazm.stopwords_list()
        self.regex_words[language] = r"[\w']+|[.,!?;،؟؛]"
    else:
        self.stopwords[language] = set(stopwords.words('english'))
        self.regex_words[language] = r"[\w']+|[.,!?;]"
    if orientations:
        self.orientations = orientations
    self.feature_set = feature_set
    self.weights = {}
    self.hash_dictionary[self.language] = {}
def clean_fa(self, data):
    data.text = self.fa_normalize(data.text)
    data.text = self.tokenizer(data.text)
    stemmer = hazm.Stemmer()
    lemmatizer = hazm.Lemmatizer()
    stopwords = hazm.stopwords_list()
    alphabet = set(list("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"))
    data.text = data.apply(
        lambda row: self.stemLemmaStopWord(
            stemmer, lemmatizer, stopwords, alphabet, row.text
        ),
        axis=1,
    )
    return data
def __init__(self, mask=None, size=900,
             stop_words_addr=default_stop_words_path, mask_addr=None):
    self.hazm_normalizer = hazm.Normalizer()
    self.parsivar_normalizer = parsivar.Normalizer()
    self.stemmer = hazm.Stemmer()
    self.lemmatizer = hazm.Lemmatizer()
    self.stop_words = set(hazm.stopwords_list(stop_words_addr))
    mask = np.array(Image.open(mask_addr)) if mask_addr is not None else None
    self.generator = WordCloud(width=size,
                               height=size,
                               include_numbers=False,
                               persian_normalize=False,
                               collocations=True,
                               mask=mask,
                               background_color='white')
def search_wikipedia(cls, word):
    page = requests.get("https://fa.wikipedia.org/wiki/" + word)
    soup = BeautifulSoup(page.content, features="html.parser")
    word_tokenized = []
    try:
        content = soup.find_all('p')
        for paragraph in content:
            word_tokenized.append(word_tokenize(paragraph.get_text()))
        # Keep alphabetic, non-stopword tokens from every paragraph.
        filtered_words = []
        for tokens in word_tokenized:
            for token in tokens:
                if token not in stopwords_list() and token.isalpha():
                    filtered_words.append(token)
        most_common_words = collections.Counter(filtered_words).most_common(10)
        return most_common_words
    except Exception:
        return "error"
from hazm import stopwords_list


def get_stopwords():
    with open("stopwords.txt", 'r') as f:
        results = f.readline().split()
    return results


stopwords = stopwords_list()
punctuation = get_stopwords()
all_stopwords = punctuation + stopwords + ["NUM"] + [
    'آقا', 'آور', 'افزا', 'باش', 'بردار', 'بست', 'بند', 'توان', 'توانست',
    'دارا', 'دان', 'ده', 'رس', 'ریخت', 'ریز', 'سال', 'سو', 'شخص', 'شو',
    'هست', 'وقت', 'کس', 'کن', 'گذار', 'گذاشت', 'گرد', 'گشت', 'گو', 'گیر',
    'یاب'
] + ['بس']
import pickle

from hazm import Normalizer, Stemmer, Lemmatizer, sent_tokenize, word_tokenize, stopwords_list

stops = set(stopwords_list())


def load_dataset(file_name, column_name='question'):
    data = pickle.load(open(file_name, "rb"))
    statements = []
    for i in range(len(data)):
        statements.append(data[i][column_name])
    return statements


def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement


def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
zoomitComments.dtypes
zoomitComments.describe()
zoomitComments.columns
zoomitComments.head()

zoomitComments = zoomitComments.drop(
    ["ParentCommentid", "UpdateDate2", "CreateDate2", "UpdatedByUserId",
     "Name", "Email"], axis=1)
zoomitComments['Message'] = zoomitComments['Message'].astype(str)
# Strip <br /> tags left over from the HTML source.
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: reg.sub(r'<br\s*/?>', ' ', x))
zoomitComments['wordCount'] = zoomitComments["Message"].agg(
    lambda x: len(x.split(" ")))
zoomitComments['charCount'] = zoomitComments["Message"].agg(lambda x: len(x))
# Collapse runs of whitespace into a single space.
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: reg.sub(r'\s+', ' ', x))
# zoomitComments['Message']=zoomitComments['Message'].agg(lambda x: (' ').join(reg.sub('.','',[w for w in x.split() if reg.match('([\w]+\.)+[\w]+(?=[\s]|$)',w)]))

stopWords = hm.stopwords_list()
zoomitComments['#_of_StopWords'] = zoomitComments['Message'].agg(
    lambda x: len([w for w in x.split() if w in stopWords]))
stemWords = hm.Stemmer()
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: (' ').join([stemWords.stem(w) for w in x.split()]))

pubComment = zoomitComments.loc[zoomitComments['Status'] == 1, :].loc[:, ['Message']]
unpubComment = zoomitComments.loc[zoomitComments['Status'] == 0, :].loc[:, ['Message']]
len(unpubComment)
zoomitComments['Status'].unique()

import matplotlib.pyplot as pPlot
from PIL import Image
import hazm as hz
from PersianStemmer import PersianStemmer
import search_engine.words_lists as wl
from itertools import combinations
import re
import string
import search_engine.configurations as config

ps = PersianStemmer()
stop_words = hz.stopwords_list()
normalizer = hz.Normalizer()
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()


def process_single_document(doc_content):
    # 1: remove html tags and irrelevant contents
    cleaned_content_from_tag = remove_tags(doc_content)
    # 2: normalize text
    normalize_text = normalizer.normalize(cleaned_content_from_tag)
    # 3: tokenize
    words_token = hz.word_tokenize(normalize_text)
    config.pure_number_tokens += len(words_token)
    return words_token


def preprocess_single_word_in_query(word):
    # 1: normalized
    word = normalizer.normalize(word)
    # 2: lemmatized and stemmer
    word_lemmatized = lemmatizer.lemmatize(word)
def __init__(self):
    self.Normalizer = hazm.Normalizer()
    self.stopwords_list = hazm.stopwords_list()
    self.Stemmer = hazm.Stemmer()
import re
from typing import List

import emoji
from hazm import Normalizer, Stemmer, Lemmatizer, WordTokenizer, stopwords_list


class Preprocessor:
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tokenizer = WordTokenizer()
    stop_words = stopwords_list()

    @staticmethod
    def remove_noise(text: str) -> str:
        return Preprocessor.__remove_punctuation(
            Preprocessor.__remove_emojis(text))

    @staticmethod
    def remove_stop_words(tokens: List[str]) -> List[str]:
        return [t for t in tokens if t not in Preprocessor.stop_words]

    @staticmethod
    def __remove_emojis(text: str):
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            "]+",
            flags=re.UNICODE)
        first_cleaned_text = emoji_pattern.sub(r'', text)  # no emoji
        # get_emoji_regexp() is provided by older versions of the emoji package.
        return emoji.get_emoji_regexp().sub(r'', first_cleaned_text)

    @staticmethod
    def __remove_punctuation(text: str):
        try:
            return re.sub(
                r'[\.\?\!\,\:\;\،\(\)\؛\#\%\^\&\$\~\'\"\×\-\_\*\>\<\+\=\\\/]',
                '', text)
        except TypeError as e:
            print(e, text)

    @staticmethod
    def normalize(text: str) -> str:
        return Preprocessor.normalizer.normalize(text)

    @staticmethod
    def stem(word: str) -> str:
        return Preprocessor.stemmer.stem(word)

    @staticmethod
    def lemmatize(word: str) -> str:
        return Preprocessor.lemmatizer.lemmatize(word)

    @staticmethod
    def tokenize(text: str) -> List[str]:
        return Preprocessor.tokenizer.tokenize(text)

    @staticmethod
    def preprocess(text: str) -> str:
        cleaned_text = Preprocessor.remove_noise(str(text))
        normalized_text = Preprocessor.normalize(cleaned_text)
        tokens = Preprocessor.tokenize(normalized_text)
        none_stop_words = Preprocessor.remove_stop_words(tokens)
        # stems = [Preprocessor.stem(w) for w in tokens]
        lemmatized = [Preprocessor.lemmatize(w) for w in none_stop_words]
        return ' '.join(lemmatized)
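# --- Usage sketch (illustrative, not part of the original snippet) ---
# Preprocessor.preprocess chains noise removal, normalization, tokenization,
# stopword filtering and lemmatization; the sample comment is an assumption.
print(Preprocessor.preprocess("این فیلم را خیلی دوست داشتم! 😍"))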
# Create label/category-id lookup dictionaries.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Label']].values)

# Plot how often each emoji label occurs.
fig = plt.figure(figsize=(8, 6))
df2.groupby('Label').comment.count().sort_values().plot.bar(
    ylim=0, title='Term Frequency of each Emoji \n')
plt.xlabel('\n Number of occurrences', fontsize=10)
plt.show()

# Clean the comments.
normalizer = hazm.Normalizer()
tokenizer = hazm.SentenceTokenizer()
tokens = hazm.word_tokenize
S_Words = list(hazm.stopwords_list())

# Text representation.
tfidf = TfidfVectorizer(lowercase=False,
                        preprocessor=normalizer.normalize,
                        tokenizer=tokens,
                        ngram_range=(1, 2),
                        stop_words=S_Words)
comments = df2.comment
features = tfidf.fit_transform(comments).toarray()
labels = df2.category_id

# Compare all the models.
models = [
    MultinomialNB(),
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0),
punctuations_list = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation


def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)


words = remove_punctuations(words)
# Remove URLs and @mentions.
words = re.sub(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    '', words, flags=re.MULTILINE)
words = re.sub(r"@(\w+)", ' ', words, flags=re.MULTILINE)

wordcloud = WordCloudFa(persian_normalize=True,
                        stopwords=list(STOPWORDS) + hazm.stopwords_list(),
                        include_numbers=False,
                        background_color='white',
                        width=700,
                        height=500)
frequencies = wordcloud.process_text(words)
wc = wordcloud.generate_from_frequencies(frequencies)
image = wc.to_image()
st.image(image)

# Dataframe
st.subheader('**Data**')
st.write(data)

# Random Tweet
col1, col2 = st.beta_columns(2)
with col1:
import hazm

stopwords = ['ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
             'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود',
             'شد', 'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه',
             'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم',
             'هست', 'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم',
             'نیستید', 'نیستند'] + hazm.stopwords_list()[:200]

punctuations = ['.', '،', '!', '؟', '?', ':', '؛', '(', ')', '{', '}', '[',
                ']', '«', '»', '-', '/', '٪', '%', '"', "'", '،', '_', '=',
                '<', '>', '+', '@', '*', ',', ';', '&', '#', '٬', '`', '|',
                ',']

diacritics = ['\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650',
              '\u0651', '\u0652', '\u0653', '\u0654', '\u0655']

character_mapping = {
    'ا': ['ا', 'إ', 'أ', 'آ', 'ٱ'],
    'و': ['و', 'ؤ'],
    'ی': ['ی', 'ي', 'ئ'],
    'ک': ['ک', 'ك'],
    'ه': ['ه', 'ة', 'ۀ'],
}

english_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
half_spaces = ['\u200C', '\u200f', '\xa0']

multi_words_token = ['چنان چه', 'بنا بر این', 'مع ذلک', 'فی مابین',
                     'فوق العاده', 'بی شک', 'در خصوص', 'این که', 'به دلیل',
                     'به خاطر', 'بر اساس', 'از جمله', 'با توجه به',
                     'اشاره به', 'بین الملل', 'در راستا', 'در اختیار',
                     'خاطر نشان', 'ما فوق', 'بدین شکل']

digit_characters = '1234567890۱۲۳۴۵۶۷۸۹۰١٢٣٤٥٦٧٨٩'

# Heaps Law: y = 0.49 * x + 1.60
# Zipf Law: y = -1.73 * x + 8.36 ?????
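# --- Usage sketch (illustrative, not part of the original snippet) ---
# character_mapping keys are the canonical Persian letters; a flat
# variant -> canonical table can be built from it and applied with str.translate.
translation_table = str.maketrans(
    {variant: canonical
     for canonical, variants in character_mapping.items()
     for variant in variants})
print("كتاب إيران".translate(translation_table))  # -> کتاب ایران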
for j in dd['title'][i]:
    temp = j.split('\u200c')
    for q in temp:
        temp2.append(q)
dd['title'][i] = copy.deepcopy(temp2)


# ## Remove stopwords and Special Chars

# Because they are useless to the meaning of sentences and, as a result, to prices.

# In[9]:

special_chars = [
    '!', '"', '#', '(', ')', '*', ',', '-', '.', '/', '\'', '«', '»', '،',
    '؛', '؟', '.', '…', '$'
]
stopwords = hazm.stopwords_list()

# Drop stopwords and special characters from the description and title tokens.
for i in range(len(dd2)):
    for j in dd2['desc'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd2['desc'][i]):
                dd2['desc'][i].remove(j)
    for j in dd2['title'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd2['title'][i]):
                dd2['title'][i].remove(j)

for i in range(len(dd)):
    for j in dd['desc'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd['desc'][i]):