def stem_data(dat):
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sent = hazm.sent_tokenize(dat)
    words = []
    for s in sent:
        tagged = list(tagger.tag(hazm.word_tokenize(s)))
        new_tag = list(tagged)
        for token in tagged:
            if token[0] in stop_words:
                new_tag.remove(token)
        lemmatizer = hazm.Lemmatizer()
        for token in new_tag:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmer = hazm.Stemmer()
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and ('#' not in stemmed):
                words.append(stemmed)
    return words
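# A minimal usage sketch for stem_data above. The function relies on two
# module-level names that the snippet does not show, `tagger` and `stop_words`;
# the definitions below (including the model path) are assumptions for
# illustration only.
import hazm

tagger = hazm.POSTagger(model='resources/postagger.model')
stop_words = set(hazm.stopwords_list())
print(stem_data('ما بسیار کتاب می‌خوانیم.'))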
def __init__(self):
    self.punctuations = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
        '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
        '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
        '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
        '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
        '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
        '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
        '•', '●'
    ]
    self.diacritics_pattern = re.compile(
        "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
    self.emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+", flags=re.UNICODE)
    self.latin_characters_pattern = re.compile(
        "["
        "\u0041-\u007a"
        "\u00c0-\u036f"
        "\u0400-\u050f"
        "\u0342-\u03ff"
        "]")
    self.numbers_pattern = re.compile("[0-9]")
    self.space_patterns = [
        (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
        (re.compile("[\f\r\t\n]"), ' '),
        (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
         '\u200c'),
        (re.compile(
            "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
            "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
    ]
    self.stopwords = hazm.stopwords_list()[:200] + [
        'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
        'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
        'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
        'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست',
        'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
        'نیستند'
    ]
    self.normalizer = parsivar.Normalizer()
    self.stemmer = parsivar.FindStems()
    self.lemmatizer = hazm.Lemmatizer()
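# A sketch of how the patterns compiled above might be applied. The class's
# actual cleaning methods are not shown in this snippet, so `_strip_noise`
# here is a hypothetical helper written only to illustrate the data structures.
def _strip_noise(self, text):
    text = self.emojis_pattern.sub(' ', text)           # drop emoji
    text = self.diacritics_pattern.sub('', text)        # drop diacritics
    text = self.numbers_pattern.sub('', text)           # drop Western digits
    for pattern, replacement in self.space_patterns:    # normalize space-like chars
        text = pattern.sub(replacement, text)
    return ''.join(ch for ch in text if ch not in self.punctuations)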
def my_lemmatizer(string):
    ps = PersianStemmer()
    lemmatizer = hazm.Lemmatizer()
    current = ps.run(string)
    current = lemmatizer.lemmatize(string)
    # new = stemmer.stem(current)
    # while current != new:
    #     current = new
    #     new = stemmer.stem(current)
    # result = new
    if "#" in current:
        # return the present-tense stem (bon-e mozare')
        result = current.split("#")[1]
    else:
        result = current
    return result
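# Illustration of the '#' convention handled above: for verbs, hazm's
# Lemmatizer returns "past_stem#present_stem", so element [1] of the split is
# the present-tense stem. Assumes hazm is installed; PersianStemmer comes from
# the snippet's original module and is not shown here.
import hazm

print(hazm.Lemmatizer().lemmatize('می‌روم'))  # -> 'رفت#رو'
# my_lemmatizer('می‌روم') would therefore return 'رو'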
def textNormalizer(lousyCollection):
    docs = list()
    normalizer = hz.Normalizer()
    lemmatizer = hz.Lemmatizer()
    stemmer = hz.Stemmer()
    for i in range(len(lousyCollection)):
        normalized = normalizer.normalize(lousyCollection[i])
        docs.append(delete_Punc(normalized))
    # replace each document with its list of stemmed, lemmatized tokens;
    # enumerate avoids the index() lookups, which misplace duplicate items
    for doc_idx, doc in enumerate(docs):
        tokens = hz.word_tokenize(doc)
        for token_idx, token in enumerate(tokens):
            tokens[token_idx] = lemmatizer.lemmatize(stemmer.stem(token))
        docs[doc_idx] = tokens
    return docs
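# Minimal usage sketch for textNormalizer, assuming `hz` is `import hazm as hz`
# and that `delete_Punc` (defined elsewhere in the original module) strips
# punctuation; the no-op stand-in below exists purely for illustration.
import hazm as hz

def delete_Punc(text):  # hypothetical stand-in for the original helper
    return text

print(textNormalizer(['ما بسیار کتاب می‌خوانیم.']))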
def clean_fa(self, data):
    data.text = self.fa_normalize(data.text)
    data.text = self.tokenizer(data.text)
    stemmer = hazm.Stemmer()
    lemmatizer = hazm.Lemmatizer()
    stopwords = hazm.stopwords_list()
    alphabet = set(list("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"))
    data.text = data.apply(
        lambda row: self.stemLemmaStopWord(
            stemmer, lemmatizer, stopwords, alphabet, row.text
        ),
        axis=1,
    )
    return data
def stemming(self):
    if self.persian:
        stemmer = hazm.Stemmer()
        lemmatizer = hazm.Lemmatizer()
        for i in range(len(self.tokens)):
            self.tokens[i] = lemmatizer.lemmatize(
                stemmer.stem(self.tokens[i]))
    else:
        porter = nltk.PorterStemmer()
        self.tokens = [porter.stem(word) for word in self.tokens]
        lemma = nltk.WordNetLemmatizer()
        self.tokens = [
            lemma.lemmatize(word, pos="v") for word in self.tokens
        ]
        self.tokens = [
            lemma.lemmatize(word, pos="n") for word in self.tokens
        ]
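# The Persian branch above chains Stemmer and Lemmatizer; a quick sketch of
# what that composition does to a single token (assumes hazm is installed):
import hazm

stemmer = hazm.Stemmer()
lemmatizer = hazm.Lemmatizer()
print(stemmer.stem('کتاب‌ها'))                        # 'کتاب'
print(lemmatizer.lemmatize(stemmer.stem('کتاب‌ها')))  # stays 'کتاب'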
def __init__(self, mask=None, size=900,
             stop_words_addr=default_stop_words_path, mask_addr=None):
    self.hazm_normalizer = hazm.Normalizer()
    self.parsivar_normalizer = parsivar.Normalizer()
    self.stemmer = hazm.Stemmer()
    self.lemmatizer = hazm.Lemmatizer()
    self.stop_words = set(hazm.stopwords_list(stop_words_addr))
    mask = np.array(
        Image.open(mask_addr)) if mask_addr is not None else None
    self.generator = WordCloud(width=size,
                               height=size,
                               include_numbers=False,
                               persian_normalize=False,
                               collocations=True,
                               mask=mask,
                               background_color='white')
def create_emotion_embedding(word_emotion_path, token_field, embedding_dim):
    """
    create_emotion_embedding builds an emotional embedding matrix.
    :param word_emotion_path: path to the pickled emotional dictionary
    :param token_field: token_field
    :param embedding_dim: dimension of the emotional embedding
    :return: weight_matrix: emotional embedding
    """
    lemmatizer = hazm.Lemmatizer()
    # load the pickled dictionary of emotional embeddings
    with open(word_emotion_path, "rb") as file:
        word_emotion_dict = pkl.load(file)
    # create weight_matrix as a zero matrix
    weight_matrix = np.zeros((len(token_field.vocab), embedding_dim))
    for token, idx in token_field.vocab.stoi.items():
        lemma_token = lemmatizer.lemmatize(token)
        emotion_embedding = word_emotion_dict.get(lemma_token)
        if emotion_embedding is not None:
            weight_matrix[idx] = emotion_embedding
    return weight_matrix
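# Sketch of the pickled dictionary create_emotion_embedding expects: a dict
# mapping lemmas to fixed-length emotion vectors (embedding_dim entries each).
# The words, vector values, and file name below are made-up placeholders.
import pickle as pkl

word_emotion_dict = {
    'خوشحال': [0.9, 0.1, 0.0],  # hypothetical 3-dimensional emotion vector
    'غمگین': [0.1, 0.8, 0.1],
}
with open('word_emotion.pkl', 'wb') as file:
    pkl.dump(word_emotion_dict, file)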
# Sentence tokenizer
# hazm.sent_tokenize('ما هم برای وصل کردن آمدیم! ولی برای پردازش، جدا بهتر نیست؟')
# ['ما هم برای وصل کردن آمدیم!', 'ولی برای پردازش، جدا بهتر نیست؟']

# Word tokenizer
# hazm.word_tokenize('ولی برای پردازش، جدا بهتر نیست؟')
# ['ولی', 'برای', 'پردازش', '،', 'جدا', 'بهتر', 'نیست', '؟']

# Stemmer
stemmer = hazm.Stemmer()
# stemmer.stem('کتابها')
# 'کتاب'

# Lemmatizer
lemmatizer = hazm.Lemmatizer()
# lemmatizer.lemmatize('میروم')
# 'رفت#رو'

# Tagger
# tagger = hazm.POSTagger(model='resources/postagger.model')
# tagger.tag(hazm.word_tokenize('ما بسیار کتاب میخوانیم'))
# [('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('میخوانیم', 'V')]

# Chunker
# chunker = hazm.Chunker(model='resources/chunker.model')
# tagged = tagger.tag(word_tokenize('کتاب خواندن را دوست داریم'))
# tree2brackets(chunker.parse(tagged))
# '[کتاب خواندن NP] [را POSTP] [دوست داریم VP]'
# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()
lexicon_file_name = 'final_lexi'
data_path = './data/'
lexicon = None

# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
    text = hz.sent_tokenize(normalizer.normalize(text))
from __future__ import unicode_literals
import hazm as Hazm
import sys
from StopWords import stop_words
import re
import json
from wordfreq import zipf_frequency

if len(sys.argv) < 2:
    print('error')
    sys.exit()

raw_text = str(sys.argv[1])
normalizer_instance = Hazm.Normalizer()
lemmatizer_instance = Hazm.Lemmatizer()
stem_finder_instance = Hazm.Stemmer()
remove_non_persian_regex = re.compile('[^آ-ی]')
raw_text = remove_non_persian_regex.sub(
    ' ', raw_text)  # replace all non-Persian characters with spaces
normalized_text = normalizer_instance.normalize(raw_text)
sentences = Hazm.sent_tokenize(normalized_text)
result_tokens = list()
less_accurate_tokens = list()


def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
def __init__(self):
    self.lemmatizer = hazm.Lemmatizer()