def stem(self):
    """
    Stem every token in ``self.words`` using parsivar's FindStems.

    :return: list of stemmed word lists, one per tokenized sentence
    """
    stemmer = FindStems()
    for words in self.words:
        temp = []
        for word in words:
            temp.append(stemmer.convert_to_stem(str(word)))
        self.stem_words.append(temp)
    return self.stem_words
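# Quick usage sketch (not part of the original file): FindStems returns the stem as a
# plain string; for verbs it can return several stems joined with '&'. Exact outputs
# depend on the installed parsivar version.
from parsivar import FindStems

_stemmer = FindStems()
print(_stemmer.convert_to_stem("کتاب‌ها"))    # noun: expect the singular stem
print(_stemmer.convert_to_stem("می‌گفتند"))   # verb: may come back as "past&present"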
from parsivar import FindStems
import pickle
import re

stopwords = ["و", "از", "در", "برای", "چون"]
DEBUG_MODE = False

my_stemmer = FindStems()

# Load the posting list built by the indexing step.
postlist = {}
with open('InformationRetrieval/objs.pkl', 'rb') as f:
    postlist = pickle.load(f)


def search_term(term):
    """Return the document keys that contain `term`, or False if the term is unindexed."""
    # if term in STOP_WORDS:
    #     term = "$$$"
    if term in postlist.keys():
        return postlist[term].keys()
    return False


def search_phrase(phrase):
    """`phrase` is a list of words; returns a set of matching documents."""
    answers = []
    for word in phrase:
        if word not in postlist.keys():
            return False
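# The phrase search above is cut off. A minimal sketch of one way to finish it,
# assuming postlist[term] maps each document id to a sorted list of token positions
# (this structure is an assumption, not confirmed by the snippet): intersect the
# documents of all words, then keep documents where the words occur at consecutive
# positions.
def search_phrase_sketch(phrase):
    docs = set(postlist[phrase[0]].keys())
    for word in phrase[1:]:
        docs &= set(postlist[word].keys())
    answers = set()
    for doc in docs:
        for pos in postlist[phrase[0]][doc]:
            # the i-th word of the phrase must occur at offset i from the first word
            if all(pos + i in postlist[phrase[i]][doc] for i in range(1, len(phrase))):
                answers.add(doc)
                break
    return answers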
from random import shuffle
from parsivar import Normalizer
from parsivar import SpellCheck
from parsivar import FindStems
from parsivar import Tokenizer
from hazm import Lemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

datelist = []
my_tokenizer = Tokenizer()
lemmatizer = Lemmatizer()
my_stemmer = FindStems()
myspell_checker = SpellCheck()
my_normalizer = Normalizer()

with open("stopwords1.txt", "r") as file:
    f = file.read()
StopWords = f.split("\n")

vectorizer = CountVectorizer()


def CleanText(InputText):
    # Tokenize the normalized text, then walk backwards so deletions do not shift
    # the indices still to be visited.
    WordsList = my_tokenizer.tokenize_words(my_normalizer.normalize(InputText))
    for i in range(len(WordsList) - 1, -1, -1):
        if WordsList[i] in StopWords:
            del WordsList[i]
            continue  # original had `break`, which stopped after the first stop word
        WordsList[i] = lemmatizer.lemmatize(WordsList[i]).split("#")[-1]
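# The imports above point at a bag-of-words + TF-IDF + logistic-regression pipeline.
# A minimal sketch of how the cleaned texts could feed it, assuming CleanText is
# finished so that it returns the cleaned word list; `texts` and `labels` below are
# hypothetical placeholders, not from the original file.
def train_sketch(texts, labels):
    cleaned = [" ".join(CleanText(t)) for t in texts]
    counts = vectorizer.fit_transform(cleaned)
    tfidf = TfidfTransformer().fit_transform(counts)
    X_train, X_test, y_train, y_test = train_test_split(tfidf, labels, test_size=0.2)
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    return clf.score(X_test, y_test)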
class PreParation:
    _tokenizer = Tokenizer()
    sw = None  # Stop words
    correction = None  # Correction collection, e.g. ازتمیخوام -> ازت میخوام
    _normalizer = None  # Normalizer, e.g. ي --> ی
    _normalizer1 = None  # Normalizer (hazm)
    _tokenizer1 = None  # Hazm tokenizer
    _tokenizer2 = None  # Hazm tokenizer
    _stemmer = None  # Stemmer, e.g. گفت --> گو
    extraChar1 = ["؛", "؟", ",", ";", "!", "?", ".", ":", "،"]
    extraChar2 = [
        "'", '"', "+", "{", "}", "-", "(", ")", "$", "#", '/', "\\", "@", "*",
        "٪", "÷", "¿", "[", "]", "«", "»", "^", "`", "|", "¡", "˘", "¤", "£",
        "<", ">", "¯", "°", "٭", "٫"
    ]
    _emojiList = None

    # Regular expressions
    persianmixRE = None
    numRE = None
    removeIrritateRE = None
    emojiRE = None

    # Embedding
    w2vModel = None

    # The constructor performs the initial setup.
    def __init__(self):
        self.sw = pd.read_csv("data/stop_words/stpwrd.csv")
        # Pick the stop-word column from the file and keep it as a list of strings.
        self.sw = self.sw["StopWord"].astype(str).values.tolist()
        self.correction = pd.read_csv("data/Vocab_dataset_1.csv",
                                      index_col=0,
                                      header=None,
                                      squeeze=True).to_dict()
        self._normalizer = Normalizer(statistical_space_correction=True,
                                      date_normalizing_needed=True)
        self._normalizer1 = HazmNormal()
        self._tokenizer1 = HazmTokenizer(join_verb_parts=False,
                                         replace_hashtags=True,
                                         replace_numbers=True,
                                         separate_emoji=True)
        self._tokenizer2 = HazmTokenizer(join_verb_parts=False,
                                         replace_hashtags=True,
                                         replace_numbers=False,
                                         separate_emoji=True)
        self._stemmer = FindStems()

        # region Regular expressions
        # Splits words and digits that are glued together.
        self.persianmixRE = re.compile(
            "(([\u0600-\u06FF]+)([0-9]+)|([0-9]+)([\u0600-\u06FF]+)|([a-zA-Z]+)([0-9]+)|([0-9]+)([a-zA-Z]+))"
        )
        # Used to detect whether the text contains a digit.
        self.numRE = re.compile(r'\d')
        self.removeIrritateRE = re.compile(r'(.)\1{2,}', re.IGNORECASE)
        # Emoji
        self.emojiRE = re.compile(
            pattern="["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+",
            flags=re.UNICODE)
        # endregion

        self._emojiList = list(emoji.UNICODE_EMOJI.keys())
        # self.w2vModel = Word2Vec.load('P:/pkl/newSentence.bin')
        print("\n ** Persian Text PreParation by Naghme Jamali ** \n")

    # Collapse multiple spaces into one - "Salam  khobi?" -> "Salam khobi?"
    def RemoveMultipleSpace(self, txt):
        return re.sub(' +', ' ', txt)

    # Replace emoji characters with the token EMOJI
    def RemoveEmoji(self, txt):
        for em in self._emojiList:
            txt = txt.replace(em, ' EMOJI ')
        return txt
        # return self.RemoveMultipleSpace(self.emojiRE.sub(r' ', txt))

    def RemoveExtraChar1(self, txt):
        for i in self.extraChar1:
            txt = txt.replace(i, " ")
        return txt

    def RemoveExtraChar2(self, txt):
        for i in self.extraChar2:
            txt = txt.replace(i, " ")
        return txt

    # Convert Persian and Arabic digits to ASCII digits
    def NumberEN(self, input):
        return input.replace("۰", "0").replace("۱", "1").replace(
            "۲", "2").replace("۳", "3").replace("۴", "4").replace(
                "۵", "5").replace("٥", "5").replace("۶", "6").replace(
                    "٧", "7").replace("۷", "7").replace("۸", "8").replace("۹", "9")

    # Remove Persian stop words
    def stop_word(self, data):
        text = self.RemoveMultipleSpace(' '.join(
            [word for word in data.split() if word not in self.sw]))
        return text.strip()
        # if text != " " and text != "":
        #     return ' '.join([word for word in text.split() if word not in self.sw]).strip()
        # else:
        #     return ''

    # Separate digits and letters that are glued together
    def splitnumber(self, txt):
        if self.numRE.search(txt) is not None:
            res = self.persianmixRE.match(txt).groups()
            return " ".join([word for word in res[1:] if word is not None])
        return txt

    # Return the stem of a word.
    # If the stemmer returns several stems we keep the first one;
    # if no stem is found, the word itself comes back unchanged.
    def Stem(self, txt):
        _txt = self._stemmer.convert_to_stem(txt).split('&')
        return _txt[0]

    # Remove repeated letters from the text,
    # e.g. خوووووبی becomes خوبی
    def removeIrritate(self, txt):
        return self.removeIrritateRE.sub(r'\1', txt)

    def CorrectionText(self, texts):
        _texts = []
        for _text in self.wordToken(texts):
            if _text in self.correction:
                _texts.append(self.correction[_text])
            else:
                _texts.append(_text)
        return ' '.join(_texts)

    # Clean the texts:
    # normalize the data and drop extra characters and punctuation.
    # A flag on the input controls whether stop words are removed as well.
    def cleanText(self, txt, stopword=False, isSplitNumber=True):
        # txt = txt.replace("\u200c", " ")
        txt = txt.replace("آ", "ا")
        if stopword:  # should stop words be removed?
            txt = self.stop_word(txt)
        txt = self.removeIrritate(txt)  # drop repeated characters
        txt = self.RemoveEmoji(txt)
        txt = self._normalizer1.normalize(txt)
        txt = self.RemoveExtraChar2(txt)
        txt = self.RemoveMultipleSpace(txt)
        txt = self.NumberEN(txt)
        txt1 = []
        for t in self.wordToken(txt):
            if isSplitNumber:
                try:
                    t = self.splitnumber(t)  # separate digits from letters inside a token
                except:
                    pass
            for _t in t.split():
                w1 = self.Stem(_t)
                txt1.append(w1)
        if stopword:
            return self.stop_word(' '.join(txt1)).strip()
        return (' '.join(txt1)).strip()

    # Second pass of removing marks and symbols.
    # The reason for all of these word and symbol replacements is that the existing
    # libraries do not react to such marks and misspellings, so this step is used to
    # correct the words and restore their proper form.
    def cleanText2(self, txt, stopword=False, isTokenize=False):
        _txt = txt
        if stopword:
            _txt = self.stop_word(txt)
        # Here the half-space (ZWNJ) produced by parsivar and hazm would be turned back into a space.
        # _txt = _txt.replace("\u200c", " ")
        _txt = ' '.join(self.wordToken(_txt, replaceNumber=True))
        _txt = self.CorrectionText(_txt)
        _txt = self._normalizer1.normalize(_txt)
        _txt = self.NumberEN(_txt)
        if stopword:
            _txt = self.stop_word(_txt).strip()
        return _txt

    # Sentence-level tokenization
    def token(self, txt):
        _sents = self._tokenizer.tokenize_sentences(txt)
        sents = []
        for _txt in _sents:
            _txt = self.RemoveExtraChar1(_txt)
            _txt = self.RemoveMultipleSpace(_txt).strip()
            sents.append(_txt)
        return sents

    # Replace numbers, e.g. "5 عدد" --> "NUM1 عدد".
    # Hazm cannot recognise some numbers, so this function covers them.
    def ReplaceNumber(self, input1):
        if input1.isnumeric():
            return "NUM" + str(len(input1))
        return input1

    def wordToken(self, txt, replaceNumber=False, removeExtra=False, stopword=False):
        if removeExtra:
            txt = self.RemoveExtraChar1(txt)
            txt = self.RemoveMultipleSpace(txt).strip()
        _words = []
        if replaceNumber:
            _words.extend(
                self.ReplaceNumber(x) for x in self._tokenizer1.tokenize(txt))
        else:
            _words.extend(self._tokenizer2.tokenize(txt))
        if stopword:
            return [word for word in _words if word not in self.sw]
        return _words
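# Quick usage sketch (not part of the original file). The constructor needs the CSV
# files referenced above ("data/stop_words/stpwrd.csv", "data/Vocab_dataset_1.csv")
# to exist, so treat this purely as an illustration of the intended call pattern.
prep = PreParation()
raw = "سلاممم خوبی؟ 5کتاب خریدم 😀"
print(prep.cleanText(raw, stopword=True))        # stemmed text, no emoji/punctuation
print(prep.wordToken(raw, replaceNumber=True))   # tokens with numbers mapped to NUM<len>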
def get_unique_words(text_path):
    '''
    Take a text corpus path as input and return the set of unique words it contains.

    Parameters
    ----------
    text_path : str
        Path to the directory holding the corpus (files must be in .txt format).

    Raises
    ------
    Exception
        If the spell-check resources of parsivar are not found.

    Returns
    -------
    words_set : set
        Set of unique words from the corpus.
    '''
    # Collect every sentence from every text file, dropping the prefix before '|'.
    text_names = glob(join_path(text_path, '*.txt'))
    sentences = []
    for name in text_names:
        with open(name, 'r') as text:
            text_sentences = tuple(text.readlines())
        for sentence in text_sentences:
            sentence = sentence.strip('\n')
            sentences.append(sentence[sentence.find('|') + 1:])

    # Punctuation signs and digits used to keep unwanted tokens out of the final set.
    signs = [
        '،', '«', '»', '.', ')', '(', '"', ':', ';', '%', '-', '?', ',', '؛',
        "'", '_'
    ]
    numbers = [f'{i}' for i in range(10)]

    # Build the set of all words in the corpus.
    try:
        spell = SpellCheck()
    except:
        raise Exception(
            'Please download spell.zip from https://www.dropbox.com/s/tlyvnzv1ha9y1kl/spell.zip?dl=0 and extract it to parsivar/resource.'
        )
    normal = Normalizer()
    token = Tokenizer()
    stemm = FindStems()
    words_set = set()
    print('\n Start to extract and clean words from sentences! \n')
    with progressbar.ProgressBar(max_value=len(sentences),
                                 redirect_stdout=True) as bar:
        for sentence, i in zip(sentences, range(len(sentences))):
            sentence = normal.normalize(spell.spell_corrector(sentence))
            sentence = sentence.replace(u'\u200c', ' ')
            words = token.tokenize_words(sentence)
            for word in words:
                word = stemm.convert_to_stem(word)
                if '&' in word:  # this pattern was found manually in the text
                    word = word[:word.find('&')]
                if word in signs:  # ignore signs
                    bar.update(i)
                    continue
                if any(let in numbers for let in word):  # ignore words containing digits
                    bar.update(i)
                    continue
                if len(word) <= 1:  # ignore strings of one letter or less
                    bar.update(i)
                    continue
                words_set.add(word)
                bar.update(i)
    return words_set
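# Usage sketch (not part of the original file): extract the vocabulary of a corpus
# directory and persist it. 'corpus_dir' and 'vocab.pkl' are hypothetical names.
import pickle

if __name__ == '__main__':
    vocab = get_unique_words('corpus_dir')
    with open('vocab.pkl', 'wb') as f:
        pickle.dump(vocab, f)
    print(f'{len(vocab)} unique words extracted.')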
import pickle
from parsivar import FindStems
from hazm import stopwords_list, Lemmatizer
import numpy as np

from constants import given_doc_root_path, document_root_path, limit_index
from LP_toolkits import normalizer

Lemmatizer = Lemmatizer()
Stemmer = FindStems()
stopwords = set(stopwords_list())


# Define the stemmer function.
def stemmer(email):
    """
    :param email: a string of email text
    :return: a string of the input in which every verb is replaced by its root
    """
    tokens = ''
    for word in email.split():
        token = Lemmatizer.lemmatize(word)
        if '#' in token:
            # hazm returns verb lemmas as "past#present"; keep the part that matches the word
            token = token.split('#')
            if word in token[0]:
                token = token[0]
            else:
                token = token[1]
        else:
            token = Stemmer.convert_to_stem(word)
            if '&' in token:
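                # The snippet is cut off here. A plausible continuation (an assumption,
                # not the author's verbatim code): parsivar joins alternative stems with
                # '&', so keep only the first one, then drop stop words and rebuild the text.
                token = token.split('&')[0]
        if token not in stopwords:
            tokens += token + ' '
    return tokens.strip()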
import re

from hazm import Lemmatizer
from parsivar import FindStems


def Evaluate_lemmatizer(inputs, labels, lib='hazm'):
    predicted_labels_with_pos = []
    predicted_labels_no_pos = []
    if lib == 'hazm':
        lemmatizer = Lemmatizer()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []
            for (word, pos) in sentence:
                if pos == 'ADJ':
                    pos = 'AJ'
                sent_labels_with_pos.append(lemmatizer.lemmatize(word, pos))
                sent_labels_no_pos.append(lemmatizer.lemmatize(word))
            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)
    elif lib == 'parsivar':
        stemmer = FindStems()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []
            for (word, pos) in sentence:
                sent_labels_with_pos.append(stemmer.convert_to_stem(word, pos))
                sent_labels_no_pos.append(stemmer.convert_to_stem(word))
            # For verbs, convert parsivar's '&' separator to hazm-style '#'.
            for i in range(len(sentence)):
                if sentence[i][1] == 'V':
                    sent_labels_with_pos[i] = re.sub(r"&", r"#", sent_labels_with_pos[i])
                    sent_labels_no_pos[i] = re.sub(r"&", r"#", sent_labels_no_pos[i])
            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)

    # Per-sentence precision: share of tokens whose predicted lemma equals the gold label.
    precisions_with_pos = []
    precisions_no_pos = []
    all_truly_labeled_with_pos = []
    for i in range(len(labels)):
        truly_labeled_with_pos = [
            predicted_labels_with_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        all_truly_labeled_with_pos.append(truly_labeled_with_pos)
        num_truly_labeled_with_pos = sum(truly_labeled_with_pos)
        truly_labeled_no_pos = [
            predicted_labels_no_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        num_truly_labeled_no_pos = sum(truly_labeled_no_pos)
        precision_with_pos = num_truly_labeled_with_pos / len(labels[i])
        precision_no_pos = num_truly_labeled_no_pos / len(labels[i])
        precisions_with_pos.append(precision_with_pos)
        precisions_no_pos.append(precision_no_pos)

    # Accuracy broken down by POS tag, plus the sets of correctly and incorrectly
    # lemmatized words per tag.
    per_pos = {}
    detailed_analyze = {}
    for i in range(len(inputs)):
        for j in range(len(inputs[i])):
            if inputs[i][j][1] not in per_pos.keys():
                per_pos[inputs[i][j][1]] = {'true': 0, 'false': 0}
            if all_truly_labeled_with_pos[i][j]:
                per_pos[inputs[i][j][1]]['true'] += 1
            else:
                per_pos[inputs[i][j][1]]['false'] += 1
            if inputs[i][j][1] not in detailed_analyze.keys():
                detailed_analyze[inputs[i][j][1]] = {'true': [], 'false': []}
            # detailed_analyze[inputs[i][j][1]]['gold'].append(labels[i][j])
            if all_truly_labeled_with_pos[i][j]:
                detailed_analyze[inputs[i][j][1]]['true'].append(inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['false'].append('NONE')
            else:
                detailed_analyze[inputs[i][j][1]]['false'].append(inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['true'].append('NONE')
    accuracy_per_pos = {
        k: v['true'] / (v['true'] + v['false'])
        for k, v in per_pos.items()
    }
    for k, v in detailed_analyze.items():
        v['true'] = set(v['true'])
        v['false'] = set(v['false'])
    precision_with_pos = sum(precisions_with_pos) / len(precisions_with_pos)
    precision_no_pos = sum(precisions_no_pos) / len(precisions_no_pos)
    return precision_with_pos, precision_no_pos, accuracy_per_pos, detailed_analyze
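# Usage sketch (not part of the original file): a single tagged sentence and its gold
# lemmas, just to show the expected input shape. The gold labels here are illustrative.
sample_inputs = [[("کتاب‌ها", "N"), ("خواندند", "V")]]
sample_labels = [["کتاب", "خواند#خوان"]]
p_with_pos, p_no_pos, acc_per_pos, details = Evaluate_lemmatizer(
    sample_inputs, sample_labels, lib='hazm')
print(p_with_pos, p_no_pos, acc_per_pos)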
from parsivar import Normalizer
from parsivar import Tokenizer
from parsivar import FindStems
from parsivar import POSTagger

my_normalizer = Normalizer()
my_tokenizer = Tokenizer()
my_stemmer = FindStems()
my_tagger = POSTagger(tagging_model="stanford")  # tagging_model = "wapiti" or "stanford"

tokens = []
ignore = ['،', '؟']

# Collect the unique tokens of the input file, skipping punctuation.
f = open("zaban-e_atash.txt", "r")
for line in f:
    tmp_txt = line
    words = my_tokenizer.tokenize_words(my_normalizer.normalize(tmp_txt))
    for word in words:
        if word not in tokens:
            if word not in ignore:
                tokens.append(word)
f.close()

# Print an HTML table with each token's stem and POS tag.
print("<table>")
print("<td>Stem</td><td>Token & POS</td>")
for token in tokens:
    stem = my_stemmer.convert_to_stem(token)
    token_pos = my_tagger.parse(my_tokenizer.tokenize_words(token))
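    # The original print of the table row is cut off above. A plausible continuation
    # (an assumption, not the author's verbatim code) would emit one row per token:
    print("<tr><td>{}</td><td>{}</td></tr>".format(stem, token_pos))
print("</table>")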