def score(self, sentences):
    """Predict the average sentiment of a piece of text."""
    pos, neg, neu = 0, 0, 0
    total_words = 0
    stemmer = Stemmer()
    classifier = self.__get_model()
    normalizer = Normalizer()
    sentences = sent_tokenize(sentences)
    for sentence in sentences:
        sentence = normalizer.normalize(sentence)
        words = word_tokenize(sentence)
        total_words += len(words)
        for word in words:
            word = stemmer.stem(word)  # classify the stem, not the raw token
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'neg':
                neg += 1
            elif class_result == 'pos':
                pos += 1
            elif class_result == 'neu':
                neu += 1
    # normalize by all classified words, not just the last sentence
    positive_sentiment = float(pos) / total_words
    # print('Positive: ' + str(positive_sentiment))
    neutral_sentiment = float(neu) / total_words
    # print('Neutral: ' + str(neutral_sentiment))
    negative_sentiment = -float(neg) / total_words
    # print('Negative: ' + str(negative_sentiment))
    total_sentiment = (positive_sentiment + negative_sentiment) / 2
    # print('Total (Avg): ' + str(total_sentiment))
    return total_sentiment

def get_stemmer(self, document):
    '''Return (word, stem) pairs for the words of a document.'''
    content = self.clear_document(document)
    result = self.split_document(content)
    stemmer = Stemmer()
    word_stems = [(item, stemmer.stem(item)) for item in result]
    return word_stems

from hazm import Stemmer, Lemmatizer


def stemming_and_lemmatization(token):
    # stem first, then lemmatize the stemmed form
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    stemmed = stemmer.stem(token)
    lemmatized = lemmatizer.lemmatize(stemmed)
    return lemmatized

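# Hypothetical usage sketch for stemming_and_lemmatization (not part of the original
# snippet); the exact stem/lemma strings depend on the installed hazm version.
print(stemming_and_lemmatization('کتاب‌ها'))   # e.g. 'کتاب'
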
import re
import string

from hazm import Normalizer, Stemmer, word_tokenize


class PersianTextPreProcessor:

    def __init__(self):
        self.stemmer = Stemmer()
        self.normalizer = Normalizer()
        self.punctuations = string.punctuation

    def process_single_word(self, word):
        word = word.lower()
        word = re.sub(r'\d+', '', word)
        word = word.translate(
            str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        # keep only Persian letters and whitespace
        word = ' '.join(
            re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', word).split())
        word = word.strip()
        word = self.normalizer.normalize(word)
        word = self.stemmer.stem(word)
        return word

    def pre_stopword_process(self, text):
        # text = self.persian_text_cleaner.get_sentences(text)
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(
            str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        # keep only Persian letters and whitespace
        text = ' '.join(
            re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', text).split())
        text = text.strip()
        normalized_text = self.normalizer.normalize(text)
        words = word_tokenize(normalized_text)
        words = [w for w in words if w != '.']
        return words

    def clean_text(self, text, stopwords, remove_stopwords=True, stem=True):
        words = self.pre_stopword_process(text)
        if remove_stopwords:
            words = [w for w in words if w not in stopwords]
        if stem:
            words = [self.stemmer.stem(w) for w in words]
        return words

    def stem(self, words):
        words = [self.stemmer.stem(w) for w in words]
        return words

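# Hypothetical usage sketch for PersianTextPreProcessor (not from the original project);
# the stopword set here is a stand-in for whatever list your pipeline actually uses.
pre = PersianTextPreProcessor()
sample_stopwords = {'و', 'در', 'به'}   # assumed stopwords, for illustration only
tokens = pre.clean_text('کتاب‌ها در قفسه هستند.', sample_stopwords)
print(tokens)                          # stemmed tokens with stopwords removed
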
from hazm import Stemmer


def stem(target_string):
    """Stem every whitespace-separated word and return the rebuilt string."""
    stemmed_string = ""
    stemmer = Stemmer()
    for single_word in target_string.split():
        stemmed_string += stemmer.stem(single_word) + " "
    return stemmed_string

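# Hypothetical call sketch for stem() (not in the original snippet); note that the
# concatenation above leaves a trailing space, and exact stems depend on the hazm version.
print(stem('کتاب‌ها و دفترها'))   # e.g. 'کتاب و دفتر '
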
def stem(self):
    """
    Stem every tokenized sentence in self.words.

    :return: list of lists of stemmed words
    """
    stemmer = Stemmer()
    for words in self.words:
        temp = []
        for word in words:
            temp.append(stemmer.stem(str(word)))
        self.stem_words.append(temp)
    return self.stem_words

import re

from hazm import Lemmatizer, Normalizer, Stemmer


def preprocess(doc):
    # `stopwords` is expected to be an iterable of Persian stopwords defined elsewhere
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized

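# One way to supply the `stopwords` global that preprocess() expects (illustrative only;
# a real pipeline would load a full Persian stopword list, e.g. hazm's).
stopwords = {'در', 'و', 'از', 'به'}
print(preprocess('کتاب‌ها در قفسه هستند'))
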
from hazm import Stemmer


def doc_stemmer(doc):
    """Stem a corpus structured as documents -> sentences -> tokens."""
    stemmer = Stemmer()
    stem_doc_list = []
    for document in doc:
        stem_doc_list.append(
            [[stemmer.stem(token) for token in sentence] for sentence in document])
    return stem_doc_list

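# Shape sketch for doc_stemmer's input (illustrative data, not from the original code):
# a corpus is a list of documents, each document a list of sentences, each sentence a
# list of tokens.
corpus = [
    [['کتاب‌ها', 'را', 'خواندم'], ['رفته', 'بودم']],   # document 0: two sentences
    [['می‌روم']],                                       # document 1: one sentence
]
stemmed_corpus = doc_stemmer(corpus)   # same nesting, with every token stemmed
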
def TextCleaner(self):
    self.stopwordsList = ''
    Data = self.imported_data
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    dataList = Data
    table = str.maketrans('', '', punctuation)
    for i in range(0, len(dataList)):
        # stem and lemmatize every token of the i-th document
        for j in range(0, len(dataList[i][0])):
            dataList[i][0][j] = stemmer.stem(dataList[i][0][j])
            dataList[i][0][j] = lemmatizer.lemmatize(dataList[i][0][j])
        # keep alphabetic tokens, strip punctuation, drop very short words
        dataList[i][0] = [word for word in dataList[i][0] if word.isalpha()]
        dataList[i][0] = [w.translate(table) for w in dataList[i][0]]
        dataList[i][0] = [word for word in dataList[i][0] if len(word) > 3]
    self.imported_data = dataList
    return self.imported_data

def perform_word_stemming(data_dict):
    from hazm import Stemmer
    stemmer = Stemmer()

    return_value = {}
    for folder_name in data_dict.keys():
        return_value[folder_name] = {}
        for file_name in data_dict[folder_name].keys():
            this_files_words = []
            for sent_text in data_dict[folder_name][file_name]:
                this_sentences_words = []
                for word in sent_text:
                    lemma_word = stemmer.stem(word)
                    this_sentences_words.append(lemma_word)
                this_files_words.append(this_sentences_words)
            return_value[folder_name][file_name] = this_files_words
    return return_value

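# Expected input shape for perform_word_stemming (illustrative, inferred from the loops
# above): folder -> file -> list of tokenized sentences.
sample_dict = {
    'sport': {
        'doc1.txt': [['کتاب‌ها', 'را', 'خواندم'], ['رفتم']],
    },
}
stemmed = perform_word_stemming(sample_dict)   # same structure, tokens replaced by stems
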
class HazmTokenizer(Component):
    defaults = {"stemmer": True, "lemmatizer": True, "pos": False}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        if self.component_config.stemmer:
            self._stemmer = Stemmer()
        if self.component_config.lemmatizer:
            self._lemmatizer = Lemmatizer()
        if self.component_config.pos:
            self._pos_tagger = POSTagger(model='resources/postagger.model')

    def required_packages(self) -> List[Text]:
        return ['hazm']

    def process(self, message: Message, **kwargs: Any) -> None:
        text = message.text
        for sentence_str in sent_tokenize(text):
            sentence = Sentence(sentence_str)
            tokens = word_tokenize(sentence_str)
            pos_tags = []
            if self.component_config.pos:
                pos_tags = self._pos_tagger.tag(tokens)
            for idx, token_str in enumerate(tokens):
                token = Token(text=token_str)
                if self.component_config.stemmer:
                    token[TOKEN_ATTRIBUTE_STEM] = self._stemmer.stem(token_str)
                if self.component_config.lemmatizer:
                    token[TOKEN_ATTRIBUTE_LEMM] = self._lemmatizer.lemmatize(token_str)
                if self.component_config.pos:
                    token[TOKEN_ATTRIBUTE_POS] = pos_tags[idx][1]
                sentence.add_token(token)
            message.add_sentence(sentence)

    # row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()])
    # print(labels)
    # print(confusion_matrix)
    return precision


if __name__ == '__main__':
    rd = HamshahriReader(config.corpora_root)
    counter = Counter()
    docs = []
    normalizer = Normalizer()
    stemmer = Stemmer()
    for doc in rd.docs(count=config.documents_count):
        doc['text'] = normalizer.normalize(doc['text'])
        doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])]
        counter.update([doc['cat']])
        docs.append(doc)
    print(counter)

    all_words = []
    for doc in docs:
        all_words.extend(doc['words'])
    dist = nltk.FreqDist(word for word in all_words)
    word_features = dimension_reduction(all_words, dist)
    print(len(word_features) / float(len(all_words)) * 100.0)

    features_set = [(doc_features(doc, word_features), doc['cat']) for doc in docs]
    # train_set, test_set = features_set[:len(docs)//2], features_set[len(docs)//2:len(docs)]

def stemming(self, tokens):
    stemmer = Stemmer()
    return [stemmer.stem(token) for token in tokens]

    return np.nan


# Farsi stemmer and stopwords from hazm
stemmer = Stemmer()
stopwords = stopwords_list()

print("Processing words...")
# remove stopwords and stem words in the Wikipedia corpus
with open('datasets/wiki.txt', 'r') as f:
    wiki = f.readlines()
words = [w.split(' ') for w in wiki]
words = [item for sublist in words for item in sublist]
words = np.unique(words)
words = np.fromiter((stemmer.stem(xi) for xi in words if xi not in stopwords), words.dtype)

# fit the count vectorizer on the Wikipedia corpus
count_vect = CountVectorizer(ngram_range=(1, 2))
count_vect.fit(words)

print("Processing documents...")
web = pd.read_csv('datasets/web.csv', header=None)
web = web.drop(columns=[0, 1, 2, 3, 4, 5, 6, 8, 12, 13, 11])

# create a bag of words for each entry, removing stopwords and stemming
web['bag_of_words'] = web[7].apply(try_join)
web = web[~web['bag_of_words'].isna()]

class POS():

    def __init__(self, inFile, outFile):
        self.inFile = inFile
        self.outFile = outFile
        self.normalizer = Normalizer()
        self.tagger = POSTagger(model='resources/postagger.model')
        self.lemmatizer = Lemmatizer()
        self.stemmer = Stemmer()

    def posTaggerTXT(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:20s} {:20s} {:20s} {:20s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(t)
                    line = f.readline()

    def posTaggerHTML(self):
        with open(self.outFile, 'w', encoding="utf8") as o:
            with open(self.inFile, 'r', encoding="utf8") as f:
                o.write(self.preHTML())
                line = f.readline()
                while line:
                    line = line.strip()
                    line = self.normalizer.normalize(line)
                    tags = self.tagger.tag(word_tokenize(line))
                    for li in tags:
                        t = '{:s} -//- {:s} -//- {:s} -//- {:s}\n'.format(
                            li[0], self.nameTag(li[1]),
                            self.lemmatizer.lemmatize(li[0]),
                            self.stemmer.stem(li[0]))
                        o.write(self.divHTML(self.colorTag(li[1]), t))
                        o.write("\n")
                    line = f.readline()
                o.write(self.posHTML())

    def nameTag(self, tag):
        if tag == "V":
            return "فعل"
        elif tag == "N":
            return "اسم"
        elif tag == "ADV":
            return "قید"
        elif tag == "PRO":
            return "ضمیر"
        elif tag == "PUNC":
            return "نشانه نگارشی"
        elif tag == "Ne":
            return "غیر قابل تشخیص"
        elif tag == "NUM":
            return "عدد"
        elif tag == "CONJ":
            return "حرف ربط"
        elif tag == "POSTP":
            return "نشانه مفعولی"
        elif tag == "P":
            return "حرف اضافه"
        elif tag == "AJ":
            return "صفت"
        elif tag == "DET":
            return "ضمیر اشاره"
        else:
            return tag

    def colorTag(self, tag):
        if tag == "V":
            return "red"
        elif tag == "N":
            return "hotpink"
        elif tag == "ADV":
            return "blue"
        elif tag == "PRO":
            return "gold"
        elif tag == "PUNC":
            return "lightblue"
        elif tag == "Ne":
            return "darkgray"
        elif tag == "NUM":
            return "white"
        elif tag == "CONJ":
            return "lightgreen"
        elif tag == "POSTP":
            return "white"
        elif tag == "P":
            return "aqua"
        elif tag == "AJ":
            return "teal"
        elif tag == "DET":
            return "slateblue"
        else:
            return "white"

    def preHTML(self):
        return """<!DOCTYPE html>
<head>
<meta charset="UTF-8">
</head>
<body>
"""

    def posHTML(self):
        return """
</body>
</html>"""

    def divHTML(self, color, text):
        return """
<div style="background-color:""" + color + """">
""" + """<h4>""" + text + """</h4>
""" + """</div>
"""

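# Hypothetical driver for the POS class above (not part of the original snippet); assumes
# hazm's pretrained tagger model is available at resources/postagger.model and that the
# input file exists.
pos = POS('input.txt', 'tagged.html')
pos.posTaggerHTML()   # writes one colored <div> per tagged token to tagged.html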