def __init__(self, inFile, outFile):
    self.inFile = inFile
    self.outFile = outFile
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.lemmatizer = Lemmatizer()
    self.stemmer = Stemmer()
def doc_normalizer(doc):
    normalized_doc_list = []
    normalizer = Normalizer()
    for i in range(len(doc)):
        normalized_doc_list.append(normalizer.normalize(doc[i]))
    return normalized_doc_list
def score(self, sentences):
    # Predict an overall sentiment score for the given text.
    pos, neg, neu = 0, 0, 0
    stemmer = Stemmer()
    classifier = self.__get_model()
    normalizer = Normalizer()
    sentences = sent_tokenize(sentences)
    total_words = 0
    for sentence in sentences:
        sentence = normalizer.normalize(sentence)
        words = word_tokenize(sentence)
        total_words += len(words)
        for word in words:
            # classify the stemmed token (the original discarded the stemmer's result)
            word = stemmer.stem(word)
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'neg':
                neg += 1
            if class_result == 'pos':
                pos += 1
            if class_result == 'neu':
                neu += 1
    # ratios are taken over all tokenized words, not just the last sentence
    positive_sentiment = float(pos) / total_words
    # print('Positive: ' + str(positive_sentiment))
    neutral_sentiment = float(neu) / total_words
    # print('Neutral: ' + str(neutral_sentiment))
    negative_sentiment = -float(neg) / total_words
    # print('Negative: ' + str(negative_sentiment))
    total_sentiment = (positive_sentiment + negative_sentiment) / 2
    # print('Total (Avg): ' + str(total_sentiment))
    return total_sentiment
def prepare():
    normalizer = Normalizer()
    stemmer = Stemmer()
    string = '''ویکی پدیای انگلیسی در تاریخ ۱۵ ژانویه ۲۰۰۱ (۲۶ دی ۱۳۷۹) به صورت مکملی برای دانشنامهٔ تخصصی نیوپدیا نوشته شد. بنیان گذاران آن «جیمی ویلز» و «لری سنگر» هستند. هم اکنون بنیاد غیرانتفاعی ویکی مدیا پروژهٔ ویکی پدیا را پشتیبانی می کند. میزبان های اینترنتی اصلی این وبگاه در شهر تامپای فلوریدا هستند؟ همچنین میزبان های اضافی دیگری هم در شهرهای آمستردام، شیراز و سئول به این وبگاه یاری می رسانند؟'''
    tokenizer = WordTokenizer(join_verb_parts=True, separate_emoji=True,
                              replace_links=True, replace_IDs=True,
                              replace_emails=True, replace_numbers=True,
                              replace_hashtags=True)
    labels = {'،': 'COMMA', '.': 'DOT', '؟': 'QMARK'}
    normal_string = normalizer.normalize(string)
    for label in labels.keys():
        print(normal_string.find(label))
    # build one training record per sentence (hazm's sent_tokenize is assumed to be imported)
    for i, sent in enumerate(sent_tokenize(normal_string)):
        entities = []
        # entity spans would go here in (start, end, label) form, e.g. (10, 15, 'PrdName')
        for label in labels.keys():
            print(f'{label} in {i}', label in sent)
        record = (sent, {'entities': entities})
        print(record)
def prepare_line(line):
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = CorrectCodings.loadCodings("TableCodings.txt")
    line = normalizer.normalize(line)
    line = CorrectCodings.CorrectCodingInLine(line, incorrect, correct)
    # remove URLs
    pat = re.compile(r"https?(.)*[^\s]+")
    line = re.sub(pat, r" ", line)
    # turn literal "\n" sequences into real newlines
    pat = re.compile(r"\\n")
    line = re.sub(pat, "\n", line)
    # put spaces around non-alphanumeric characters
    pat = re.compile(r"([^\sا-ی۰-۹a-zA-Z\d])")
    line = re.sub(pat, r" \1 ", line)
    # put spaces around punctuation marks and runs of digits
    pat = re.compile(r"([" + re.escape(punct_str) + "])")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r"([" + digits + "]+)")
    line = re.sub(pat, r" \1 ", line)
    # collapse repeated spaces, newlines and other whitespace
    pat = re.compile(r" +")
    line = re.sub(pat, r" ", line)
    pat = re.compile(r"\n+")
    line = re.sub(pat, r" \n ", line)
    pat = re.compile("[" + whitespace_chars + "]+")
    line = re.sub(pat, r" ", line)
    line = line.strip()
    return line
def handle(self, *args, **options):
    articles = Article.objects.filter(is_vectorized=False)
    N = Normalizer()
    FT = fasttext.load_model(options['path'])
    index = 0
    for article in articles:
        try:
            if index % 100 == 0:
                print(index)
            text = N.normalize(article.text)
            text = text.translate(str.maketrans('', '', punctuation))
            text = text.split()
            text = [word for word in text if len(word) > 2]
            vector = nan_to_num(mean([FT.get_word_vector(w) for w in text], axis=0))
            vector = vector / (vector.dot(vector)) ** 0.5
            obj = ArticleVector(article=article, embedding=vector.tolist())
            obj.save()
            article.is_vectorized = True
            article.save()
            index += 1
        except Exception as e:
            print(e)
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If the `persian_normalize` attribute has been set to True, normalizes `text` with the Hazm Normalizer.
    If the `include_numbers` attribute has been set to False, removes all Persian, English and Arabic
    numbers from `text`.

    :param text: The text we want to process
    :return: a dictionary. Keys are words and values are the frequencies.
    """
    flags = (
        re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
        else 0)

    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)

    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
def normalize(self):
    """Normalize the raw data line by line and collect the non-empty results.

    :return: the list of normalized lines.
    """
    normalizer = Normalizer()
    for line in self.data.split('\n'):
        if line != "":
            self.normalize_text.append(normalizer.normalize(line))
    return self.normalize_text
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
def __init__(self):
    # Persian word normalizer
    self.normalizer = Normalizer()

    # load stopwords
    logger.info(f"Loading stopwords from {DATA_DIR / 'stopwords.txt'}")
    with open(DATA_DIR / 'stopwords.txt') as f:
        stop_words = f.readlines()
    stop_words = map(str.strip, stop_words)
    self.stop_words = set(map(self.normalizer.normalize, stop_words))
def __init__(self, path_dataset, path_stopwords):
    self.path_dataset = path_dataset
    self.path_stopwords = path_stopwords
    self.stopwords = self.file_reader(self.path_stopwords)
    self.normalizer = Normalizer()
def bigram_cleaner(text):
    text = re.sub(Text_cleaner.persian_regex, ' ', text)
    text = re.sub('[ ]+', ' ', text)
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    tokenized = word_tokenize(text)
    return tokenized
def tokenize(paragraph, wanted_list):
    normal = Normalizer(remove_extra_spaces=True,
                        punctuation_spacing=True,
                        persian_style=False,
                        persian_numbers=False,
                        remove_diacritics=False,
                        affix_spacing=False,
                        token_based=False)
    for sentence in sent_tokenize(normal.normalize(paragraph)):
        wanted_list.append(sentence)
def test_word_visualization(model_path, some_words):
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)
    vectors = [model[normalizer.normalize(word)] for word in some_words
               if normalizer.normalize(word) in model.vocab.keys()]
    # print(model[normalizer.normalize('فرهنگ')])
    # print(model.similarity('فرهنگ', 'تمدن'))
    # print(vectors)
    rd = W2VPersianVis(model_path, selected_words=some_words)
    rd.show_plot()
def preprocess(self, cm):
    cm = ''.join([c for c in str(cm) if c not in punctuation])
    cm = self._numbers_to_english(cm)
    cm = re.sub(r"[0-9]", '', cm)
    cm = cm.replace('\u200c', ' ').replace('\n', '').replace('\r', '') \
           .replace('ي', 'ی').replace('ك', 'ک')
    normalizer = Normalizer()
    cm = normalizer.normalize(cm)
    tokens = word_tokenize(cm)
    cm = ' '.join([x for x in tokens if x not in self.stopwords])
    return cm
def pipeline_sentence(sentence, model, tokenizer):
    sentence = change_words(sentence)
    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    sentence_lem = ' '.join([
        Lemmatizer().lemmatize(x)
        for x in word_tokenize(normalizer.normalize(sentence))
    ])
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
def preprocess(doc):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized
def entofa(bot, update):
    # map characters typed on an English keyboard layout to the corresponding Persian letters
    per = ["ﺽ", "ﺹ", "ﺙ", "ﻕ", "ﻑ", "ﻍ", "ﻉ", "ﻩ", "ﺥ", "ﺡ", "ﺝ", "چ", "ﺵ", "ﺱ", "ی", "ﺏ",
           "ﻝ", "ﺍ", "ﺕ", "ﻥ", "ﻡ", "ک", "گ", "ﻅ", "ﻁ", "ﺯ", "ﺭ", "ﺫ", "ﺩ", "پ", "ﻭ"]
    eng = ["q", "w", "e", "r", "t", "y", "u", "i", "o", "p", "[", "]", "a", "s", "d", "f",
           "g", "h", "j", "k", "l", ";", "'", "z", "x", "c", "v", "b", "n", "m", ","]
    s = update.message.text
    for i in range(len(per)):
        s = s.replace(eng[i], per[i])
    normalizer = Normalizer()
    s = normalizer.normalize(s)
    bot.sendMessage(update.message.chat_id, text=s)
def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
        normalized_statement = normalizer.normalize(dataset[i])
        # for sentence in sent_tokenize(dataset[i]):
        word_list = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(normalized_statement)
            if word not in stops
        ]
        statements.append(word_list)
    return statements
def document(filepath):
    f = open(filepath, 'r', encoding='utf-8', errors='ignore')
    txt = f.read()
    f.close()
    txt = remove_punctuation(txt)
    normalizer = Normalizer()
    txt = normalizer.normalize(txt)
    document = word_tokenize(txt)
    document = [word for word in document
                if word not in stop_words and not word.isdigit()]
    return document
class PersianTextPreProcessor:
    def __init__(self):
        self.stemmer = Stemmer()
        self.normalizer = Normalizer()
        self.punctuations = string.punctuation

    def process_single_word(self, word):
        word = word.lower()
        word = re.sub(r'\d+', '', word)
        word = word.translate(str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        word = ' '.join(re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', word).split())
        word = word.strip()
        word = self.normalizer.normalize(word)
        word = self.stemmer.stem(word)
        return word

    def pre_stopword_process(self, text):
        # text = self.persian_text_cleaner.get_sentences(text)
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans(self.punctuations, ' ' * len(self.punctuations)))
        text = ' '.join(re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', text).split())
        text = text.strip()
        normalized_text = self.normalizer.normalize(text)
        words = word_tokenize(normalized_text)
        words = [w for w in words if w != '.']
        return words

    def clean_text(self, text, stopwords, remove_stopwords=True, stem=True):
        words = self.pre_stopword_process(text)
        if remove_stopwords:
            words = [w for w in words if w not in stopwords]
        if stem:
            words = [self.stemmer.stem(w) for w in words]
        return words

    def stem(self, words):
        words = [self.stemmer.stem(w) for w in words]
        return words
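# A minimal usage sketch for the PersianTextPreProcessor class above. The sample text
# and the tiny stopword set are illustrative only; the original project presumably
# supplies its own stopword list.
preprocessor = PersianTextPreProcessor()
sample_stopwords = {'و', 'در', 'به'}
tokens = preprocessor.clean_text('کتاب‌ها را به کتابخانه بردند.', sample_stopwords,
                                 remove_stopwords=True, stem=True)
print(tokens)  # normalized, stemmed tokens with the sample stopwords filtered out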
class HazmNormalizer(Component):
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super().__init__(component_config)
        self._normalizer = Normalizer()

    def process(self, message: Message, **kwargs: Any) -> None:
        message.text = self._normalizer.normalize(message.text)

        exclude_items = {}
        if 'exclude_items' in kwargs:
            exclude_items = {x: x for x in kwargs['exclude_items']}

        for key, value in message:
            if key in exclude_items:
                continue
            if isinstance(value, str):
                message[key] = self._normalizer.normalize(value)
            elif isinstance(value, list):
                for idx, item_value in enumerate(value):
                    value[idx] = self._normalizer.normalize(item_value)
class PoemSentences(object):
    def __init__(self, poems_path):
        self.poems_path = poems_path
        self.normalizer = Normalizer()

    def __iter__(self):
        for poem_file in os.listdir(self.poems_path):
            for sentence in open(os.path.join(self.poems_path, poem_file)):
                yield word_tokenize(
                    self.normalizer.normalize(sentence.replace('هٔ', 'ه')))
def normalize_words(words: Iterable) -> List[str]:
    """
    This method gets an Iterable containing some Farsi words as elements,
    normalizes them using Hazm and then returns a list of normalized words.

    :param words: an iterable including words
    :return: A list of normalized elements of the `words` iterable.
    """
    combined_words: str = "".join(x + "\n" for x in words)
    normalizer: Normalizer = Normalizer()
    normalized_combined_words: str = normalizer.normalize(combined_words)
    return normalized_combined_words.split("\n")
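# Example call for normalize_words above. Joining the inputs with newlines lets a
# single Normalizer pass handle the whole batch, and splitting on "\n" afterwards
# restores one entry per input word. The sample words are illustrative; Hazm will,
# for instance, map the Arabic 'ي' and 'ك' characters to their Persian forms.
print(normalize_words(["كتاب", "علي"]))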
def process_text(text):
    normalize = Normalizer()
    text = normalize.normalize(text)
    text = text.replace("_", " ")
    text = text.replace(',', ' ')
    text = text.replace("\u220c", "")
    text = text.replace("\u200c", "")
    text = text.replace("-", "")
    # text = text.replace('/', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('.', ' ')
    text = text.replace("،", " ")
    text = text.replace("«", " ")
    text = text.replace("»", " ")
    # Convert the text string to a list of words: keep only Arabic-script word runs
    # (plus the <S>, </s>, ? and // markers) and drop everything else.
    t = re.findall(r"[\u0627-\u06FF]+|<S>|</s>|\?|//", text)
    lemma = Lemmatizer()
    text = [lemma.lemmatize(x) for x in t]
    return text
def clean(sentence):
    # trim leading digits
    ind = 0
    for i in range(len(sentence)):
        if sentence[i] in FARSI_DIGITS or sentence[i] in ENGLISH_DIGITS:
            ind += 1
        else:
            break
    sentence = sentence[ind:]

    # remove non-alphanumeric characters
    res = []
    for i in range(len(sentence)):
        if sentence[i] in FARSI_ALPHABET or sentence[i] in FARSI_DIGITS:
            res.append(sentence[i])
    sentence = "".join(res)

    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    return sentence
def prepare_line(line):
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = loadCodings("TableCodings.txt")
    line = normalizer.normalize(line)
    line = CorrectCodingInLine(line, incorrect, correct)
    pat = re.compile(r"([" + re.escape(punct_str) + "])")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r"([" + digits + "]+)")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r"\n+")
    line = re.sub(pat, r" \n ", line)
    pat = re.compile("[" + whitespace_chars + "]+")
    line = re.sub(pat, r" ", line)
    line = line.strip()
    return line
def clean_tweet(tweet):
    tweet = str(tweet)
    tweet = tweet.lower()
    # remove # so we preserve hashtags for the cloud
    tweet = tweet.replace("#", "")
    tweet = remove_links(tweet)
    tweet = remove_mentions(tweet)
    tweet = remove_emoji(tweet)
    tweet = remove_punctuations(tweet)
    tweet = remove_reserved_words(tweet)
    normalizer = Normalizer()
    tweet = normalizer.normalize(tweet)
    # replace Arabic ي with Persian ی
    tweet = tweet.replace('ي', 'ی')
    # remove verbs such as می‌شود or نمی‌گویند: می/نمی, an optional ZWNJ, then the rest
    # of the word (the original pattern had an empty character class, which re rejects)
    tweet = re.sub(r'ن?می[\u200c]?\S+', '', tweet)
    tokens = word_tokenize(tweet)
    tokens = [token for token in tokens if not token.isdigit()]
    tokens = [token for token in tokens if token not in stopwords.persian]
    tokens = [token for token in tokens if token not in stopwords.english]
    return " ".join(tokens).strip()
def prepare_text(text, should_stem=True):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    tokenized = word_tokenize(text)

    # strip punctuation marks
    def fix_word(w):
        # for c in Text_cleaner.punct_list:
        #     w = w.replace(c, '')
        w = re.sub(Text_cleaner.punct_regex, '', w).replace('،', '')
        return "$" if w == "" else w

    punc_free = list(filter(lambda x: x != '$', map(fix_word, tokenized)))
    stemmer = Stemmer()
    if should_stem:
        stemmed_list = list(filter(lambda x: x != '', map(stemmer.stem, punc_free)))
    else:
        stemmed_list = punc_free
    return stemmed_list
def clean_persianText(txt):
    normalizer = Normalizer()
    txt = normalizer.character_refinement(txt)
    txt = normalizer.affix_spacing(txt)
    txt = normalizer.punctuation_spacing(txt)
    txt = txt.replace('.', '')
    txt = normalizer.normalize(txt)
    return txt
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If the `persian_normalize` attribute has been set to True, normalizes `text` with the Hazm Normalizer.
    If the `include_numbers` attribute has been set to False, removes all Persian, English and Arabic
    numbers from `text`.

    Attention: this method will not remove stopwords from the input.

    :param text: The text we want to process
    :return: a dictionary. Keys are words and values are the frequencies.
    """
    flags = (
        re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
        else 0)

    if self.remove_unhandled_utf_characters:
        text = WordCloudFa.unhandled_characters_regex.sub(r'', text)

    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)

    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        # We remove stopwords inside WordCloudFa, so there is no need to pass them to this function.
        word_counts = unigrams_and_bigrams(words, [], self.normalize_plurals, self.collocation_threshold)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
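# Usage sketch, assuming the enclosing class is WordCloudFa from the wordcloud-fa
# package (an assumption based on the attributes referenced above); kept as comments
# because the constructor arguments are not shown in this snippet:
# wc = WordCloudFa()
# frequencies = wc.process_text('سلام دنیا سلام')
# # `frequencies` maps each token to its count, e.g. {'سلام': 2, 'دنیا': 1}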
class Summarizer(object):
    def __init__(self):
        self.normalizer = Normalizer()

    def summarize(self, input):
        self.input = self.normalizer.normalize(input)
        self.base_words = word_tokenize(self.input)
        self.working_sentences = sent_tokenize(self.input)
        self.sentences_number = len(self.working_sentences)
        return self._get_summarize(num_sentences=self._find_num_sentences())

    def _find_num_sentences(self):
        return (int(math.log(self.sentences_number) ** 2 + 1) + 1) \
            if self.sentences_number >= 6 else self.sentences_number
        # return int(self.sentences_number - 0.2 * self.sentences_number)

    def _get_summarize(self, num_sentences):
        # drop stopwords before counting frequencies
        words = [
            word for word in self.base_words
            if word not in stopwords.words('persian')
        ]
        word_frequencies = FreqDist(words)
        most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]
        actual_sentences = sent_tokenize(self.input)
        output_sentences = []
        for word in most_frequent_words:
            for i in range(0, len(self.working_sentences)):
                if (word in self.working_sentences[i]
                        and actual_sentences[i] not in output_sentences):
                    output_sentences.append(actual_sentences[i])
                    break
                if len(output_sentences) >= num_sentences:
                    break
            if len(output_sentences) >= num_sentences:
                break
        return self._reorder_sentences(output_sentences)

    def _reorder_sentences(self, output_sentences):
        output_sentences.sort(key=lambda s: self.input.find(s))
        return output_sentences
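# Usage sketch for the Summarizer class above, assuming a Persian stopword list has
# been registered with NLTK (NLTK does not ship one by default) and that FreqDist and
# the hazm helpers are imported as the class expects:
# summarizer = Summarizer()
# summary_sentences = summarizer.summarize(long_persian_text)
# print(' '.join(summary_sentences))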
def __init__(self, corpus_path='resources/corpus.json',
             symbols_json_path='resources/symbols.json',
             persian_lang_path='resources/persian_lang.json',
             postagger_model_path='resources/postagger.model',
             max_keyword_num=10, min_keyword_occurrences=0.01,
             expand_corpus=False):
    self.postagger_model_path = postagger_model_path
    self.symbols_json_path = symbols_json_path
    self.corpus_path = corpus_path
    self.corpus = {}
    self.docs_num = 0
    self.expand_corpus = expand_corpus
    if self.corpus_path is not None:
        with open(corpus_path, encoding='utf-8') as json_file:
            corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']
    with open(symbols_json_path, encoding='utf-8') as json_file:
        data = json.load(json_file)
        lst = list(data.values())
        self.all_symbols_list = [item for sublist in lst for item in sublist]
    with open(persian_lang_path, encoding='utf-8') as json_file:
        persian_lang = json.load(json_file)
        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']
    self.tagger = POSTagger(model=self.postagger_model_path)
    self.normalizer = Normalizer()
    self.max_keyword_num = max_keyword_num
    self.min_keyword_occurrences = min_keyword_occurrences
from __future__ import unicode_literals

from hazm import Normalizer
from hazm import sent_tokenize, word_tokenize
from hazm import Stemmer, Lemmatizer
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy as np

stemmer = Stemmer()
normalizer = Normalizer()

# ---------------- define variables ------------------------------------------------
num_features = 100
num_Of_epoch = 0
train_rate = 0.6
validate_rate = 0.1
sentences = []  # Initialize an empty list of sentences
mylabel = []    # labels for train sentences
# -----------------------------------------------------------------------------------


def train_test_seperator(data_path, label_path, train_rate=0.6, validate_rate=0.1):
    data_file = open(data_path, "r")
    label_file = open(label_path, "r")
    tmp_data = data_file.readlines()
    data_content = []
    for s in tmp_data:
        s = s.split("\n")
        s = s[0]
        s = s.split("\r")
        s = s[0]
        if s == "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@":
            continue
from __future__ import unicode_literals
import os, sys, codecs
from hazm import Normalizer, sent_tokenize, word_tokenize

reader = codecs.open(os.path.abspath(sys.argv[1]), 'r', encoding='utf-8')
writer = codecs.open(os.path.abspath(sys.argv[2]), 'w', encoding='utf-8')
count = 1
line = reader.readline()
normalizer = Normalizer()
while line:
    if count % 1000 == 0:
        sys.stdout.write(str(count) + '...')
    if line.strip():
        n = normalizer.normalize(line.strip())
        tok = word_tokenize(n)
        # join tokens, turn multi-part tokens back into spaced words and collapse double spaces
        sen = u' '.join(tok).replace('_', ' ').replace('  ', ' ').replace('  ', ' ')
        l = sen + u'\n'
        writer.write(l)
    else:
        writer.write(u'\n')
    count += 1
    line = reader.readline()
sys.stdout.write('\n')
writer.flush()
writer.close()
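# Invocation sketch for the script above (the file name here is hypothetical): the
# input corpus path is read from sys.argv[1] and the normalized, tokenized lines are
# written line by line to the path given in sys.argv[2], e.g.
#   python normalize_corpus.py raw_corpus.txt normalized_corpus.txt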
from __future__ import unicode_literals

from hazm import Normalizer
from hazm import sent_tokenize, word_tokenize
from hazm import Stemmer, Lemmatizer
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression

stemmer = Stemmer()
normalizer = Normalizer()

# ---------------- define variables ------------------------------------------------
num_features = 100
num_Of_epoch = 10
sentences = []  # Initialize an empty list of sentences
mylabel = []    # labels for train sentences
# -----------------------------------------------------------------------------------
sentence_path = '/home/bero/Desktop/dataset/Persian Product Review Dataset/totaldata'
label_path = '/home/bero/Desktop/dataset/Persian Product Review Dataset/totallabel'

file_to_read = open(label_path, 'r')
labels = file_to_read.readlines()
mylabel = []
for line in labels:
    tmp = line.split('\n')
    mylabel.append(int(tmp[0]))
file_to_read.close()

file_to_read = open(sentence_path, 'r')
file_content = file_to_read.readlines()
file_to_read.close()
def on_chat_message(self, msg): normalizer = Normalizer() #if msg.has_key(u'document'): #self.sender.downloadFile(msg[u'document'][u'file_id'], file_path="~/dl") m = msg['text'].split(' ') mr = msg['text'] fn = msg['from']['first_name'] chat_type = msg['chat']['type'] user_id = msg['from']['id'] r = '' if m[0] == u'/start': r = u'سلام به تو که اسمتو گذاشتی ' + unicode(fn) elif m[0] == u'mojose': r = msg if chat_type == 'private' and mr[:3] != u'هوی': mr = u'هوی ' + mr m = mr.split(' ') if user_id == 170378225: #global ddd = {index, keyOf dd } global h_id global d global ddd global q2 #get outputs from db if m[1] == u'g': #print 'g' try: q = Hoy.select(User, Hoy).join(Chat).join(User).where(Hoy.hoy.contains(': 0')).get() h_id = q.id q2 = User.select().join(Chat).join(Hoy).where(Hoy.id==h_id) d = ast.literal_eval(q.hoy) o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o except: r = 'چیزی برای تأیید نیست!' elif mr[4:6] == u'g\n': #print 'g2' mrc = mr[4:] mc = mrc.split('\n') user_input = mc[1] try: q = Hoy.select(User, Hoy).join(Chat).join(User).where(User.user == user_input).get() h_id = q.id q2 = User.select(Hoy, User).join(Chat).join(Hoy).where(Hoy.id==h_id) d = ast.literal_eval(q.hoy) o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o except: r = 'نبود که!' elif mr[4:7] == 'gg\n': #print 'gg' mrr = mr[7:].replace(u'؟', u'').replace(u'.', u'').replace(u'!', u'').replace(u'می ', u'می').replace(u'می', u'می') mrr = normalizer.normalize(mrr) #print 'normalized user input:', mrr mm = mrr.split(' ') rgx = u'' for w in mm: rgx += w+'|' if u'می' == w[:2] and u'' != w[2] and u' ' != w[2]: rgx += u'می'+w[2:]+u'|' if len(mm) < 3: rgx = u'(' + rgx[:-1] + u') ' else: rgx = u'(' + rgx[:-1] + u')? ' rgx = rgx * len(mm) rgx = rgx[:-1] #print 'regex:', rgx try: q = Chat.select(Chat, Hoy, User).join(User).switch(Chat).join(Hoy).where(User.user.regexp(rgx)).limit(10) #print 'records founded (max 10):', len(q) if len(q) == 0: #try to fuzzy string and rematch #print 'not found!' raise else: n = 0 #rd = {n: ratio} rd = {} while n < len(q): us = q[n].user.user #print 'string founded: ', us ratio = fuzz.ratio(us, mrr) #print ratio if ratio >= 50: rd[n] = ratio n += 1 #print rd ho = '' while len(ho) == 0: maxn = max(rd.values()) n = rd.keys()[rd.values().index(maxn)] hoo = q[n].hoy.hoy #print 'founded a dict for', n try: ho = ast.literal_eval(hoo) #print 'a valid dict:', ho user_input = q[n].user.user if 1 not in ho.values(): #print 'this dict haven\'t any valid item' raise except: #print 'deleting', rd[n] del rd[n] #print 'deleted!' ho = '' user_input = '' except: #print 'eee!' pass try: q = Hoy.select(User, Hoy).join(Chat).join(User).where(User.user == user_input).get() h_id = q.id q2 = User.select(Hoy, User).join(Chat).join(Hoy).where(Hoy.id==h_id) d = ast.literal_eval(q.hoy) o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o except: r = 'نبود که!' 
#review items elif m[1] == u'r': o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o #commit changes elif m[1] == u'c': d_i = d.items() for k, v in d_i: if v == 0: del d[k] Hoy.update(hoy=d).where(Hoy.id == h_id).execute() d = {} ddd = {} inputs = '' r = 'تغییرات ذخیره شد!' #change state of an item elif len(m) == 2: try: i = int(m[1]) if d[ddd[i]] == 0: d[ddd[i]] = 1 else: d[ddd[i]] = 0 r = ddd[i] + ' : ' + str(d[ddd[i]]) except: pass #if m[1] == 'grupoj': #TODO merge same outputs if '\n' in mr and u'\nبگو\n' in mr and r == '': mrc = normalizer.normalize(mr[4:]) mc = mrc.split('\n') say_index = mc.index(u'بگو') user_inputs = mc[:say_index] hoy_outputs = mc[say_index+1:] hoy_outputs = {k:0 for k in hoy_outputs} hoy_outputs_old = {} for user_input in user_inputs: try: H = (Hoy.select().join(Chat).join(User).where(User.user==user_input)) hoy_outputs_old = H[0].hoy h_id = H[0].id hoy_outputs_old = ast.literal_eval(hoy_outputs_old) del user_inputs[user_inputs.index(user_input)] except: pass if hoy_outputs_old == {}: h = Hoy.create(hoy=hoy_outputs) r = u'پاسخهای شما در صف بررسی قرار گرفت. تا ارباب چی بگن!' else: try: hoy_outputs.update(hoy_outputs_old) update_query = Hoy.update(hoy=hoy_outputs).where(Hoy.id==h_id) update_query.execute() h = Hoy.get(Hoy.id==h_id) r = u'پاسخهای شما نیز در صف بررسی قرار گرفت. تا ارباب چی بگن!' except Exception as e: pass #print e try: for user_input in user_inputs: u, created = User.get_or_create(user=user_input) if created: Chat.create(user=u, hoy=h) except Exception as e: pass #print e elif '\n' in mr and u'\nنفهم' in mr and r == '' and user_id == 170378225: mrc = mr[4:] mc = mrc.split('\n') say_index = mc.index(u'نفهم') user_input = mc[:say_index] try: dq = User.delete().where(User.user==user_input[0]) dq.execute() r = u'اطاعت! دیگر به چنین چیزی پاسخ نمیدهم.' #TODO delete u_id that not exist in User, from Chat except: r = u'چنین چیزی وجود ندارد!' 
elif m[0] == u'هوی': if re.search(u'تخم|کیر|کسخل|کون|کون|الاغ|الاق|جنده|گای|پستون|ممه|گوز|شاش|جیش|قبحه|جلق|جق|سگ|جاکش|گائ|گاتو|کیون|لاشی|گامو|فاک|ساک|کُس|کوس|کوص|کص|سکس|پورن|الکسیس|گاشو', mr) \ or re.search(u'(^| )رید(.|$)', mr) or u'خرم' in m or u'خری' in m or u'خره' in m or u'گا' in m or u'شق' in m or u'منی' in m or re.search(u'(^| )حشری(.|$)', mr): r = choice([u'بیادب :|', u'بیتربیت :|', u'بیشخصیت :|',u'عفت کلام داشته باش یه ذره :|', u'دهنتو آب بکش :|']) #elif m[1] == u'سلام' or m[1] == u'درود': #r = choice([u'سلام', u'علیک سلام']) elif len(m) >= 3 and m[1] == u'بگو': r = normalizer.normalize(mr[8:]) elif len(m) == 3: m2 = m[1]+' '+m[2] if m2 == u'چه خبر؟': response = urllib2.urlopen('http://www.farsnews.com/RSS') rss = response.read() soup = BeautifulSoup.BeautifulSoup(rss) all_title = soup.findAll('title') def get_link(nth): item = soup.findAll('item')[nth] link = re.search(r'http://www.farsnews.com/(\d+)',unicode(item)).group(0) return link r = unicode(all_title[2]).replace('<title>', '<a href="%s">'%get_link(0), 2).replace('</title>', '</a>') + '\n\n' + \ unicode(all_title[3]).replace('<title', '<a href="%s"'%get_link(1), 2).replace('</title>', '</a>') + '\n\n' + \ unicode(all_title[4]).replace('<title', '<a href="%s"'%get_link(2), 2).replace('</title>', '</a>') elif len(m) == 2: if m[1] == u'راهنما': r = u'• به این شکل هوی را آموزش دهید:\n\ \n\ سلام\n\ درود\n\ بگو\n\ علیک سلام\n\ سلام حاجی\n\ \n\ !> دقت کنید که در یک پیام و در خطهای جدا باشد.\n\ \n\ !> اگر در گروه آموزشش میدهید، ابتدا هوی بنویسید و سپس مثل بالا خطوط را وارد کنید. این دو شکل قابل قبول است:\n\ \n\ هوی سلام\n\ درود\n\ بگو\n\ علیک سلام\n\ سلام حاجی\n\ ---------\n\ هوی\n\ سلام\n\ درود\n\ بگو\n\ علیک سلام\n\ سلام حاجی\n\ \n\ • آموختهها پس از تأیید به نمایش در میآیند.\n\ \n\ !> آموختههایی که به اشخاص مربوط است و جنبهٔ عمومی ندارد، تأیید نمیشود.\n\ !> آموختههای شامل حرف بد، توهین و… تأیید نمیشود.\n\ !> آموختههای دارای اشتباه نوشتاری تأیید نمیشود.\n\ \n\ • اگر مثلاً «سلام» برای هوی تعریف شده باشد، میتواند اینگونه از پاسخهای «سلام» برای «هلو» هم استفاده کند:\n\ \n\ سلام\n\ هلو\n\ بگو\n\ علیک\n\ های\n\ سلام عزیز\n\ \n\ • اگر پیشنهادی دارید، به @HSN6789 پیام بدهید.' if r == '': mrr = mr[4:].replace(u'؟', u'').replace(u'.', u'').replace(u'!', u'').replace(u'می ', u'می').replace(u'می', u'می') mrr = normalizer.normalize(mrr) #print 'normalized user input:', mrr mm = mrr.split(' ') rgx = u'' for w in mm: rgx += w+'|' if u'می' == w[:2] and u'' != w[2] and u' ' != w[2]: rgx += u'می'+w[2:]+u'|' if len(mm) < 3: rgx = u'(' + rgx[:-1] + u') ' else: rgx = u'(' + rgx[:-1] + u')? ' rgx = rgx * len(mm) rgx = rgx[:-1] #print 'regex:', rgx try: q = Chat.select(Chat, Hoy, User).join(User).switch(Chat).join(Hoy).where(User.user.regexp(rgx)).limit(10) #print 'records founded (max 10):', len(q) if len(q) == 0: #try to fuzzy string and rematch #print 'not found!' raise else: n = 0 #rd = {n: ratio} rd = {} while n < len(q): us = q[n].user.user #print 'string founded: ', us ratio = fuzz.ratio(us, mrr) #print ratio if ratio >= 50: rd[n] = ratio n += 1 #print rd ho = '' while len(ho) == 0: maxn = max(rd.values()) n = rd.keys()[rd.values().index(maxn)] hoo = q[n].hoy.hoy #print 'founded a dict for', n try: ho = ast.literal_eval(hoo) #print 'a valid dict:', ho if 1 not in ho.values(): #print 'this dict haven\'t any valid item' raise except: #print 'deleting', rd[n] del rd[n] #print 'deleted!' 
ho = '' try: outputs = [] for key in ho.keys(): if ho[key]==1: outputs.append(key) r = normalizer.normalize(choice(outputs)) w = r.split(' ') if u'می' == w[-1][:2] and u'' != w[-1][2] and u' ' != w[-1][2]: w[-1] = u'می'+w[-1][2:] r = ' '.join(w) except: r = '' if r == '': raise except Exception as e: if re.search(u'(نظرت|نظر تو) (در مورد|درباره|دربارهٔ|درباره ی|دربارهی|راجع به|راجب) .* (چیست|چیه)', mr): r = choice([u'در مورد همه چی باید نظر بدم؟!', u'نظر خاصی ندارم.', u'در این زمینه صاحب نظر نیستم.']) elif re.search(u'؟$', mr): r = choice([u'چرا میپرسی؟', u'نپرس!', u'نمیدونم.']) elif re.search(u'!$', mr): r = choice([u'عجب!', u'چه جالب!']) elif re.search(u'\.$', mr): r = choice([u'این که پایان جملهت نقطه گذاشتی خیلی عالیه! ولی معنی جملهت رو نمیفهمم. یادم بده.']) else: r = u'نمیفهمم چی میگی. بیا خصوصی یادم بده!' #print 'erorr:', e #r = e if len(r) > 0: self.sender.sendMessage(r,parse_mode='HTML')
    # precision: average of the per-class precisions over classes that were predicted at all
    col_sums = confusion_matrix.sum(0)
    precision = (
        confusion_matrix.diagonal()[col_sums.nonzero()] /
        col_sums[col_sums.nonzero()]).sum() / len(col_sums[col_sums.nonzero()])
    row_sums = confusion_matrix.sum(1)
    # recall = (
    #     confusion_matrix.diagonal()[row_sums.nonzero()] /
    #     row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()])
    # print(labels)
    # print(confusion_matrix)
    return precision


if __name__ == '__main__':
    rd = HamshahriReader(config.corpora_root)
    counter = Counter()
    docs = []
    normalizer = Normalizer()
    stemmer = Stemmer()
    for doc in rd.docs(count=config.documents_count):
        doc['text'] = normalizer.normalize(doc['text'])
        doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])]
        counter.update([doc['cat']])
        docs.append(doc)
    print(counter)

    all_words = []
    for doc in docs:
        all_words.extend(doc['words'])
    dist = nltk.FreqDist(word for word in all_words)
    word_features = dimension_reduction(all_words, dist)
def normalizefarsi(bot, update):
    normalizer = Normalizer()
    s = normalizer.normalize(update.message.text)
    bot.sendMessage(update.message.chat_id, text=s)
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, POSTagger, DependencyParser
from InformationExtractor import InformationExtractor
from progress.bar import Bar

hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()
texts = []

output = open('informations.txt', 'w')
for text in Bar(max=310000).iter(hamshahri.texts()):
    texts.append(normalizer.normalize(text))
    if len(texts) <= 1000:
        continue

    sentences = []
    for text in texts:
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence)
            if len(words) >= 3:
                sentences.append(words)
    texts = []

    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)

    for sentence in parsed:
        # print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)