def test_word_visualization(model_path, some_words):
    """Load a word2vec model, look up the given words and show the plot.

    Every word is normalized with ``Normalizer`` and its vector is fetched
    when the normalized form is in the model vocabulary.  The collected
    vectors are not used afterwards; the plot is drawn by ``W2VPersianVis``
    from ``model_path`` and the raw ``some_words``.
    """
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)

    vectors = []
    for word in some_words:
        if normalizer.normalize(word) in model.vocab.keys():
            vectors.append(model[normalizer.normalize(word)])

    visualizer = W2VPersianVis(model_path, selected_words=some_words)
    visualizer.show_plot()
def test_word_visualization(model_path, some_words):
    """Load a word2vec model, look up the given words and show the plot.

    :param model_path: path passed to ``word2vec.Word2Vec.load`` and to
        ``W2VPersianVis``.
    :param some_words: iterable of words to look up and to plot.
    """
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)

    # Normalize each word once and test membership on the vocabulary mapping
    # itself instead of on a fresh ``keys()`` object per word.
    normalized_words = (normalizer.normalize(word) for word in some_words)
    # The vectors are only looked up (a smoke test that the model answers for
    # known words); the plot below is built from the raw word list.
    vectors = [model[word] for word in normalized_words if word in model.vocab]

    rd = W2VPersianVis(model_path, selected_words=some_words)
    rd.show_plot()
def pipeline_sentence(sentence, model, tokenizer):
    """Run an NER pipeline on a sentence and on its lemmatized form.

    :param sentence: raw input text; passed through ``change_words`` and the
        hazm ``Normalizer`` first.
    :param model: model handed to ``pipeline("ner", ...)``.
    :param tokenizer: tokenizer handed to ``pipeline("ner", ...)``.
    :return: tuple ``(ner_of_sentence, ner_of_lemmatized, lemmatized_sentence,
        normalized_sentence)``.
    """
    normalizer = Normalizer()
    # Build the lemmatizer once; the original constructed one per token.
    lemmatizer = Lemmatizer()

    sentence = normalizer.normalize(change_words(sentence))
    # The text is normalized a second time before tokenizing, exactly as the
    # original did, so the token stream is unchanged.
    tokens = word_tokenize(normalizer.normalize(sentence))
    sentence_lem = ' '.join([lemmatizer.lemmatize(token) for token in tokens])

    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
def doc_normalizer(doc):
    """Return a list with every item of ``doc`` normalized.

    :param doc: iterable of strings (the original required an indexable
        sequence; any iterable is accepted now).
    :return: new list of normalized strings, in input order.
    """
    normalizer = Normalizer()
    return [normalizer.normalize(item) for item in doc]
def process_text(self, text: str) -> Dict[str, int]:
    """Split a text into words and count them.

    When ``self.persian_normalize`` is set, the text is first normalized with
    the Hazm ``Normalizer``.  When ``self.include_numbers`` is not set, all
    Persian, English and Arabic digits are removed from the text.

    :param text: The text to process.
    :return: a dictionary whose keys are words and whose values are their
        frequencies.
    """
    if version < '3' and type(text) is unicode:  # noqa: F821
        flags = re.UNICODE
    else:
        flags = 0

    if self.persian_normalize:
        text = Normalizer().normalize(text)
    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    # A custom regexp takes precedence over the default word tokenizer.
    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        return unigrams_and_bigrams(words, self.normalize_plurals)
    word_counts, _ = process_tokens(words, self.normalize_plurals)
    return word_counts
def score(self, sentences):
    """Return an averaged sentiment score for a text.

    The text is split into sentences, each sentence is normalized and
    tokenized, and every token is classified as ``'pos'``, ``'neg'`` or
    ``'neu'`` by the model returned from ``self.__get_model()``.

    :param sentences: the text to score (a single string).
    :return: ``(positive_ratio - negative_ratio) / 2`` where each ratio is the
        class count divided by the total number of tokens; ``0.0`` when the
        text contains no tokens.
    """
    classifier = self.__get_model()
    normalizer = Normalizer()

    pos = neg = total_words = 0
    for sentence in sent_tokenize(sentences):
        words = word_tokenize(normalizer.normalize(sentence))
        # Count tokens over the whole text; the original divided by the
        # length of the last sentence only.
        total_words += len(words)
        for word in words:
            # NOTE(review): the original called stemmer.stem(word) and threw
            # the result away, so the classifier has always seen the raw
            # token.  That is preserved; confirm whether stems were intended.
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'pos':
                pos += 1
            elif class_result == 'neg':
                neg += 1

    if not total_words:
        # No sentences or only empty ones: nothing to score.
        return 0.0

    positive_sentiment = float(pos) / total_words
    negative_sentiment = -float(neg) / total_words
    return (positive_sentiment + negative_sentiment) / 2
def prepare_line(line):
    """Normalize one line of text and space out punctuation and digits.

    On first use the module-level ``normalizer`` and the coding tables
    (``incorrect``, ``correct``) are initialised.  The line is normalized,
    its codings corrected, and then a fixed sequence of regex substitutions
    is applied.

    :param line: the text line to clean.
    :return: the cleaned, stripped line.
    """
    # Only the names assigned here need the ``global`` declaration; the
    # others (punct_str, digits, ...) are merely read.
    global normalizer, incorrect, correct
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = CorrectCodings.loadCodings("TableCodings.txt")

    line = normalizer.normalize(line)
    line = CorrectCodings.CorrectCodingInLine(line, incorrect, correct)

    # NOTE(review): ``(.)*`` is greedy, so this removes everything from
    # "http" to the end of the line, not just the URL - confirm intent.
    line = re.sub(r"https?(.)*[^\s]+", r" ", line)
    # Turn a literal backslash-n sequence into a real newline.
    line = re.sub(r"\\n", "\n", line)
    line = re.sub(r"([^\sا-ی۰-۹a-zA-Z\d])", r" \1 ", line)

    # The original repeated this identical substitution once per entry of
    # ``punctuations``; the extra spaces it produced are squeezed below, so
    # one pass gives the same result.  No pass at all when the list is empty.
    if punctuations:
        line = re.sub(r"([" + punct_str + "])", r" \1 ", line)

    line = re.sub(r"([" + digits + "]+)", r" \1 ", line)
    line = re.sub(r" +", r" ", line)
    line = re.sub(r"\n+", r" \n ", line)
    line = re.sub("[" + whitespace_chars + "]+", r" ", line)
    return line.strip()
def handle(self, *args, **options):
    """Compute and store an embedding for every not-yet-vectorized article.

    For each ``Article`` with ``is_vectorized=False`` the text is normalized,
    stripped of punctuation, split into words longer than two characters,
    and the mean fastText word vector is L2-normalized and saved as an
    ``ArticleVector``.  Failures on one article are printed and do not stop
    the run.

    :param options: must contain ``'path'``, the fastText model file.
    """
    articles = Article.objects.filter(is_vectorized=False)
    normalizer = Normalizer()
    model = fasttext.load_model(options['path'])
    # Build the translation table once instead of once per article.
    strip_punctuation = str.maketrans('', '', punctuation)

    for index, article in enumerate(articles):
        if index % 100 == 0:
            print(index)
        try:
            text = normalizer.normalize(article.text)
            text = text.translate(strip_punctuation)
            words = [word for word in text.split() if len(word) > 2]
            if not words:
                # mean() of an empty list is NaN; do not store that.
                print('article %d has no usable words, skipped' % index)
                continue

            vector = nan_to_num(
                mean([model.get_word_vector(w) for w in words], axis=0))
            norm = vector.dot(vector) ** 0.5
            if not norm:
                # Dividing by a zero norm would yield a NaN embedding.
                print('article %d has a zero vector, skipped' % index)
                continue
            vector = vector / norm

            ArticleVector(article=article, embedding=vector.tolist()).save()
            article.is_vectorized = True
            article.save()
        except Exception as e:
            # Best effort per article: report and continue with the next.
            print(e)
def prepare():
    """Print where each punctuation label first occurs in a sample text, then exit.

    The sample paragraph is normalized and, for each of the three label
    characters, the index returned by ``str.find`` is printed.  The function
    then terminates the interpreter with status 0.

    The original contained further code after the exit call (a loop building
    ``(sent, {'entities': ...})`` records) that could never run and would
    have raised ``TypeError``; it and the unused stemmer/tokenizer objects
    were removed.
    """
    normalizer = Normalizer()
    string = '''ویکی پدیای انگلیسی در تاریخ ۱۵ ژانویه ۲۰۰۱ (۲۶ دی ۱۳۷۹) به صورت مکملی برای دانشنامهٔ تخصصی نیوپدیا نوشته شد. بنیان گذاران آن «جیمی ویلز» و «لری سنگر» هستند. هم اکنون بنیاد غیرانتفاعی ویکی مدیا پروژهٔ ویکی پدیا را پشتیبانی می کند. میزبان های اینترنتی اصلی این وبگاه در شهر تامپای فلوریدا هستند؟ همچنین میزبان های اضافی دیگری هم در شهرهای آمستردام، شیراز و سئول به این وبگاه یاری می رسانند؟'''
    labels = {'،': 'COMMA', '.': 'DOT', '؟': 'QMARK'}

    normal_string = normalizer.normalize(string)
    for label in labels:
        print(normal_string.find(label))

    # Same effect as the original ``exit(0)`` without relying on the
    # interactive ``site`` helper being installed.
    raise SystemExit(0)
def normalize(self):
    """Normalize every non-empty line of ``self.data``.

    Each normalized line is appended to ``self.normalize_text``.

    :return: ``self.normalize_text`` after the new lines were appended.
    """
    line_normalizer = Normalizer()
    for raw_line in self.data.split('\n'):
        if raw_line == "":
            continue
        self.normalize_text.append(line_normalizer.normalize(raw_line))
    return self.normalize_text
def statement_pre_processing(input_statement):
    """Normalize a statement and return the lemmas of its non-stop words.

    :param input_statement: text to process.
    :return: list of lemmatized tokens, skipping tokens found in ``stops``.
    """
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    lemmas = []
    for token in word_tokenize(normalizer.normalize(input_statement)):
        if token in stops:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
def tokenize(paragraph, wanted_list):
    """Append the sentences of a normalized paragraph to ``wanted_list``.

    :param paragraph: text to normalize and split into sentences.
    :param wanted_list: list that receives the sentences (mutated in place).
    """
    normalizer_options = dict(
        remove_extra_spaces=True,
        punctuation_spacing=True,
        persian_style=False,
        persian_numbers=False,
        remove_diacritics=False,
        affix_spacing=False,
        token_based=False,
    )
    normalized_paragraph = Normalizer(**normalizer_options).normalize(paragraph)
    for sentence in sent_tokenize(normalized_paragraph):
        wanted_list.append(sentence)
class PoemSentences(object):
    """Iterable over the tokenized lines of every file in a directory."""

    def __init__(self, poems_path):
        """Remember the directory to read and create the normalizer.

        :param poems_path: directory whose files are read line by line.
        """
        self.poems_path = poems_path
        self.normalizer = Normalizer()

    def __iter__(self):
        """Yield one token list per line of each file in ``poems_path``."""
        for poem_file in os.listdir(self.poems_path):
            # ``with`` closes each file once it is exhausted (or when the
            # generator is closed); the original leaked the handle.
            with open(os.path.join(self.poems_path, poem_file)) as poem:
                for sentence in poem:
                    yield word_tokenize(
                        self.normalizer.normalize(
                            sentence.replace('هٔ', 'ه')))
def tokenizer(input_var):
    """Normalize, tokenize and lemmatize a text.

    Three ``Normalizer`` instances, each with a different single positional
    flag enabled, are applied in turn (third, then second, then first).  The
    tokens are lemmatized; for a lemma containing ``#`` with a non-empty part
    before it, that part plus ``"ن"`` is kept, otherwise the lemma with every
    ``#`` removed.

    :param input_var: text to process.
    :return: list of processed tokens.
    """
    first_pass = Normalizer(True, False, False)
    second_pass = Normalizer(False, True, False)
    third_pass = Normalizer(False, False, True)
    splitter = WordTokenizer(False)

    text = third_pass.normalize(input_var)
    text = second_pass.normalize(text)
    text = first_pass.normalize(text)
    words = splitter.tokenize(text)

    lemmatizer = Lemmatizer()
    tokenized = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        head, separator, _ = lemma.partition("#")
        if separator and head:
            tokenized.append(head + "ن")
        else:
            tokenized.append(lemma.replace("#", ""))
    return tokenized
def preprocess(self, cm):
    """Clean a comment and drop its stop words.

    Punctuation characters are removed, numbers are converted with
    ``self._numbers_to_english`` and ASCII digits are deleted, a handful of
    characters are replaced, and the text is normalized and tokenized.

    :param cm: the comment; converted with ``str()`` first.
    :return: the remaining tokens joined with single spaces.
    """
    kept_chars = [char for char in str(cm) if char not in punctuation]
    cm = self._numbers_to_english(''.join(kept_chars))
    cm = re.sub(r"[0-9]", '', cm)

    # Applied in this order, one after the other.
    replacements = (
        ('\u200c', ' '),
        ('\n', ''),
        ('\r', ''),
        ('ي', 'ی'),
        ('ك', 'ک'),
    )
    for old, new in replacements:
        cm = cm.replace(old, new)

    tokens = word_tokenize(Normalizer().normalize(cm))
    return ' '.join([token for token in tokens if token not in self.stopwords])
def entofa(bot, update):
    """Reply with the message text after mapping Latin keys to Persian letters.

    Every character of ``eng`` found in the incoming message text is replaced
    by the entry at the same position of ``per``; the result is normalized
    and sent back to the chat the message came from.
    """
    per = ["ﺽ", "ﺹ", "ﺙ", "ﻕ", "ﻑ", "ﻍ", "ﻉ", "ﻩ", "ﺥ", "ﺡ", "ﺝ", "چ",
           "ﺵ", "ﺱ", "ی", "ﺏ", "ﻝ", "ﺍ", "ﺕ", "ﻥ", "ﻡ", "ک", "گ",
           "ﻅ", "ﻁ", "ﺯ", "ﺭ", "ﺫ", "ﺩ", "پ", "ﻭ"]
    eng = ["q", "w", "e", "r", "t", "y", "u", "i", "o", "p", "[", "]",
           "a", "s", "d", "f", "g", "h", "j", "k", "l", ";", "'",
           "z", "x", "c", "v", "b", "n", "m", ","]

    converted = update.message.text
    # Both lists have the same length, so pairing them covers every entry.
    for latin_key, persian_letter in zip(eng, per):
        converted = converted.replace(latin_key, persian_letter)

    converted = Normalizer().normalize(converted)
    bot.sendMessage(update.message.chat_id, text=converted)
def preprocess(doc):
    """Normalize a document and return the lemmas of its content words.

    The normalized text is split on spaces and hyphens, stop words are
    dropped, the rest is stemmed, non-alphanumeric stems are discarded and
    the survivors are lemmatized.

    :param doc: text to process.
    :return: list of lemmatized tokens.
    """
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()

    tokens = re.split(' |-', normalizer.normalize(doc))
    # One filtering pass; the original called list.remove() per stop word,
    # which rescans the list each time and gives the same final list.
    content_tokens = [w for w in tokens if w not in stopwords]
    stemmed = [stemmer.stem(w) for w in content_tokens]
    return [lemmatizer.lemmatize(w) for w in stemmed if w.isalnum()]
def dataset_cleaner(dataset):
    """Turn every statement of a dataset into a list of lemmas.

    :param dataset: iterable of strings (the original required an indexable
        sequence; any iterable is accepted now).
    :return: list with one lemma list per statement; tokens found in
        ``stops`` are skipped.
    """
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()

    statements = []
    for statement in dataset:
        tokens = word_tokenize(normalizer.normalize(statement))
        statements.append(
            [lemmatizer.lemmatize(word) for word in tokens
             if word not in stops])
    return statements
class Summarizer(object):
    """Extractive summarizer that picks sentences containing frequent words."""

    def __init__(self):
        self.normalizer = Normalizer()

    def summarize(self, input):
        """Return the selected sentences of ``input`` in document order.

        :param input: text to summarize.
        :return: list of sentences.
        """
        self.input = self.normalizer.normalize(input)
        self.base_words = word_tokenize(self.input)
        self.working_sentences = sent_tokenize(self.input)
        self.sentences_number = len(self.working_sentences)
        return self._get_summarize(num_sentences=self._find_num_sentences())

    def _find_num_sentences(self):
        """Return how many sentences the summary should contain.

        Texts with fewer than six sentences are returned whole; longer texts
        get ``int(log(n)**2 + 1) + 1`` sentences.
        """
        if self.sentences_number >= 6:
            return int(math.log(self.sentences_number) ** 2 + 1) + 1
        return self.sentences_number

    def _get_summarize(self, num_sentences):
        """Pick up to ``num_sentences`` sentences, one per frequent word."""
        # Load the stop-word list once, as a set, instead of once per word.
        persian_stopwords = set(stopwords.words('persian'))
        words = [word for word in self.base_words
                 if word not in persian_stopwords]
        # ``items()[:100]`` is not subscriptable on Python 3 and is not
        # ordered by frequency in current NLTK; ``most_common`` is both.
        most_frequent_words = [
            word for word, _ in FreqDist(words).most_common(100)
        ]

        actual_sentences = sent_tokenize(self.input)
        output_sentences = []
        for word in most_frequent_words:
            if len(output_sentences) >= num_sentences:
                break
            # Take the first sentence containing the word that is not
            # already part of the summary.
            for i, candidate in enumerate(self.working_sentences):
                if (word in candidate
                        and actual_sentences[i] not in output_sentences):
                    output_sentences.append(actual_sentences[i])
                    break
        return self._reorder_sentences(output_sentences)

    def _reorder_sentences(self, output_sentences):
        """Sort the sentences in place by their position in ``self.input``."""
        # A key function replaces the Python-2-only comparison function.
        output_sentences.sort(key=self.input.find)
        return output_sentences
def document(filepath):
    """Read a text file and return its content words.

    The file is read as UTF-8 (undecodable bytes are ignored), punctuation is
    removed with ``remove_punctuation``, and the text is normalized and
    tokenized.

    :param filepath: path of the file to read.
    :return: list of tokens that are neither in ``stop_words`` nor purely
        digits.
    """
    # ``with`` guarantees the file is closed even if reading fails.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        txt = f.read()

    txt = remove_punctuation(txt)
    txt = Normalizer().normalize(txt)
    return [word for word in word_tokenize(txt)
            if word not in stop_words and not word.isdigit()]
def prepare_line(line):
    """Normalize one line of text and space out punctuation and digits.

    The module-level ``normalizer`` and coding tables are created on first
    use.  After normalization and coding correction, punctuation characters
    and digit runs are surrounded by spaces, newline runs become a single
    spaced newline, and whitespace runs are squeezed to one space.

    :param line: the text line to clean.
    :return: the cleaned, stripped line.
    """
    global normalizer, incorrect, correct
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = loadCodings("TableCodings.txt")

    cleaned = CorrectCodingInLine(normalizer.normalize(line), incorrect, correct)

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"([" + re.escape(punct_str) + "])", r" \1 "),
        (r"([" + digits + "]+)", r" \1 "),
        (r"\n+", r" \n "),
        ("[" + whitespace_chars + "]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def clean(sentence):
    """Strip leading digits and non-Farsi characters, then normalize.

    Leading characters found in ``FARSI_DIGITS`` or ``ENGLISH_DIGITS`` are
    cut off; of the remainder only characters in ``FARSI_ALPHABET`` or
    ``FARSI_DIGITS`` are kept.

    :param sentence: text to clean.
    :return: the normalized result.
    """
    # Length of the leading run of digits.
    leading_digits = 0
    for char in sentence:
        if not (char in FARSI_DIGITS or char in ENGLISH_DIGITS):
            break
        leading_digits += 1

    kept = [
        char for char in sentence[leading_digits:]
        if char in FARSI_ALPHABET or char in FARSI_DIGITS
    ]
    return Normalizer().normalize("".join(kept))
def clean_tweet(tweet):
    """Reduce a tweet to its content words for a word cloud.

    The tweet is lower-cased; ``#`` signs, links, mentions, emoji,
    punctuation and reserved words are removed; the text is normalized;
    Arabic ``ي`` is replaced with Persian ``ی``; present-tense verbs written
    with the ``می`` prefix are dropped; finally digits and Persian/English
    stop words are filtered out.

    :param tweet: the tweet; converted with ``str()`` first.
    :return: the remaining tokens joined with single spaces.
    """
    tweet = str(tweet).lower()
    # Remove '#' so the hashtag words themselves are kept for the cloud.
    tweet = tweet.replace("#", "")
    tweet = remove_links(tweet)
    tweet = remove_mentions(tweet)
    tweet = remove_emoji(tweet)
    tweet = remove_punctuations(tweet)
    tweet = remove_reserved_words(tweet)

    tweet = Normalizer().normalize(tweet)
    # Replace Arabic yeh with Persian yeh.
    tweet = tweet.replace('ي', 'ی')
    # Drop verbs with the (negated) "mi" prefix followed by a zero-width
    # non-joiner.  The joiner is spelled as an escape: as an invisible
    # literal inside the brackets it is easily lost, which leaves an
    # unterminated character set.
    tweet = re.sub(r'ن?می[\u200c]\S+', '', tweet)

    tokens = [
        token for token in word_tokenize(tweet)
        if not token.isdigit()
        and token not in stopwords.persian
        and token not in stopwords.english
    ]
    return " ".join(tokens).strip()
def tok(dataTok):
    """Tokenize every sentence of every row and collect it in ``dataTok1``.

    Each row is split into sentences, passed through ``stop_word``, stripped
    of four punctuation marks, normalized and tokenized; the token lists are
    appended to the module-level ``dataTok1``.  The collected count and the
    elapsed time are printed at the end.

    :param dataTok: iterable of text rows.
    """
    normalizer = Normalizer()
    tokenizer = WordTokenizer(join_verb_parts=False,
                              replace_links=True,
                              replace_IDs=True,
                              replace_numbers=True,
                              replace_hashtags=True)
    started = time.time()

    for row in dataTok:
        for sentence in stop_word(sent_tokenize(row)):
            for mark in (".", ",", "،", "؛"):
                sentence = sentence.replace(mark, "")
            normalized = normalizer.normalize(sentence.strip())
            dataTok1.append(tokenizer.tokenize(normalized))

    print("Data: ", len(dataTok1))
    finished = time.time()
    print("Tokenize Done, Time: ", finished - started, " !\n")
def on_chat_message(self, msg): normalizer = Normalizer() #if msg.has_key(u'document'): #self.sender.downloadFile(msg[u'document'][u'file_id'], file_path="~/dl") m = msg['text'].split(' ') mr = msg['text'] fn = msg['from']['first_name'] chat_type = msg['chat']['type'] user_id = msg['from']['id'] r = '' if m[0] == u'/start': r = u'سلام به تو که اسمتو گذاشتی ' + unicode(fn) elif m[0] == u'mojose': r = msg if chat_type == 'private' and mr[:3] != u'هوی': mr = u'هوی ' + mr m = mr.split(' ') if user_id == 170378225: #global ddd = {index, keyOf dd } global h_id global d global ddd global q2 #get outputs from db if m[1] == u'g': #print 'g' try: q = Hoy.select(User, Hoy).join(Chat).join(User).where(Hoy.hoy.contains(': 0')).get() h_id = q.id q2 = User.select().join(Chat).join(Hoy).where(Hoy.id==h_id) d = ast.literal_eval(q.hoy) o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o except: r = 'چیزی برای تأیید نیست!' elif mr[4:6] == u'g\n': #print 'g2' mrc = mr[4:] mc = mrc.split('\n') user_input = mc[1] try: q = Hoy.select(User, Hoy).join(Chat).join(User).where(User.user == user_input).get() h_id = q.id q2 = User.select(Hoy, User).join(Chat).join(Hoy).where(Hoy.id==h_id) d = ast.literal_eval(q.hoy) o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o except: r = 'نبود که!' 
elif mr[4:7] == 'gg\n': #print 'gg' mrr = mr[7:].replace(u'؟', u'').replace(u'.', u'').replace(u'!', u'').replace(u'می ', u'می').replace(u'می', u'می') mrr = normalizer.normalize(mrr) #print 'normalized user input:', mrr mm = mrr.split(' ') rgx = u'' for w in mm: rgx += w+'|' if u'می' == w[:2] and u'' != w[2] and u' ' != w[2]: rgx += u'می'+w[2:]+u'|' if len(mm) < 3: rgx = u'(' + rgx[:-1] + u') ' else: rgx = u'(' + rgx[:-1] + u')? ' rgx = rgx * len(mm) rgx = rgx[:-1] #print 'regex:', rgx try: q = Chat.select(Chat, Hoy, User).join(User).switch(Chat).join(Hoy).where(User.user.regexp(rgx)).limit(10) #print 'records founded (max 10):', len(q) if len(q) == 0: #try to fuzzy string and rematch #print 'not found!' raise else: n = 0 #rd = {n: ratio} rd = {} while n < len(q): us = q[n].user.user #print 'string founded: ', us ratio = fuzz.ratio(us, mrr) #print ratio if ratio >= 50: rd[n] = ratio n += 1 #print rd ho = '' while len(ho) == 0: maxn = max(rd.values()) n = rd.keys()[rd.values().index(maxn)] hoo = q[n].hoy.hoy #print 'founded a dict for', n try: ho = ast.literal_eval(hoo) #print 'a valid dict:', ho user_input = q[n].user.user if 1 not in ho.values(): #print 'this dict haven\'t any valid item' raise except: #print 'deleting', rd[n] del rd[n] #print 'deleted!' ho = '' user_input = '' except: #print 'eee!' pass try: q = Hoy.select(User, Hoy).join(Chat).join(User).where(User.user == user_input).get() h_id = q.id q2 = User.select(Hoy, User).join(Chat).join(Hoy).where(Hoy.id==h_id) d = ast.literal_eval(q.hoy) o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o except: r = 'نبود که!' 
#review items elif m[1] == u'r': o = '' i = 0 d_iter = d.iteritems() for (k, v) in (d_iter): o += str(i)+' : '+k+' : '+str(v)+'\n' i += 1 i = 0 d_k = d.keys() dd = {} for k in d_k: dd[i] = k i += 1 ddd = dd inputs = '' for i in q2: inputs += i.user + '\n' r = inputs+'\n-----------\n'+o #commit changes elif m[1] == u'c': d_i = d.items() for k, v in d_i: if v == 0: del d[k] Hoy.update(hoy=d).where(Hoy.id == h_id).execute() d = {} ddd = {} inputs = '' r = 'تغییرات ذخیره شد!' #change state of an item elif len(m) == 2: try: i = int(m[1]) if d[ddd[i]] == 0: d[ddd[i]] = 1 else: d[ddd[i]] = 0 r = ddd[i] + ' : ' + str(d[ddd[i]]) except: pass #if m[1] == 'grupoj': #TODO merge same outputs if '\n' in mr and u'\nبگو\n' in mr and r == '': mrc = normalizer.normalize(mr[4:]) mc = mrc.split('\n') say_index = mc.index(u'بگو') user_inputs = mc[:say_index] hoy_outputs = mc[say_index+1:] hoy_outputs = {k:0 for k in hoy_outputs} hoy_outputs_old = {} for user_input in user_inputs: try: H = (Hoy.select().join(Chat).join(User).where(User.user==user_input)) hoy_outputs_old = H[0].hoy h_id = H[0].id hoy_outputs_old = ast.literal_eval(hoy_outputs_old) del user_inputs[user_inputs.index(user_input)] except: pass if hoy_outputs_old == {}: h = Hoy.create(hoy=hoy_outputs) r = u'پاسخهای شما در صف بررسی قرار گرفت. تا ارباب چی بگن!' else: try: hoy_outputs.update(hoy_outputs_old) update_query = Hoy.update(hoy=hoy_outputs).where(Hoy.id==h_id) update_query.execute() h = Hoy.get(Hoy.id==h_id) r = u'پاسخهای شما نیز در صف بررسی قرار گرفت. تا ارباب چی بگن!' except Exception as e: pass #print e try: for user_input in user_inputs: u, created = User.get_or_create(user=user_input) if created: Chat.create(user=u, hoy=h) except Exception as e: pass #print e elif '\n' in mr and u'\nنفهم' in mr and r == '' and user_id == 170378225: mrc = mr[4:] mc = mrc.split('\n') say_index = mc.index(u'نفهم') user_input = mc[:say_index] try: dq = User.delete().where(User.user==user_input[0]) dq.execute() r = u'اطاعت! 
دیگر به چنین چیزی پاسخ نمیدهم.' #TODO delete u_id that not exist in User, from Chat except: r = u'چنین چیزی وجود ندارد!' elif m[0] == u'هوی': if re.search(u'تخم|کیر|کسخل|کون|کون|الاغ|الاق|جنده|گای|پستون|ممه|گوز|شاش|جیش|قبحه|جلق|جق|سگ|جاکش|گائ|گاتو|کیون|لاشی|گامو|فاک|ساک|کُس|کوس|کوص|کص|سکس|پورن|الکسیس|گاشو', mr) \ or re.search(u'(^| )رید(.|$)', mr) or u'خرم' in m or u'خری' in m or u'خره' in m or u'گا' in m or u'شق' in m or u'منی' in m or re.search(u'(^| )حشری(.|$)', mr): r = choice([u'بیادب :|', u'بیتربیت :|', u'بیشخصیت :|',u'عفت کلام داشته باش یه ذره :|', u'دهنتو آب بکش :|']) #elif m[1] == u'سلام' or m[1] == u'درود': #r = choice([u'سلام', u'علیک سلام']) elif len(m) >= 3 and m[1] == u'بگو': r = normalizer.normalize(mr[8:]) elif len(m) == 3: m2 = m[1]+' '+m[2] if m2 == u'چه خبر؟': response = urllib2.urlopen('http://www.farsnews.com/RSS') rss = response.read() soup = BeautifulSoup.BeautifulSoup(rss) all_title = soup.findAll('title') def get_link(nth): item = soup.findAll('item')[nth] link = re.search(r'http://www.farsnews.com/(\d+)',unicode(item)).group(0) return link r = unicode(all_title[2]).replace('<title>', '<a href="%s">'%get_link(0), 2).replace('</title>', '</a>') + '\n\n' + \ unicode(all_title[3]).replace('<title', '<a href="%s"'%get_link(1), 2).replace('</title>', '</a>') + '\n\n' + \ unicode(all_title[4]).replace('<title', '<a href="%s"'%get_link(2), 2).replace('</title>', '</a>') elif len(m) == 2: if m[1] == u'راهنما': r = u'• به این شکل هوی را آموزش دهید:\n\ \n\ سلام\n\ درود\n\ بگو\n\ علیک سلام\n\ سلام حاجی\n\ \n\ !> دقت کنید که در یک پیام و در خطهای جدا باشد.\n\ \n\ !> اگر در گروه آموزشش میدهید، ابتدا هوی بنویسید و سپس مثل بالا خطوط را وارد کنید. 
این دو شکل قابل قبول است:\n\ \n\ هوی سلام\n\ درود\n\ بگو\n\ علیک سلام\n\ سلام حاجی\n\ ---------\n\ هوی\n\ سلام\n\ درود\n\ بگو\n\ علیک سلام\n\ سلام حاجی\n\ \n\ • آموختهها پس از تأیید به نمایش در میآیند.\n\ \n\ !> آموختههایی که به اشخاص مربوط است و جنبهٔ عمومی ندارد، تأیید نمیشود.\n\ !> آموختههای شامل حرف بد، توهین و… تأیید نمیشود.\n\ !> آموختههای دارای اشتباه نوشتاری تأیید نمیشود.\n\ \n\ • اگر مثلاً «سلام» برای هوی تعریف شده باشد، میتواند اینگونه از پاسخهای «سلام» برای «هلو» هم استفاده کند:\n\ \n\ سلام\n\ هلو\n\ بگو\n\ علیک\n\ های\n\ سلام عزیز\n\ \n\ • اگر پیشنهادی دارید، به @HSN6789 پیام بدهید.' if r == '': mrr = mr[4:].replace(u'؟', u'').replace(u'.', u'').replace(u'!', u'').replace(u'می ', u'می').replace(u'می', u'می') mrr = normalizer.normalize(mrr) #print 'normalized user input:', mrr mm = mrr.split(' ') rgx = u'' for w in mm: rgx += w+'|' if u'می' == w[:2] and u'' != w[2] and u' ' != w[2]: rgx += u'می'+w[2:]+u'|' if len(mm) < 3: rgx = u'(' + rgx[:-1] + u') ' else: rgx = u'(' + rgx[:-1] + u')? ' rgx = rgx * len(mm) rgx = rgx[:-1] #print 'regex:', rgx try: q = Chat.select(Chat, Hoy, User).join(User).switch(Chat).join(Hoy).where(User.user.regexp(rgx)).limit(10) #print 'records founded (max 10):', len(q) if len(q) == 0: #try to fuzzy string and rematch #print 'not found!' raise else: n = 0 #rd = {n: ratio} rd = {} while n < len(q): us = q[n].user.user #print 'string founded: ', us ratio = fuzz.ratio(us, mrr) #print ratio if ratio >= 50: rd[n] = ratio n += 1 #print rd ho = '' while len(ho) == 0: maxn = max(rd.values()) n = rd.keys()[rd.values().index(maxn)] hoo = q[n].hoy.hoy #print 'founded a dict for', n try: ho = ast.literal_eval(hoo) #print 'a valid dict:', ho if 1 not in ho.values(): #print 'this dict haven\'t any valid item' raise except: #print 'deleting', rd[n] del rd[n] #print 'deleted!' 
ho = '' try: outputs = [] for key in ho.keys(): if ho[key]==1: outputs.append(key) r = normalizer.normalize(choice(outputs)) w = r.split(' ') if u'می' == w[-1][:2] and u'' != w[-1][2] and u' ' != w[-1][2]: w[-1] = u'می'+w[-1][2:] r = ' '.join(w) except: r = '' if r == '': raise except Exception as e: if re.search(u'(نظرت|نظر تو) (در مورد|درباره|دربارهٔ|درباره ی|دربارهی|راجع به|راجب) .* (چیست|چیه)', mr): r = choice([u'در مورد همه چی باید نظر بدم؟!', u'نظر خاصی ندارم.', u'در این زمینه صاحب نظر نیستم.']) elif re.search(u'؟$', mr): r = choice([u'چرا میپرسی؟', u'نپرس!', u'نمیدونم.']) elif re.search(u'!$', mr): r = choice([u'عجب!', u'چه جالب!']) elif re.search(u'\.$', mr): r = choice([u'این که پایان جملهت نقطه گذاشتی خیلی عالیه! ولی معنی جملهت رو نمیفهمم. یادم بده.']) else: r = u'نمیفهمم چی میگی. بیا خصوصی یادم بده!' #print 'erorr:', e #r = e if len(r) > 0: self.sender.sendMessage(r,parse_mode='HTML')
# confusion_matrix.diagonal()[row_sums.nonzero()] / # row_sums[row_sums.nonzero()]).sum() / len(row_sums[row_sums.nonzero()]) #print labels #print confusion_matrix return precision if __name__ == '__main__': rd = HamshahriReader(config.corpora_root) counter = Counter() docs = [] normalizer = Normalizer() stemmer = Stemmer() for doc in rd.docs(count=config.documents_count): doc['text'] = normalizer.normalize(doc['text']) doc['words'] = [stemmer.stem(word) for word in word_tokenize(doc['text'])] counter.update([doc['cat']]) docs.append(doc) print counter all_words = [] for doc in docs: all_words.extend(doc['words']) dist = nltk.FreqDist(word for word in all_words) word_features = dimension_reduction(all_words, dist) print len(word_features) / float(len(all_words)) * 100.0 features_set = [(doc_features(doc, word_features), doc['cat']) for doc in docs]
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, POSTagger, DependencyParser from InformationExtractor import InformationExtractor from progress.bar import Bar hamshahri = HamshahriReader() normalizer = Normalizer() tagger = POSTagger() parser = DependencyParser(tagger=tagger) extractor = InformationExtractor() texts = [] output = open('informations.txt', 'w') for text in Bar(max=310000).iter(hamshahri.texts()): texts.append(normalizer.normalize(text)) if len(texts) <= 1000: continue sentences = [] for text in texts: for sentence in sent_tokenize(text): words = word_tokenize(sentence) if len(words) >= 3: sentences.append(words) texts = [] tagged = tagger.batch_tag(sentences) parsed = parser.tagged_batch_parse(tagged) for sentence in parsed: # print('*', *[node['word'] for node in sentence.nodelist if node['word']], file=output)
return(train_data, train_label, validate_data, validate_label, test_data, test_label) ############################################ End of Functions ###################################### data_path = '/media/bero/9214EFDB14EFBFF9/Users/BeRo/Google Drive/Bero/arshad project/Ms_Thesis/data set/SentiPersV1.0/separated/beroData.txt' label_path = '/media/bero/9214EFDB14EFBFF9/Users/BeRo/Google Drive/Bero/arshad project/Ms_Thesis/data set/SentiPersV1.0/separated/beroLabel.txt' total_data = train_test_seperator(data_path = data_path, label_path= label_path, train_rate= train_rate, validate_rate = validate_rate) train_data = total_data[0] train_label = total_data[1] validate_data = total_data[2] validate_label = total_data[3] test_data = total_data[4] test_label = total_data[5] index = 0 for line in train_data: tmp = normalizer.normalize(line) #print(tmp) #print(sent_tokenize(tmp)) word_tokenized = word_tokenize(tmp) #print(word_tokenized) labeledSent = TaggedDocument(words = word_tokenized, tags = [index]) sentences.append(labeledSent) index += 1 num_features = 100 min_word_count = 5 context = 8 num_workers = 4 print("Training model...") model = Doc2Vec(sentences, workers=num_workers, size = num_features, min_count = min_word_count, window = context) print("model Trained.")
def normalizefarsi(bot, update):
    """Reply to a message with its normalized text.

    The text of ``update.message`` is normalized and sent back to the chat
    the message came from.
    """
    message = update.message
    normalized_text = Normalizer().normalize(message.text)
    bot.sendMessage(message.chat_id, text=normalized_text)
from __future__ import unicode_literals import os,sys,codecs from hazm import Normalizer,sent_tokenize, word_tokenize reader=codecs.open(os.path.abspath(sys.argv[1]),'r',encoding='utf-8') writer=codecs.open(os.path.abspath(sys.argv[2]),'w',encoding='utf-8') count=1 line=reader.readline() normalizer = Normalizer() while line: if count%1000==0: sys.stdout.write(str(count)+'...') if line.strip(): n=normalizer.normalize(line.strip()) tok=word_tokenize(n) sen=u' '.join(tok).replace('_',' ').replace(' ',' ').replace(' ',' ') l=sen+u'\n' writer.write(l) else: writer.write(u'\n') count+=1 line=reader.readline() sys.stdout.write('\n') writer.flush() writer.close()
labels = file_to_read.readlines() mylabel = [] for line in labels: tmp = line.split('\n') mylabel.append(int(tmp[0])) file_to_read.close() file_to_read = open(sentence_path, 'r') file_content = file_to_read.readlines() file_to_read.close() index = 0 for line in file_content: tmp = line.split('\n') tmp = tmp[0] tmp = normalizer.normalize(tmp) #print(tmp) #print(sent_tokenize(tmp)) word_tokenized = word_tokenize(tmp) #print(word_tokenized) labeledSent = TaggedDocument(words = word_tokenized, tags = [index]) sentences.append(labeledSent) index += 1 num_features = 100 min_word_count = 5 context = 8 num_workers = 4 print("Training model...") model = Doc2Vec(sentences, workers=num_workers, size = num_features, min_count = min_word_count, window = context) print("model Trained.")