def prepare():
    normalizer = Normalizer()
    stemmer = Stemmer()
    string = '''ویکی پدیای انگلیسی در تاریخ ۱۵ ژانویه ۲۰۰۱ (۲۶ دی ۱۳۷۹) به صورت مکملی برای دانشنامهٔ تخصصی نیوپدیا نوشته شد. بنیان گذاران آن «جیمی ویلز» و «لری سنگر» هستند. هم اکنون بنیاد غیرانتفاعی ویکی مدیا پروژهٔ ویکی پدیا را پشتیبانی می کند. میزبان های اینترنتی اصلی این وبگاه در شهر تامپای فلوریدا هستند؟ همچنین میزبان های اضافی دیگری هم در شهرهای آمستردام، شیراز و سئول به این وبگاه یاری می رسانند؟'''
    tokenizer = WordTokenizer(join_verb_parts=True, separate_emoji=True,
                              replace_links=True, replace_IDs=True,
                              replace_emails=True, replace_numbers=True,
                              replace_hashtags=True)
    labels = {'،': 'COMMA', '.': 'DOT', '؟': 'QMARK'}
    normal_string = normalizer.normalize(string)

    # Position of the first occurrence of each labelled punctuation mark.
    for label in labels:
        print(normal_string.find(label))

    # Build one annotation record per sentence, in the form
    # (sentence, {'entities': [(start, end, label), ...]}), e.g. (10, 15, 'PrdName').
    for i, sent in enumerate(sent_tokenize(normal_string)):
        entities = []
        for label in labels:
            print(f'{label} in {i}', label in sent)
        record = (sent, {'entities': entities})
        print()
def score(self, sentences):
    # Predict
    pos, neg, neu = 0, 0, 0
    stemmer = Stemmer()
    classifier = self.__get_model()
    normalizer = Normalizer()
    sentences = sent_tokenize(sentences)
    for sentence in sentences:
        sentence = normalizer.normalize(sentence)
        words = word_tokenize(sentence)
        for word in words:
            word = stemmer.stem(word)
            class_result = classifier.classify(self.__word_feats(word))
            if class_result == 'neg':
                neg = neg + 1
            if class_result == 'pos':
                pos = pos + 1
            if class_result == 'neu':
                neu = neu + 1

    positive_sentiment = str(float(pos) / len(words))
    # print('Positive: ' + positive_sentiment)
    neutral_sentiment = str(float(neu) / len(words))
    # print('Neutral: ' + neutral_sentiment)
    negative_sentiment = str(-float(neg) / len(words))
    # print('Negative: ' + negative_sentiment)

    total_sentiment = (float(positive_sentiment) + float(negative_sentiment)) / 2
    # print('Total (Avg): ' + str(total_sentiment))
    return total_sentiment
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If the `persian_normalize` attribute has been set to True, normalizes `text` with the Hazm Normalizer.
    If the `include_numbers` attribute has been set to False, removes all Persian, English and Arabic numbers from `text`.

    :param text: The text we want to process
    :return: a dictionary. keys are words and values are the frequencies.
    """
    flags = (
        re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
        else 0)

    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)

    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
def prepare_line(line):
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = CorrectCodings.loadCodings("TableCodings.txt")

    line = normalizer.normalize(line)
    line = CorrectCodings.CorrectCodingInLine(line, incorrect, correct)

    # remove URLs
    pat = re.compile(r"https?(.)*[^\s]+")
    line = re.sub(pat, r" ", line)

    # turn literal "\n" sequences into real newlines
    pat = re.compile(r"\\n")
    line = re.sub(pat, "\n", line)

    # put spaces around any character that is not Persian/Latin alphanumeric or whitespace
    pat = re.compile(r"([^\sا-ی۰-۹a-zA-Z\d])")
    line = re.sub(pat, r" \1 ", line)

    # put spaces around punctuation marks and digit runs
    pat = re.compile(r"([" + re.escape(punct_str) + "])")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r"([" + digits + "]+)")
    line = re.sub(pat, r" \1 ", line)

    # collapse repeated spaces, newlines and other whitespace characters
    pat = re.compile(r" +")
    line = re.sub(pat, r" ", line)
    pat = re.compile(r"\n+")
    line = re.sub(pat, r" \n ", line)
    pat = re.compile("[" + whitespace_chars + "]+")
    line = re.sub(pat, r" ", line)

    line = line.strip()
    return line
def handle(self, *args, **options):
    articles = Article.objects.filter(is_vectorized=False)
    N = Normalizer()
    FT = fasttext.load_model(options['path'])
    index = 0
    for article in articles:
        try:
            if index % 100 == 0:
                print(index)
            text = N.normalize(article.text)
            text = text.translate(str.maketrans('', '', punctuation))
            text = text.split()
            text = [word for word in text if len(word) > 2]
            # average the fastText vectors of the remaining words, then L2-normalize
            vector = nan_to_num(
                mean([FT.get_word_vector(w) for w in text], axis=0))
            vector = vector / (vector.dot(vector)) ** 0.5
            obj = ArticleVector(article=article, embedding=vector.tolist())
            obj.save()
            article.is_vectorized = True
            article.save()
            index += 1
        except Exception as e:
            print(e)
def __init__(self, inFile, outFile):
    self.inFile = inFile
    self.outFile = outFile
    self.normalizer = Normalizer()
    self.tagger = POSTagger(model='resources/postagger.model')
    self.lemmatizer = Lemmatizer()
    self.stemmer = Stemmer()
def clean_persianText(txt):
    normalizer = Normalizer()
    txt = normalizer.character_refinement(txt)
    txt = normalizer.affix_spacing(txt)
    txt = normalizer.punctuation_spacing(txt)
    txt = txt.replace('.', '')
    txt = normalizer.normalize(txt)
    return txt
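A minimal usage sketch for clean_persianText above, assuming hazm's Normalizer is imported as in the snippet; the sample sentence is illustrative only.

sample = 'متن نمونه، با فاصله گذاري و علائم.'  # hypothetical input
print(clean_persianText(sample))  # prints the text with refined characters and spacing, dots removed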
def __init__(self, path_dataset, path_stopwords):
    self.path_dataset = path_dataset
    self.path_stopwords = path_stopwords
    self.stopwords = self.file_reader(self.path_stopwords)
    self.normalizer = Normalizer()
def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement
def bigram_cleaner(text):
    text = re.sub(Text_cleaner.persian_regex, ' ', text)
    text = re.sub('[ ]+', ' ', text)
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    tokenized = word_tokenize(text)
    return tokenized
def __init__(self):
    # Persian word normalizer
    self.normalizer = Normalizer()

    # load stopwords
    logger.info(f"Loading stopwords from {DATA_DIR / 'stopwords.txt'}")
    stop_words = open(DATA_DIR / 'stopwords.txt').readlines()
    stop_words = map(str.strip, stop_words)
    self.stop_words = set(map(self.normalizer.normalize, stop_words))
def normalize(self):
    """
    Normalize each non-empty line of `self.data` with the Hazm Normalizer
    and append it to `self.normalize_text`.

    :return: the list of normalized lines
    """
    normalizer = Normalizer()
    for line in self.data.split('\n'):
        if line != "":
            self.normalize_text.append(normalizer.normalize(line))
    return self.normalize_text
def tokenize(paragraph, wanted_list):
    normal = Normalizer(remove_extra_spaces=True,
                        punctuation_spacing=True,
                        persian_style=False,
                        persian_numbers=False,
                        remove_diacritics=False,
                        affix_spacing=False,
                        token_based=False)
    for sentence in sent_tokenize(normal.normalize(paragraph)):
        wanted_list.append(sentence)
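A brief usage sketch for tokenize above, assuming hazm's sent_tokenize is in scope; the paragraph and the output list name are illustrative.

sentences = []
tokenize('این جملهٔ اول است. این هم جملهٔ دوم!', sentences)
print(sentences)  # one entry per sentence returned by sent_tokenize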
def normalize_words(words: Iterable) -> List[str]:
    """
    This method gets an Iterable containing some Farsi words as elements,
    normalizes them using Hazm and then returns a list of normalized words.

    :param words: an iterable including words
    :return: A list of normalized elements of the `words` iterable.
    """
    combined_words: str = "".join(x + "\n" for x in words)
    normalizer: Normalizer = Normalizer()
    normalized_combined_words: str = normalizer.normalize(combined_words)
    return normalized_combined_words.split("\n")
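A short usage sketch for normalize_words above, assuming hazm and the typing imports are available; the sample words are hypothetical.

words = ['كتاب', 'ميرود']  # illustrative input with Arabic-style characters
print(normalize_words(words))
# each element comes back Hazm-normalized; the final '\n' also yields a trailing empty string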
def test_word_visualization(model_path, some_words):
    normalizer = Normalizer()
    model = word2vec.Word2Vec.load(model_path)
    vectors = [
        model[normalizer.normalize(word)] for word in some_words
        if normalizer.normalize(word) in model.vocab.keys()
    ]
    # print(model[normalizer.normalize('فرهنگ')])
    # print(model.similarity('فرهنگ', 'تمدن'))
    # print(vectors)
    rd = W2VPersianVis(model_path, selected_words=some_words)
    rd.show_plot()
def preprocess(self, cm):
    cm = ''.join([c for c in str(cm) if c not in punctuation])
    cm = self._numbers_to_english(cm)
    cm = re.sub(r"[0-9]", '', cm)
    cm = (cm.replace('\u200c', ' ').replace('\n', '').replace('\r', '')
            .replace('ي', 'ی').replace('ك', 'ک'))
    normalizer = Normalizer()
    cm = normalizer.normalize(cm)
    tokens = word_tokenize(cm)
    cm = ' '.join([x for x in tokens if x not in self.stopwords])
    return cm
def tokenizer(input_var):
    tokenized = []
    normalizer1 = Normalizer(True, False, False)
    normalizer2 = Normalizer(False, True, False)
    normalizer3 = Normalizer(False, False, True)
    word_tokenizer = WordTokenizer(False)
    input_var = normalizer1.normalize(
        normalizer2.normalize(normalizer3.normalize(input_var)))
    actual = word_tokenizer.tokenize(input_var)
    lemmatizer = Lemmatizer()
    # stemmer = Stemmer()
    for x in actual:
        # print(x)
        s = lemmatizer.lemmatize(x)
        # Hazm returns verb lemmas as "past#present"; keep the past stem and build the infinitive.
        if "#" in s and s.split("#")[0] != "":
            tokenized.append(s.split("#")[0] + "ن")
        else:
            tokenized.append(s.replace("#", ""))
    return tokenized
def pipeline_sentence(sentence, model, tokenizer):
    sentence = change_words(sentence)
    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    lemmatizer = Lemmatizer()
    sentence_lem = ' '.join([
        lemmatizer.lemmatize(x)
        for x in word_tokenize(normalizer.normalize(sentence))
    ])
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    sentence_ner = nlp(sentence)
    sentence_ner_lem = nlp(sentence_lem)
    return sentence_ner, sentence_ner_lem, sentence_lem, sentence
def preprocess(doc):
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    normalizer = Normalizer()
    doc = normalizer.normalize(doc)
    tokenized = re.split(' |-', doc)
    for w in tokenized[:]:
        if w in stopwords:
            tokenized.remove(w)
    stemmed = [stemmer.stem(w) for w in tokenized]
    new_words = [word for word in stemmed if word.isalnum()]
    lemmatized = [lemmatizer.lemmatize(w) for w in new_words]
    return lemmatized
def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
        normalized_statement = normalizer.normalize(dataset[i])
        # for sentence in sent_tokenize(dataset[i]):
        word_list = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(normalized_statement)
            if word not in stops
        ]
        statements.append(word_list)
    return statements
def clean(sentence):
    # trim leading digits
    ind = 0
    for i in range(len(sentence)):
        if sentence[i] in FARSI_DIGITS or sentence[i] in ENGLISH_DIGITS:
            ind += 1
        else:
            break
    sentence = sentence[ind:]

    # keep only Farsi letters and digits
    res = []
    for i in range(len(sentence)):
        if sentence[i] in FARSI_ALPHABET or sentence[i] in FARSI_DIGITS:
            res.append(sentence[i])
    sentence = "".join(res)

    normalizer = Normalizer()
    sentence = normalizer.normalize(sentence)
    return sentence
def process_text(text):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    text = text.replace("_", " ")
    text = text.replace(',', ' ')
    text = text.replace("\u220c", "")
    text = text.replace("\u200c", "")
    text = text.replace("-", "")
    # text = text.replace('/', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('.', ' ')
    text = text.replace("،", " ")
    text = text.replace("«", " ")
    text = text.replace("»", " ")

    # Keep only Persian-script tokens and the special markers <S>, </s>, ?, //;
    # everything else is dropped.
    t = re.findall(r"[\u0627-\u06FF]+|<S>|</s>|\?|//", text)

    lemmatizer = Lemmatizer()
    text = [lemmatizer.lemmatize(x) for x in t]
    return text
def prepare_line(line):
    global normalizer, incorrect, correct, unicode_redundant_chars, whitespace_chars, digits, punct_str, punctuations
    if normalizer is None:
        normalizer = Normalizer()
        incorrect, correct = loadCodings("TableCodings.txt")

    line = normalizer.normalize(line)
    line = CorrectCodingInLine(line, incorrect, correct)

    # put spaces around punctuation marks and digit runs
    pat = re.compile(r"([" + re.escape(punct_str) + "])")
    line = re.sub(pat, r" \1 ", line)
    pat = re.compile(r"([" + digits + "]+)")
    line = re.sub(pat, r" \1 ", line)

    # collapse newlines and other whitespace characters
    pat = re.compile(r"\n+")
    line = re.sub(pat, r" \n ", line)
    pat = re.compile("[" + whitespace_chars + "]+")
    line = re.sub(pat, r" ", line)

    line = line.strip()
    return line
def prepare_text(text, should_stem=True):
    normalizer = Normalizer()
    text = normalizer.normalize(text)
    tokenized = word_tokenize(text)

    # strip punctuation marks
    def fix_word(w):
        # for c in Text_cleaner.punct_list:
        #     w = w.replace(c, '')
        w = re.sub(Text_cleaner.punct_regex, '', w).replace('،', '')
        return "$" if w == "" else w

    punc_free = list(filter(lambda x: x != '$', map(fix_word, tokenized)))

    stemmer = Stemmer()
    if should_stem:
        stemmed_list = list(
            filter(lambda x: x != '', map(stemmer.stem, punc_free)))
    else:
        stemmed_list = punc_free
    return stemmed_list
def clean_tweet(tweet):
    tweet = str(tweet)
    tweet = tweet.lower()
    # drop the '#' symbol but keep the hashtag text for the cloud
    tweet = tweet.replace("#", "")
    tweet = remove_links(tweet)
    tweet = remove_mentions(tweet)
    tweet = remove_emoji(tweet)
    tweet = remove_punctuations(tweet)
    tweet = remove_reserved_words(tweet)
    normalizer = Normalizer()
    tweet = normalizer.normalize(tweet)
    # replace Arabic ي with Persian ی
    tweet = tweet.replace('ي', 'ی')
    # removes verbs such as میشود or نمیگویند, matched via the zero-width
    # non-joiner (\u200c) after the می/نمی prefix
    tweet = re.sub(r'ن?می[\u200c]\S+', '', tweet)
    tokens = word_tokenize(tweet)
    tokens = [token for token in tokens if not token.isdigit()]
    tokens = [token for token in tokens if token not in stopwords.persian]
    tokens = [token for token in tokens if token not in stopwords.english]
    return " ".join(tokens).strip()
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If the `persian_normalize` attribute has been set to True, normalizes `text` with the Hazm Normalizer.
    If the `include_numbers` attribute has been set to False, removes all Persian, English and Arabic numbers from `text`.
    Attention: this method will not remove stopwords from the input.

    :param text: The text we want to process
    :return: a dictionary. keys are words and values are the frequencies.
    """
    flags = (
        re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
        else 0)

    if self.remove_unhandled_utf_characters:
        text = WordCloudFa.unhandled_characters_regex.sub(r'', text)

    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)

    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        # Stopwords are already removed in WordCloudFa, so there is no need to pass them here.
        word_counts = unigrams_and_bigrams(words, [], self.normalize_plurals,
                                           self.collocation_threshold)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
def tok(dataTok):
    normalizer = Normalizer()
    tokenizer = WordTokenizer(join_verb_parts=False,
                              replace_links=True,
                              replace_IDs=True,
                              replace_numbers=True,
                              replace_hashtags=True)
    s = time.time()
    # dataTok.apply(lambda x: dataTok1.append(sent_tokenize(x)))
    for row in dataTok:
        _sents = sent_tokenize(row)
        _sents = stop_word(_sents)
        for _sent in _sents:
            _temp = _sent.replace(".", "").replace(",", "").replace(
                "،", "").replace("؛", "").strip()
            _wrds = normalizer.normalize(_temp)
            dataTok1.append(tokenizer.tokenize(_wrds))
    print("Data: ", len(dataTok1))
    e = time.time()
    print("Tokenize Done, Time: ", e - s, " !\n")
def __init__(self,
             corpus_path='resources/corpus.json',
             symbols_json_path='resources/symbols.json',
             persian_lang_path='resources/persian_lang.json',
             postagger_model_path='resources/postagger.model',
             max_keyword_num=10,
             min_keyword_occurrences=0.01,
             expand_corpus=False):
    self.postagger_model_path = postagger_model_path
    self.symbols_json_path = symbols_json_path
    self.corpus_path = corpus_path
    self.corpus = {}
    self.docs_num = 0
    self.expand_corpus = expand_corpus

    if self.corpus_path is not None:
        with open(corpus_path, encoding='utf-8') as json_file:
            corpus = json.load(json_file)
            self.corpus = corpus['corpus']
            self.docs_num = corpus['docs_num']

    with open(symbols_json_path, encoding='utf-8') as json_file:
        data = json.load(json_file)
        lst = list(data.values())
        self.all_symbols_list = [item for sublist in lst for item in sublist]

    with open(persian_lang_path, encoding='utf-8') as json_file:
        persian_lang = json.load(json_file)
        self.epic_keywords = persian_lang['epic_keywords']
        self.punctuations = persian_lang['punctuations']
        self.persian_alphabet = persian_lang['persian_alphabet']
        self.stop_words = persian_lang['stop_words']

    self.tagger = POSTagger(model=self.postagger_model_path)
    self.normalizer = Normalizer()
    self.max_keyword_num = max_keyword_num
    self.min_keyword_occurrences = min_keyword_occurrences
stopwords = stopwords_f.readlines()
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].replace("\n", "")

samewords_f = open('same_words.txt', 'r', encoding='utf-8')
samewords = samewords_f.readlines()
# samewords_tokens = word_tokenize(samewords_f.read(), "\n")
for i in range(len(samewords)):
    samewords[i] = samewords[i].replace("\n", "")
    samewords[i] = word_tokenize(samewords[i])
# print('same=' + str(samewords))
samewords_f.close()
stopwords_f.close()
# print('stop=' + str(stopwords))

lemmatizer = Lemmatizer()
normalizer = Normalizer()
# print(query_process("ما تو را کودک،. کتابهای به برای دوست داریم خودرو را هنوز اتومبیل"))


@app.route('/api/dataframe', methods=['GET'])
def df():
    return j


def find_in_dictionary(word, dictionary1):
    if word in dictionary1:
        # returns a list of docIDs in which the term occurs
        return dictionary1[word].copy()
    else:
        return []
def __init__(self):
    self.stemmer = Stemmer()
    self.normalizer = Normalizer()
    self.punctuations = string.punctuation