def make_fa_tokenize(lang_dir: Path) -> typing.Optional[TokenizeFunc]:
    """Tokenize Persian/Farsi"""
    try:
        import hazm
    except ImportError:
        _LOGGER.warning("hazm is highly recommended for language 'fa'")
        _LOGGER.warning("pip install 'hazm>=0.7.0'")
        return None

    normalizer = hazm.Normalizer()

    # Load part of speech tagger
    model_path = lang_dir / "postagger.model"
    if not model_path.is_file():
        _LOGGER.warning("Missing model: %s", model_path)
        return None

    _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
    tagger = hazm.POSTagger(model=str(model_path))

    def do_tokenize(text: str, **kwargs) -> typing.List[typing.List[Token]]:
        """Normalize, tokenize, and recognize part of speech"""
        sentences_tokens = []
        sentences = hazm.sent_tokenize(normalizer.normalize(text))
        for sentence in sentences:
            sentence_tokens = []
            for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                sentence_tokens.append(Token(text=word, pos=pos))

            sentences_tokens.append(sentence_tokens)

        return sentences_tokens

    return do_tokenize
def stem_data(dat):
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sentences = hazm.sent_tokenize(dat)

    lemmatizer = hazm.Lemmatizer()
    stemmer = hazm.Stemmer()

    words = []
    for sentence in sentences:
        tagged = list(tagger.tag(hazm.word_tokenize(sentence)))

        # Drop stop words before lemmatizing/stemming
        filtered = [token for token in tagged if token[0] not in stop_words]

        for token in filtered:
            stemmed = lemmatizer.lemmatize(token[0], pos=token[1])
            stemmed = stemmer.stem(stemmed)
            if len(stemmed) > 0 and '#' not in stemmed:
                words.append(stemmed)

    return words
def preProcessing(self, doc, level=0):
    """
    Remove punctuation and some common Persian prepositions and pronouns,
    and return a list of words.
    """
    junkList = [
        ".", "-", "]", "[", "،", "؛", ":", ")", "(", "!", "؟", "»", "«", "ْ"
    ]
    junkWords = [
        "که", "از", "با", "برای", "با", "به", "را", "هم", "و", "در", "تا",
        "یا", "هر", "می", "بر"
    ]
    pronouns = [
        "من", "تو", "او", "ما", "شما", "ایشان", "آنها", "اینها", "آن", "این",
        "اونجا", "آنجا", "انجا", "اینها", "آنها", "اینکه"
    ]

    for char in junkList:
        doc = doc.replace(char, " ")

    result = []
    doc = hazm.Normalizer().normalize(doc)
    doc = hazm.word_tokenize(doc)
    for word in doc:
        word = word.strip()  # str.strip() returns a new string; assign the result
        if word not in junkWords and word not in pronouns:
            result.append(word)

    return result
def not_map_farsnet_kg_ontology():
    input_ontology_filename = DataUtils.join(Config.farsnet_ontology,
                                             Config.farsnet_ontology_filename)
    input_farsnet_map_ontology_filename = DataUtils.join(
        Config.farsnet_ontology, Config.farsnet_map_ontology_filename)
    output_farsnet_not_map_ontology_filename = DataUtils.join(
        Config.farsnet_ontology, Config.farsnet_not_map_ontology_filename)

    normalizer = hazm.Normalizer()
    flag_find = False
    item = 'word'
    with open(input_ontology_filename, 'r') as input_file_ontology, \
            open(output_farsnet_not_map_ontology_filename, 'a') as output_file:
        csv_reader_ontology, csv_writer = csv.reader(
            input_file_ontology), csv.writer(output_file)
        for line_ontology in csv_reader_ontology:
            if not flag_find:
                csv_writer.writerow([item])
                print(item)

            item = normalizer.normalize(line_ontology[0])
            flag_find = False
            with open(input_farsnet_map_ontology_filename, 'r') as input_file_map:
                csv_reader_graph = csv.reader(input_file_map)
                for line_map in csv_reader_graph:
                    if item == normalizer.normalize(line_map[1]):
                        flag_find = True
                        break
def get_ambiguaty_abstract():
    abstract_filename = os.listdir(Config.extracted_texts_dir)
    input_ambiguate_word_filename = join(
        Config.article_names_dir, Config.farsnet_ambiguate_word_filename)
    output_ambiguate_abstract_filename = join(
        Config.article_names_dir, Config.farsnet_ambiguate_abstract_filename)

    temp_list = []
    count = 0
    max_number = 0
    min_number = 1000
    normalizer = hazm.Normalizer()

    with open(output_ambiguate_abstract_filename, 'w') as output_file:
        csv_writer = csv.writer(output_file)
        for filename in abstract_filename:
            # if count == 1:
            #     break
            count += 1
            print('file ' + str(count) + ' is running ' + filename)
            dict_abstract = DataUtils.load_json(Config.extracted_texts_dir, filename)
            for abstract_item in dict_abstract:
                with open(input_ambiguate_word_filename, 'r') as ambiguate_word:
                    csv_reader = csv.reader(ambiguate_word)
                    for line in csv_reader:
                        item = normalizer.normalize(line[1])
                        if item == abstract_item:
                            print('find ' + line[1] + ' in file.')
                            del temp_list[:]
                            temp_list.append(line[0])
                            temp_list.append(normalizer.normalize(line[1]))
                            temp_list.append(line[2])
                            temp_list.append(normalizer.normalize(line[3]))
                            temp_list.append(normalizer.normalize(line[4]))
                            temp_list.append(normalizer.normalize(line[5]))
                            temp_list.append(
                                normalizer.normalize(dict_abstract[abstract_item]))

                            sentence_snapshot = str(line[3]).replace(',', ' ').replace('،', ' ') + ' '
                            gloss_sentence = str(line[4]).replace(',', ' ').replace('،', ' ') + ' '
                            example = str(line[5]).replace(',', ' ').replace('،', ' ') + ' '

                            sentence1 = sentence_snapshot + gloss_sentence + example
                            sentence2 = str(temp_list[6]).replace(',', ' ').replace('،', ' ').replace('.', ' ')
                            diff = similar(sentence1, sentence2)
                            if diff > max_number:
                                max_number = diff
                            if diff < min_number:
                                min_number = diff

                            temp_list.append(diff)
                            csv_writer.writerow(temp_list)

    return [max_number, min_number]
def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 fix_unicode=True,
                 to_ascii=False,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=False,
                 no_digits=False,
                 no_currency_symbols=True,
                 no_punct=False,
                 replace_with_url="",
                 replace_with_email="",
                 replace_with_phone_number="",
                 replace_with_number="",
                 replace_with_digit="0",
                 replace_with_currency_symbol="",
                 )

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks)
    weird_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               # u"\u200c"
                               u"\u2068"
                               u"\u2067"
                               "]+", flags=re.UNICODE)
    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)

    return text
def make_fa_tokenize() -> TOKENIZE_FUNC:
    """Tokenize Persian/Farsi"""
    import hazm

    normalizer = hazm.Normalizer()

    # Load part of speech tagger
    model_path = _DATA_DIR / "fa" / "postagger.model"
    if not model_path.is_file():
        # Unzip
        model_gzip_path = Path(str(model_path) + ".gz")
        if model_gzip_path.is_file():
            _LOGGER.debug("Unzipping %s", model_gzip_path)
            with open(model_path, "wb") as out_file:
                with gzip.open(model_gzip_path, "rb") as in_file:
                    shutil.copyfileobj(in_file, out_file)

    _LOGGER.debug("Using hazm tokenizer (model=%s)", model_path)
    tagger = hazm.POSTagger(model=str(model_path))

    def do_tokenize(text: str) -> typing.List[typing.List[Token]]:
        """Normalize, tokenize, and recognize part of speech"""
        sentences_tokens = []
        sentences = hazm.sent_tokenize(normalizer.normalize(text))
        for sentence in sentences:
            sentence_tokens = []
            for word, pos in tagger.tag(hazm.word_tokenize(sentence)):
                sentence_tokens.append(Token(text=word, pos=pos))

            sentences_tokens.append(sentence_tokens)

        return sentences_tokens

    return do_tokenize
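A minimal usage sketch (not part of the original source): assuming the surrounding module provides _DATA_DIR, Token, and the logging setup used above, and that the POS-tagger model is in place, the returned callable yields one list of (text, pos) tokens per sentence.

# Hypothetical usage of make_fa_tokenize(); Token and the module globals above
# are assumed to be available.
tokenize = make_fa_tokenize()
for sentence_tokens in tokenize("سلام. حال شما چطور است؟"):
    print([(token.text, token.pos) for token in sentence_tokens])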
def prepareText(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return words
def countTextWords(text):
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    tokens = hazm.word_tokenize(text)
    stemmer = hazm.Stemmer()
    words = [stemmer.stem(token) for token in tokens]
    return len(words)
def __init__(self):
    self.preprocessed_docs = []
    self.normalizer = hazm.Normalizer()
    self.word_tokenizer = hazm.WordTokenizer()
    self.stemmer = hazm.Stemmer()
    self.stop_words = hazm.stopwords_list()
    self.persian_garbage = {
        u'÷': u'',
        u'ٰ': u'',
        u'،': ' ',
        u'؟': ' ',
        u'؛': '',
        u'َ': '',
        u'ُ': '',
        u'ِ': '',
        u'ّ': '',
        u'ٌ': '',
        u'ٍ': '',
        u'ئ': u'ی',
        u'ي': u'ی',
        u'ة': u'ه',
        u'ء': u'',
        u'ك': u'ک',
        u'ْ': u'',
        u'أ': u'ا',
        u'إ': u'ا',
        u'ؤ': u'و',
        u'×': u'',
        u'٪': u'',
        u'٬': u'',
        u'آ': u'ا',
        u'●': u''
    }
def __call__(self, text):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    # text = ''.join(char for char in unicodedata.normalize('NFD', text)
    #                if unicodedata.category(char) != 'Mn')  # Strip accents
    # text = re.sub("[^ a-z'.,?!\-]", "", text)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # tokenization
    words = hazm.word_tokenize(text)
    # tokens = pos_tag(words)  # tuples of (word, tag)

    # steps
    prons = []
    for word in words:
        if not any(letter in word for letter in self.graphemes):
            pron = [word]
        # elif word in self.homograph2features:  # Check homograph
        #     pron1, pron2, pos1 = self.homograph2features[word]
        #     if pos.startswith(pos1):
        #         pron = pron1
        #     else:
        #         pron = pron2
        elif word in self.tihu:  # lookup tihu dict
            pron = self.tihu[word]
        else:  # predict for oov
            pron = self.predict(word)

        prons.extend(pron)
        prons.extend([" "])

    return prons[:-1]
def normalizing_validation_set():
    with open('data/valid.json', 'r', encoding='utf-8') as json_file:
        validation_data = json.load(json_file)
    with open('data/most_frequent_words.json', 'r', encoding='utf-8') as json_file:
        most_frequent_words = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    all_sentence_tokens = []
    for text in validation_data:
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')

        sentences = sentence_tokenizer.tokenize(text)
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            if words[-1] == '.' or words[-1] == '؟':
                words = words[:-1]
            if len(words) == 0:
                continue

            final_sentence_tokens = []
            for ind, word in enumerate(words):
                if word == 'NUM':
                    if len(final_sentence_tokens) == 0 or final_sentence_tokens[-1] != 'NUM':
                        final_sentence_tokens.append(word)
                elif word not in most_frequent_words:
                    if len(final_sentence_tokens) == 0 or final_sentence_tokens[-1] != 'UNK':
                        final_sentence_tokens.append(word)
                else:
                    final_sentence_tokens.append(word)

            all_sentence_tokens.append(final_sentence_tokens)

    with open('data/validation_sentences.json', 'w') as json_file:
        json.dump(all_sentence_tokens, json_file, ensure_ascii=False)
def bagify(doc):
    normalizer = hazm.Normalizer()
    tokenize = hazm.word_tokenize
    word_list = re.sub(r"(&...;|&....;|(\d))|'|{|}|!", " ", doc)
    # stemmer = hazm.Stemmer()
    tokens = tokenize(normalizer.normalize(word_list))
    # tokens = [stemmer.stem(x) for x in tokens]
    doc_list = [x for x in tokens if x not in stop_words]
    doc_set = set(doc_list)
    doc_bag = Counter({k: doc_list.count(k) for k in doc_set})
    return doc_bag
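A hypothetical call to bagify; the stop_words name read inside the function is assumed to be a module-level collection such as hazm.stopwords_list(), and hazm, re, and Counter are assumed to be imported in the same module.

# Hypothetical usage sketch for bagify(); stop_words is the module-level
# global the function expects.
stop_words = set(hazm.stopwords_list())
bag = bagify("کتاب خوب، کتاب مفید است")
print(bag.most_common(3))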
def pre_process_data():
    f_1 = open("../../Data/label1.txt", "r", encoding="utf-8")
    f1 = f_1.readlines()
    f1_preproc = open("../label1.txt", "w+", encoding="utf-8")
    for j in f1:
        normalized_f1 = Hazm1.Normalizer().normalize(j)
        tokenized_f1 = Hazm1.word_tokenize(normalized_f1)
        for i in tokenized_f1:
            f1_preproc.write(i)
            f1_preproc.write(' ')
        f1_preproc.write('\n')

    f_2 = open("../../Data/label2.txt", "r", encoding="utf-8")
    f2 = f_2.readlines()
    f2_preproc = open("../label2.txt", "w+", encoding="utf-8")
    for j in f2:
        normalized_f2 = Hazm2.Normalizer().normalize(j)
        tokenized_f2 = Hazm2.word_tokenize(normalized_f2)
        for i in tokenized_f2:
            f2_preproc.write(i)
            f2_preproc.write(' ')
        f2_preproc.write('\n')
def textNormalizer(lousyCollection):
    docs = list()
    normalizer = hz.Normalizer()
    lemmatizer = hz.Lemmatizer()
    stemmer = hz.Stemmer()

    for i in range(len(lousyCollection)):
        normalized = normalizer.normalize(lousyCollection[i])
        docs.append(delete_Punc(normalized))

    # Replace each document with its stemmed, lemmatized tokens. Indexing by
    # position avoids the list.index() lookups of the original, which pick the
    # wrong element when duplicate tokens or documents occur.
    for doc_index, doc in enumerate(docs):
        tokens = hz.word_tokenize(doc)
        for token_index, token in enumerate(tokens):
            tokens[token_index] = lemmatizer.lemmatize(stemmer.stem(token))
        docs[doc_index] = tokens

    return docs
def __init__(self, feature_set, orientations=None, language='english'):
    self.language = language
    self.normalizer[language] = hazm.Normalizer()
    if language == 'persian':
        self.stopwords[language] = hazm.stopwords_list()
        self.regex_words[language] = r"[\w']+|[.,!?;،؟؛]"
    else:
        self.stopwords[language] = set(stopwords.words('english'))
        self.regex_words[language] = r"[\w']+|[.,!?;]"

    if orientations:
        self.orientations = orientations

    self.feature_set = feature_set
    self.weights = {}
    self.hash_dictionary[self.language] = {}
def text_normalazation(self, raw_text):
    normalizer = hazm.Normalizer()
    clean_text = normalizer.normalize(raw_text)
    # Collapse line breaks, repeated spaces, and non-breaking spaces.
    # Work on clean_text (not raw_text) so the hazm normalization is kept.
    clean_text = clean_text.replace('\n', ' ').replace('\r', '')
    clean_text = re.sub(' +', ' ', clean_text)
    clean_text = clean_text.replace(u'\xa0', u' ')
    return clean_text
def similar(s1, s2):
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)

    stop_words = hazm.stopwords_list()
    list_s1 = [word for word in s1.split(" ") if word not in stop_words]
    list_s2 = [word for word in s2.split(" ") if word not in stop_words]

    stemmer = hazm.Stemmer()
    stem_s1 = [stemmer.stem(word) for word in list_s1]

    same_words = set.intersection(set(list_s1), set(list_s2))
    return len(same_words)
def preProcessingVW(self, doc):
    junkList = [
        ".", "-", "]", "[", "،", "؛", ":", ")", "(", "!", "؟", "»", "«", "ْ"
    ]
    junkWords = [
        "که", "از", "با", "برای", "با", "به", "را", "هم", "و", "در", "تا",
        "یا", "هر", "می", "بر"
    ]
    pronouns = [
        "من", "تو", "او", "ما", "شما", "ایشان", "آنها", "اینها", "آن", "این",
        "اونجا", "آنجا", "انجا", "اینها", "آنها", "اینکه"
    ]

    for char in junkList:
        doc = doc.replace(char, "")

    doc = doc.strip()  # str.strip() returns a new string; assign the result
    doc = hazm.Normalizer().normalize(doc)
    return doc
def __call__(self, text, tidy=False, secret=False):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    # text = ''.join(char for char in unicodedata.normalize('NFD', text)
    #                if unicodedata.category(char) != 'Mn')  # Strip accents
    # text = re.sub("[^ a-z'.,?!\-]", "", text)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # tokenization
    words = hazm.word_tokenize(text)
    # tokens = pos_tag(words)  # tuples of (word, tag)

    # steps
    prons = []
    for word in words:
        if not any(letter in word for letter in self.graphemes):
            pron = [word]
        # elif word in self.homograph2features:  # Check homograph
        #     pron1, pron2, pos1 = self.homograph2features[word]
        #     if pos.startswith(pos1):
        #         pron = pron1
        #     else:
        #         pron = pron2
        elif word in self.tihu:  # lookup tihu dict
            pron = [self.tihu[word].replace(' ', '')] if secret else [' ', self.tihu[word], ' ']
        else:  # predict for oov
            pron = self.predict(word)

        prons.extend(pron)
        prons.extend([" "])

    result = ''.join(prons[:-1])
    if tidy:
        return Persian_g2p_converter.convert_from_native_to_good(result)
    return result
def __init__(self, mask=None, size=900, stop_words_addr=default_stop_words_path,
             mask_addr=None):
    self.hazm_normalizer = hazm.Normalizer()
    self.parsivar_normalizer = parsivar.Normalizer()
    self.stemmer = hazm.Stemmer()
    self.lemmatizer = hazm.Lemmatizer()
    self.stop_words = set(hazm.stopwords_list(stop_words_addr))
    mask = np.array(Image.open(mask_addr)) if mask_addr is not None else None
    self.generator = WordCloud(width=size,
                               height=size,
                               include_numbers=False,
                               persian_normalize=False,
                               collocations=True,
                               mask=mask,
                               background_color='white')
def word_counter(self, text: str) -> (float, dict):
    text = text.lower()
    text = text.translate(str.maketrans(
        {'#': ' ', '$': ' ', '/': ' ', '+': ' ', '=': ' ', ':': ' ', ',': ' ',
         ';': ' ', '؛': ' ', '،': ' ', '.': ' ', '!': ' ', '؟': ' ', '?': ' ',
         '«': ' ', '»': ' ', '(': ' ', ')': ' ', '_': ' ', '-': ' ', '@': ' '}))
    text = hazm.Normalizer().normalize(text)
    text = hazm.word_tokenize(text)

    stemmer = hazm.Stemmer()
    keywords_dic = {word: 0 for word in self.keywords.keys()}
    value = 0.0
    for i in range(len(text)):
        stemmed_word = stemmer.stem(text[i])
        if stemmed_word in keywords_dic:
            keywords_dic[stemmed_word] += 1
            if keywords_dic[stemmed_word] == 1:  # count each word only once
                value += self.keywords[stemmed_word]
        if stemmed_word in self.filter_words:
            return 0, {}

    return value, keywords_dic
def text_to_tokens(
        self, text: str
) -> typing.Iterable[typing.Tuple[typing.List[str], typing.List[Token]]]:
    """
    Process text into words and sentence tokens using hazm.

    Returns: (original_words, sentence_tokens) for each sentence
    """
    try:
        import hazm
    except ImportError:
        _LOGGER.warning("hazm is highly recommended for language 'fa'")
        _LOGGER.warning("pip install 'hazm>=0.7.0'")

        # Fall back to parent implementation and stop here
        yield from super().text_to_tokens(text)
        return

    # Load normalizer once and cache it on the instance
    if not hasattr(self, "normalizer"):
        setattr(self, "normalizer", hazm.Normalizer())

    # Load part of speech tagger once and cache it on the instance
    if not hasattr(self, "tagger"):
        model_path = self.lang_dir / "postagger.model"
        setattr(self, "tagger", hazm.POSTagger(model=str(model_path)))

    sentences = hazm.sent_tokenize(self.normalizer.normalize(text))
    for sentence in sentences:
        original_words = []
        sentence_tokens = []
        for word, pos in self.tagger.tag(hazm.word_tokenize(sentence)):
            original_words.append(word)
            sentence_tokens.append(
                Token(text=word,
                      features={TokenFeatures.PART_OF_SPEECH: pos}))

        yield original_words, sentence_tokens
def map_farsnet_kg_ontology(input_filename):
    input_ontology_filename = DataUtils.join(Config.farsnet_ontology,
                                             Config.farsnet_ontology_filename)
    output_farsnet_map_ontology_filename = DataUtils.join(
        Config.farsnet_ontology, Config.farsnet_map_ontology_filename)

    normalizer = hazm.Normalizer()
    print('input file ' + input_filename)
    with open(input_ontology_filename, 'r') as input_file_ontology, \
            open(output_farsnet_map_ontology_filename, 'a') as output_file:
        csv_reader_ontology, csv_writer = csv.reader(
            input_file_ontology), csv.writer(output_file)
        for line_ontology in csv_reader_ontology:
            with open(input_filename, 'r') as input_file_graph:
                csv_reader_graph = csv.reader(input_file_graph)
                for line_graph in csv_reader_graph:
                    item = normalizer.normalize(line_graph[1])
                    if normalizer.normalize(line_ontology[0]) == item:
                        print(item)
                        csv_writer.writerow(
                            [line_graph[0], item, line_graph[3]])
def preprocess_farsi(text):
    prohibitedWords = ['[[', ']]', '{{', '}}', '{|', '|', '*', '==', '=', "'''", '_']
    big_regex = re.compile('|'.join(map(re.escape, prohibitedWords)))
    new_text = big_regex.sub(" ", text)

    ### Remove English characters
    new_text = re.sub(r'[a-zA-Z]', '', new_text)

    ### Remove punctuation
    new_text = re.sub(r'[^\w\s]', ' ', new_text)

    normalizer = hazm.Normalizer(remove_extra_spaces=True,
                                 persian_style=True,
                                 persian_numbers=True,
                                 remove_diacritics=True,
                                 affix_spacing=True,
                                 token_based=False,
                                 punctuation_spacing=True)
    new_text = normalizer.normalize(new_text)

    ### Remove numbers
    new_text = re.sub(r'[۱۲۳۴۵۶۷۸۹۰]', ' ', new_text)

    ### Not in HAZM
    # new_text = new_text.replace('گی', 'ه')

    tokens = hazm.word_tokenize(new_text)
    stemmer = hazm.Stemmer()
    tokens = [word.replace('\u200c', '') for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [word for word in tokens if word != '']
    return tokens
import hazm
from cleantext import clean
import re

normalizer = hazm.Normalizer()


def cleanize(text):
    """
    Clean the text from redundant and useless items
    """
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=False,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit=".",
        replace_with_currency_symbol=""
    )

    text = text.strip()
    text = re.sub(r"\s+", " ", text)

    # Assumed final step: apply the module-level hazm normalizer and return
    # the cleaned text.
    text = normalizer.normalize(text)
    return text
def CleanPersianText(text):
    _normalizer = hazm.Normalizer()
    text = _normalizer.normalize(text)
    return text
# coding: utf-8

# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()
lexicon_file_name = 'final_lexi'
data_path = './data/'
lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer
def normalize(self, input):
    return hazm.Normalizer().normalize(input)
def normalize_text(text):
    normalizer = hazm.Normalizer()
    normalized_text = normalizer.normalize(text)
    return normalized_text
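For context, a small standalone sketch of what hazm.Normalizer does with its default settings: it unifies Arabic/Persian character variants and fixes spacing around affixes. The exact output depends on the hazm version installed.

import hazm

normalizer = hazm.Normalizer()
# Character refinement and spacing correction; output varies by hazm version.
print(normalizer.normalize("اصلاح نويسه ها و استفاده از نیم‌فاصله پردازش را آسان مي كند"))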