def __init__(self):
    self.preprocessed_docs = []
    self.normalizer = hazm.Normalizer()
    self.word_tokenizer = hazm.WordTokenizer()
    self.stemmer = hazm.Stemmer()
    self.stop_words = hazm.stopwords_list()
    # Characters to drop or map to their standard Persian equivalents:
    # stray symbols, Arabic diacritics, and Arabic letter variants.
    self.persian_garbage = {
        '÷': '', 'ٰ': '', '،': ' ', '؟': ' ', '؛': '',
        'َ': '', 'ُ': '', 'ِ': '', 'ّ': '', 'ٌ': '', 'ٍ': '', 'ْ': '',
        'ئ': 'ی', 'ي': 'ی', 'ة': 'ه', 'ء': '', 'ك': 'ک',
        'أ': 'ا', 'إ': 'ا', 'ؤ': 'و', 'آ': 'ا',
        '×': '', '٪': '', '٬': '', '●': ''
    }
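# A minimal sketch of how the persian_garbage mapping above might be applied; the
# method name clean_garbage is an assumption and is not part of the original class.
def clean_garbage(self, text):
    # Replace each stray or Arabic-variant character with its Persian equivalent (or drop it).
    for src, dst in self.persian_garbage.items():
        text = text.replace(src, dst)
    return text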
def normalizing_validation_set():
    with open('data/valid.json', 'r', encoding='utf-8') as json_file:
        validation_data = json.load(json_file)
    with open('data/most_frequent_words.json', 'r', encoding='utf-8') as json_file:
        # Use a set for O(1) membership tests while masking rare words.
        most_frequent_words = set(json.load(json_file))

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    all_sentence_tokens = []
    for text in validation_data:
        # Character-level normalization and cleanup.
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        # Unify sentence delimiters, surround them with spaces, and collapse double spaces.
        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')

        for sentence in sentence_tokenizer.tokenize(text):
            words = word_tokenizer.tokenize(sentence)
            # Drop the trailing delimiter and skip sentences that end up empty.
            if words and (words[-1] == '.' or words[-1] == '؟'):
                words = words[:-1]
            if len(words) == 0:
                continue

            # Mask out-of-vocabulary words with UNK and collapse consecutive
            # NUM/UNK placeholders into a single token.
            final_sentence_tokens = []
            for word in words:
                if word == 'NUM':
                    if not final_sentence_tokens or final_sentence_tokens[-1] != 'NUM':
                        final_sentence_tokens.append(word)
                elif word not in most_frequent_words:
                    if not final_sentence_tokens or final_sentence_tokens[-1] != 'UNK':
                        final_sentence_tokens.append('UNK')
                else:
                    final_sentence_tokens.append(word)
            all_sentence_tokens.append(final_sentence_tokens)

    with open('data/validation_sentences.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_sentence_tokens, json_file, ensure_ascii=False)
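# The cleanup helpers called above (remove_english_characters, mask_numbers,
# remove_punctuations, remove_diacritics, remove_emojis) are not shown in this
# section. A minimal regex-based sketch of what they might do; the exact character
# classes are assumptions, not the original implementations.
import re
import string

# Keep '.' and '?' out of the punctuation set so the sentence-delimiter handling still works.
_PUNCTUATION = ''.join(ch for ch in string.punctuation if ch not in '.?') + '«»'

def remove_english_characters(text):
    return re.sub(r'[A-Za-z]+', ' ', text)

def mask_numbers(text):
    # Replace runs of Latin or Persian digits with the NUM placeholder.
    return re.sub(r'[0-9۰-۹]+', ' NUM ', text)

def remove_punctuations(text):
    return re.sub('[' + re.escape(_PUNCTUATION) + ']+', ' ', text)

def remove_diacritics(text):
    # Strip Arabic diacritics (tanwin, fatha, damma, kasra, shadda, sukun).
    return re.sub(r'[\u064B-\u0652]', '', text)

def remove_emojis(text):
    # Drop characters outside the Basic Multilingual Plane, which covers most emoji.
    return ''.join(ch for ch in text if ord(ch) <= 0xFFFF)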
def stem_tokenize(self, record):
    """Tokenize a Persian document and stem each token.

    Parameters
    ----------
    record : str
        A document.

    Returns
    -------
    output : list of str
        The stemmed words.
    """
    output = []
    for w in hazm.WordTokenizer().tokenize(record):
        output.append(self._stem(w))
    return output
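# _stem is called above but not shown in this section. A plausible sketch, assuming
# it wraps the stemmer and stop-word list created in __init__; this is a guess at
# the original helper, not its actual implementation.
def _stem(self, word):
    # Leave stop words untouched; stem everything else.
    if word in self.stop_words:
        return word
    return self.stemmer.stem(word)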
def bag_of_word():
    hotel_pol = pd.read_csv('data/hotel-polarity.tsv', sep='\t')
    tokenizer = hazm.WordTokenizer()
from collections import Counter, defaultdict

import hazm
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

from services.index import positional_indexer, bigram_indexer

# English preprocessing tools.
en_stop_words = set(stopwords.words('english'))
en_tokenizer = RegexpTokenizer(r'\w+')
en_stemmer = PorterStemmer()
en_lemmatizer = nltk.WordNetLemmatizer()

# Persian preprocessing tools.
fa_normalizer = hazm.Normalizer()
fa_tokenizer = hazm.WordTokenizer()
fa_stemmer = hazm.Stemmer()
fa_lemmatizer = hazm.Lemmatizer()

# Shared module state for the document collection and its indices.
doc_id = 1
document_base = []
raw_document_base = []
en_common = []
fa_common = []
en_tokens = []
fa_tokens = []
doc_indices_by_type = defaultdict(list)
document_type = ['World', 'Sports', 'Business', 'Sci/Tech']


def documents_cnt():
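# A minimal sketch of how the English and Persian pipelines above might be combined
# to turn raw text into index terms. The helper name prepare_text and its signature
# are assumptions and are not part of the original module.
def prepare_text(text, lang='en'):
    if lang == 'en':
        tokens = en_tokenizer.tokenize(text.lower())
        tokens = [t for t in tokens if t not in en_stop_words]
        return [en_stemmer.stem(t) for t in tokens]
    # Persian branch: normalize first, then tokenize and stem.
    tokens = fa_tokenizer.tokenize(fa_normalizer.normalize(text))
    return [fa_stemmer.stem(t) for t in tokens]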
def normalizing_training_set():
    with open('data/train.json', 'r', encoding='utf-8') as json_file:
        training_data = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    word_frequency = {}
    all_sentence_tokens = []
    for text in training_data:
        # Character-level normalization and cleanup (same pipeline as the validation set).
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        # Unify sentence delimiters, surround them with spaces, and collapse double spaces.
        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')

        for sentence in sentence_tokenizer.tokenize(text):
            words = word_tokenizer.tokenize(sentence)
            # Drop the trailing delimiter and skip sentences that end up empty.
            if words and (words[-1] == '.' or words[-1] == '؟'):
                words = words[:-1]
            if len(words) == 0:
                continue
            for word in words:
                if word not in word_frequency:
                    word_frequency[word] = 0
                word_frequency[word] += 1
            all_sentence_tokens.append(words)

    with open('data/words_frequency.json', 'w', encoding='utf-8') as json_file:
        json.dump(word_frequency, json_file, ensure_ascii=False)

    # Keep the 10,000 most frequent words as the vocabulary; everything else is masked.
    frequency_rank_threshold = 10000
    most_frequent_words = sorted(word_frequency, key=word_frequency.get,
                                 reverse=True)[:frequency_rank_threshold]
    # Set for O(1) membership tests; the list is kept for the JSON dump below.
    most_frequent_words_set = set(most_frequent_words)

    final_all_sentence_tokens = []
    for sentence_tokens in all_sentence_tokens:
        # Mask out-of-vocabulary words with UNK and collapse consecutive
        # NUM/UNK placeholders into a single token.
        final_sentence_tokens = []
        for token in sentence_tokens:
            if token == 'NUM':
                if not final_sentence_tokens or final_sentence_tokens[-1] != 'NUM':
                    final_sentence_tokens.append(token)
            elif token not in most_frequent_words_set:
                if not final_sentence_tokens or final_sentence_tokens[-1] != 'UNK':
                    final_sentence_tokens.append('UNK')
            else:
                final_sentence_tokens.append(token)
        final_all_sentence_tokens.append(final_sentence_tokens)

    with open('data/training_sentences.json', 'w', encoding='utf-8') as json_file:
        json.dump(final_all_sentence_tokens, json_file, ensure_ascii=False)
    with open('data/most_frequent_words.json', 'w', encoding='utf-8') as json_file:
        json.dump(most_frequent_words, json_file, ensure_ascii=False)
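# normalizing_validation_set() above reads data/most_frequent_words.json, which is
# written here, so the training pass has to run before the validation pass. A
# minimal driver sketch; the __main__ guard is an assumption, not original code.
if __name__ == '__main__':
    normalizing_training_set()
    normalizing_validation_set()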
def tokenize(self, record):
    """Split a document into word tokens."""
    return hazm.WordTokenizer().tokenize(record)
import pandas as pd
import hazm
import matplotlib.pyplot as plt
from stopwords import all_stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

normalizer = hazm.Normalizer(persian_numbers=True)
tokenizer = hazm.WordTokenizer(replace_numbers=True, replace_hashtags=True)
lemmatizer = hazm.Lemmatizer()
tagger = hazm.POSTagger(model='resources/postagger.model')


def clean_text(sentence):
    """Normalize, tokenize, POS-tag, and lemmatize a sentence, then re-join it."""
    sentence = normalizer.normalize(sentence)
    tokens = tokenizer.tokenize(sentence)
    tagged = tagger.tag(tokens)
    lemmas = [lemmatizer.lemmatize(word, pos=tag) for word, tag in tagged]
    return " ".join(lemmas)


data = pd.read_csv('Corpus/persica_org_english_cat.csv')
data = data[['title', 'text', 'category2']].dropna()
data['cleaned_text'] = data['title'] + " " + data['text']

# Concatenate all documents into a single string.
doc = " ".join(data['cleaned_text'])
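# The imports above (LabelEncoder, train_test_split, Pipeline, TfidfVectorizer,
# SGDClassifier, classification_report) point to a classification step that is not
# shown in this snippet. A minimal sketch of how it might be wired together,
# assuming category2 holds the labels; the hyperparameters are placeholders, not
# the original settings.
data['cleaned_text'] = data['cleaned_text'].apply(clean_text)
labels = LabelEncoder().fit_transform(data['category2'])
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], labels, test_size=0.2, random_state=42)

model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(all_stopwords))),
    ('clf', SGDClassifier(random_state=42)),
])
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))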