Example #1
    def __init__(self):
        self.preprocessed_docs = []
        self.normalizer = hazm.Normalizer()
        self.word_tokenizer = hazm.WordTokenizer()
        self.stemmer = hazm.Stemmer()
        self.stop_words = hazm.stopwords_list()
        # Arabic-script variants, diacritics, and stray symbols to replace or drop
        self.persian_garbage = {
            u'÷': u'',
            u'ٰ': u'',
            u'،': ' ',
            u'؟': ' ',
            u'؛': '',
            u'َ': '',
            u'ُ': '',
            u'ِ': '',
            u'ّ': '',
            u'ٌ': '',
            u'ٍ': '',
            u'ئ': u'ی',
            u'ي': u'ی',
            u'ة': u'ه',
            u'ء': u'',
            u'ك': u'ک',
            u'ْ': u'',
            u'أ': u'ا',
            u'إ': u'ا',
            u'ؤ': u'و',
            u'×': u'',
            u'٪': u'',
            u'٬': u'',
            u'آ': u'ا',
            u'●': u''
        }
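
The persian_garbage table above pairs each unwanted Arabic-script character or diacritic with its Persian replacement (or with deletion). A minimal sketch of how such a map could be applied before tokenization; the standalone helper and its name are assumptions, not part of the original class:

import hazm

# Hypothetical helper, not in the original class: apply a character map such as
# persian_garbage, then run hazm's normalizer over the cleaned text.
def clean_characters(text, char_map):
    for bad_char, replacement in char_map.items():
        text = text.replace(bad_char, replacement)
    return hazm.Normalizer().normalize(text)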
Example #2
def normalizing_validation_set():
    with open('data/valid.json', 'r', encoding='utf-8') as json_file:
        validation_data = json.load(json_file)

    with open('data/most_frequent_words.json', 'r',
              encoding='utf-8') as json_file:
        most_frequent_words = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    all_sentence_tokens = []
    for text in validation_data:
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')
        sentences = sentence_tokenizer.tokenize(text)

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)

            # drop the sentence-final '.' or '؟' token (guard against empty sentences)
            if words and (words[-1] == '.' or words[-1] == '؟'):
                words = words[:-1]

            if len(words) == 0:
                continue

            final_sentence_tokens = []
            for word in words:
                if word == 'NUM':
                    # collapse runs of masked numbers into a single NUM token
                    if not final_sentence_tokens or final_sentence_tokens[-1] != 'NUM':
                        final_sentence_tokens.append(word)
                elif word not in most_frequent_words:
                    # map out-of-vocabulary words to UNK and collapse consecutive UNKs
                    if not final_sentence_tokens or final_sentence_tokens[-1] != 'UNK':
                        final_sentence_tokens.append('UNK')
                else:
                    final_sentence_tokens.append(word)

            all_sentence_tokens.append(final_sentence_tokens)

    with open('data/validation_sentences.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(all_sentence_tokens, json_file, ensure_ascii=False)
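
This function reads data/most_frequent_words.json, which is written by the training-set pass shown in Example #6 below, so the two would have to run in that order. A minimal driver sketch; the __main__ guard is an assumption, not part of the original module:

# Presumed run order, not shown in the original snippets: the training pass
# produces data/most_frequent_words.json, which the validation pass then reads.
if __name__ == '__main__':
    normalizing_training_set()
    normalizing_validation_set()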
Example #3
    def stem_tokenize(self, record):
        """Tokenize persian words and then stem each of them and return a list of words

        Parameters
        ----------
        record : str
            a document

        Returns
        -------
        output : list of string
            List of word
        """
        output = []
        for w in hazm.WordTokenizer().tokenize(record):
            output.append(self._stem(w))
        return output
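
A usage sketch for the method above; the preprocessor instance name is hypothetical, and it assumes the enclosing class (not shown here) provides the _stem helper, presumably wrapping hazm.Stemmer().stem:

# Hypothetical usage; the enclosing class and its _stem helper are not shown
# in this excerpt.
stems = preprocessor.stem_tokenize('کتاب‌های زیادی خوانده بودم')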
Example #4
def bag_of_word():
    hotel_pol = pd.read_csv('data/hotel-polarity.tsv', sep='\t')
    tokenizer = hazm.WordTokenizer()
Example #5
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import hazm
from services.index import positional_indexer, bigram_indexer
from collections import Counter
import pandas as pd
from collections import defaultdict

en_stop_words = set(stopwords.words('english'))
en_tokenizer = RegexpTokenizer(r'\w+')
en_stemmer = PorterStemmer()
en_lemmatizer = nltk.WordNetLemmatizer()
fa_normalizer = hazm.Normalizer()
fa_tokenizer = hazm.WordTokenizer()
fa_stemmer = hazm.Stemmer()
fa_lemmatizer = hazm.Lemmatizer()

doc_id = 1
document_base = []
raw_document_base = []
en_common = []
fa_common = []
en_tokens = []
fa_tokens = []
doc_indices_by_type = defaultdict(list)
document_type = ['World', 'Sports', 'Business', 'Sci/Tech']


def documents_cnt():
    ...  # body truncated in this excerpt
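
The module-level objects above set up parallel English (nltk) and Persian (hazm) toolchains. A short sketch of how the fa_* objects might be chained for a single document; the helper name is an assumption, not part of the original module:

# Hypothetical helper, not in the original snippet: normalize, tokenize, and
# stem a Persian document with the fa_* tools defined above.
def preprocess_fa(text):
    text = fa_normalizer.normalize(text)
    return [fa_stemmer.stem(token) for token in fa_tokenizer.tokenize(text)]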
Example #6
def normalizing_training_set():
    with open('data/train.json', 'r', encoding='utf-8') as json_file:
        training_data = json.load(json_file)

    parsivar_normalizer = parsivar.Normalizer()
    hazm_normalizer = hazm.Normalizer()
    sentence_tokenizer = hazm.SentenceTokenizer()
    word_tokenizer = hazm.WordTokenizer(join_verb_parts=False)

    word_frequency = {}
    all_sentence_tokens = []
    for text in training_data:
        text = parsivar_normalizer.sub_alphabets(text)
        text = hazm_normalizer.normalize(text)
        text = remove_english_characters(text)
        text = mask_numbers(text)
        text = remove_punctuations(text)
        text = remove_diacritics(text)
        text = remove_emojis(text)

        text = text.replace('\n', ' ')
        text = text.replace('?', '؟')
        text = text.replace('؟', ' ؟ ')
        text = text.replace('.', ' . ')
        text = text.replace('  ', ' ')
        sentences = sentence_tokenizer.tokenize(text)

        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)

            # drop the sentence-final '.' or '؟' token (guard against empty sentences)
            if words and (words[-1] == '.' or words[-1] == '؟'):
                words = words[:-1]

            if len(words) == 0:
                continue

            for word in words:
                if word not in word_frequency:
                    word_frequency[word] = 0
                word_frequency[word] += 1

            all_sentence_tokens.append(words)

    with open('data/words_frequency.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(word_frequency, json_file, ensure_ascii=False)

    frequency_rank_threshold = 10000
    most_frequent_words = sorted(word_frequency,
                                 key=word_frequency.get,
                                 reverse=True)[:frequency_rank_threshold]

    final_all_sentence_tokens = []
    for sentence_tokens in all_sentence_tokens:
        final_sentence_tokens = []
        for token in sentence_tokens:
            if token == 'NUM':
                # collapse runs of masked numbers into a single NUM token
                if not final_sentence_tokens or final_sentence_tokens[-1] != 'NUM':
                    final_sentence_tokens.append(token)
            elif token not in most_frequent_words:
                # map out-of-vocabulary tokens to UNK and collapse consecutive UNKs
                if not final_sentence_tokens or final_sentence_tokens[-1] != 'UNK':
                    final_sentence_tokens.append('UNK')
            else:
                final_sentence_tokens.append(token)
        final_all_sentence_tokens.append(final_sentence_tokens)

    with open('data/training_sentences.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(final_all_sentence_tokens, json_file, ensure_ascii=False)
    with open('data/most_frequent_words.json', 'w',
              encoding='utf-8') as json_file:
        json.dump(most_frequent_words, json_file, ensure_ascii=False)
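
One optional optimization, an assumption rather than part of the original code: the test token not in most_frequent_words scans a 10,000-element list for every token, so building a set once before the replacement loop keeps that pass linear.

# Optional speed-up, not in the source: O(1) membership checks instead of
# scanning the 10,000-word list for every token; the replacement loop would
# then test against most_frequent_set.
most_frequent_set = set(most_frequent_words)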
Example #7
    def tokenize(self, record):
        return hazm.WordTokenizer().tokenize(record)
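
Because tokenize builds a fresh WordTokenizer on every call, a variant that caches a single instance avoids the repeated construction cost. This is a sketch of that alternative, not the original class:

import hazm

class Tokenizer:
    """Hypothetical variant of the method above: reuse one WordTokenizer
    instance instead of constructing a new one per call."""

    def __init__(self):
        self.word_tokenizer = hazm.WordTokenizer()

    def tokenize(self, record):
        return self.word_tokenizer.tokenize(record)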
Example #8
import pandas as pd
import hazm
import matplotlib.pyplot as plt
from stopwords import all_stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report


normalizer = hazm.Normalizer(persian_numbers=True)
tokenizer = hazm.WordTokenizer(replace_numbers=True, replace_hashtags=True)
lemmatizer = hazm.Lemmatizer()
tagger = hazm.POSTagger(model='resources/postagger.model')

def clean_text(sentence):
    sentence = normalizer.normalize(sentence)
    sentence = tokenizer.tokenize(sentence)
    sentence = tagger.tag(sentence)
    sentence = [lemmatizer.lemmatize(x[0], pos=x[1]) for x in sentence]
    return " ".join(sentence)

data = pd.read_csv('Corpus/persica_org_english_cat.csv')
data = data[['title', 'text', 'category2']].dropna()
data['cleaned_text'] = data["title"] + " " + data['text']

doc = ""
for item in data['cleaned_text']:
    doc += item + " "
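
The imports at the top of this example point to a TF-IDF plus SGDClassifier setup. A minimal sketch of how the cleaned column might feed such a pipeline; the split ratio and model parameters are assumptions, and all_stopwords is assumed to be an iterable of Persian stop words:

# Hedged sketch of the downstream pipeline implied by the imports above;
# parameter values are assumptions, not taken from the original script.
data['cleaned_text'] = data['cleaned_text'].apply(clean_text)
labels = LabelEncoder().fit_transform(data['category2'])
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], labels, test_size=0.2, random_state=42)

model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(all_stopwords))),
    ('clf', SGDClassifier()),
])
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))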