Code Example #1
File: hw2.py  Project: minam75/ai-hw2
def stem_data(dat):
    # `tagger` and `stop_words` are assumed to be defined at module level
    # (a hazm.POSTagger instance and a stop-word collection).
    normalizer = hazm.Normalizer()
    dat = normalizer.normalize(dat)
    sentences = hazm.sent_tokenize(dat)

    lemmatizer = hazm.Lemmatizer()
    stemmer = hazm.Stemmer()
    words = []

    for sentence in sentences:
        tagged = tagger.tag(hazm.word_tokenize(sentence))

        # Drop stop words before lemmatizing.
        filtered = [token for token in tagged if token[0] not in stop_words]

        for word, pos in filtered:
            stemmed = lemmatizer.lemmatize(word, pos=pos)
            stemmed = stemmer.stem(stemmed)
            # Skip empty results and unresolved verb lemmas of the form 'past#present'.
            if stemmed and '#' not in stemmed:
                words.append(stemmed)

    return words
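
A minimal usage sketch for stem_data: the module-level tagger and stop_words it relies on are not shown in this excerpt, so the setup below is an assumption (the model path mirrors the one used in the later examples).

# Assumed module-level setup for stem_data (paths are illustrative):
import hazm

tagger = hazm.POSTagger(model='resources/postagger.model')
stop_words = set(hazm.stopwords_list())

tokens = stem_data('ما بسیار کتاب می‌خوانیم.')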
Code Example #2
    def __init__(self):
        self.punctuations = [
            '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
            '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
            '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
            '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
            '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
            '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
            '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
            '•', '●'
        ]
        self.diacritics_pattern = re.compile(
            "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
        self.emojis_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            "]+",
            flags=re.UNICODE)
        self.latin_characters_pattern = re.compile("["
                                                   "\u0041-\u007a"
                                                   "\u00c0-\u036f"
                                                   "\u0400-\u050f"
                                                   "\u0342-\u03ff"
                                                   "]")
        self.numbers_pattern = re.compile("[0-9]")
        self.space_patterns = [
            (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
            (re.compile("[\f\r\t\n]"), ' '),
            (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
             '\u200c'),
            (re.compile(
                "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
                "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
        ]
        self.stopwords = hazm.stopwords_list()[:200] + [
            'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
            'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
            'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
            'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم',
            'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
            'نیستند'
        ]

        self.normalizer = parsivar.Normalizer()
        self.stemmer = parsivar.FindStems()
        self.lemmatizer = hazm.Lemmatizer()
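
A hypothetical helper (not part of the original class) showing how the compiled (pattern, replacement) pairs in space_patterns above would be applied in order:

    def normalize_spaces(self, text):
        # Apply each (compiled pattern, replacement) pair from __init__, in order.
        for pattern, replacement in self.space_patterns:
            text = pattern.sub(replacement, text)
        return text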
Code Example #3
File: mazm.py  Project: mmsamiei/BS-Project
def my_lemmatizer(string):
    ps = PersianStemmer()
    lemmatizer = hazm.Lemmatizer()
    current = ps.run(string)
    current = lemmatizer.lemmatize(current)
    # new = stemmer.stem(current)
    # while current != new:
    #     current = new
    #     new = stemmer.stem(current)
    # result = new
    if "#" in current:
        # For verbs, lemmatize() returns 'past_stem#present_stem';
        # keep the present stem (bon-e mozare').
        result = current.split("#")[1]
    else:
        result = current
    return result
Code Example #4
def textNormalizer(lousyCollection):
    docs = []
    normalizer = hz.Normalizer()
    lemmatizer = hz.Lemmatizer()
    stemmer = hz.Stemmer()
    for raw_doc in lousyCollection:
        normalized = normalizer.normalize(raw_doc)
        docs.append(delete_Punc(normalized))
    for i, doc in enumerate(docs):
        tokens = hz.word_tokenize(doc)
        # Stem each token, then lemmatize the stemmed form.
        docs[i] = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]
    return docs
Code Example #5
    def clean_fa(self, data):
        data.text = self.fa_normalize(data.text)
        data.text = self.tokenizer(data.text)

        stemmer = hazm.Stemmer()
        lemmatizer = hazm.Lemmatizer()
        stopwords = hazm.stopwords_list()
        alphabet = set(list("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"))

        data.text = data.apply(
            lambda row: self.stemLemmaStopWord(
                stemmer, lemmatizer, stopwords, alphabet, row.text
            ),
            axis=1,
        )
        return data
Code Example #6
    def stemming(self):
        if self.persian:
            stemmer = hazm.Stemmer()
            lemmatizer = hazm.Lemmatizer()
            for i in range(len(self.tokens)):
                self.tokens[i] = lemmatizer.lemmatize(
                    stemmer.stem(self.tokens[i]))
        else:
            porter = nltk.PorterStemmer()
            self.tokens = [porter.stem(word) for word in self.tokens]

            lemma = nltk.WordNetLemmatizer()
            self.tokens = [
                lemma.lemmatize(word, pos="v") for word in self.tokens
            ]
            self.tokens = [
                lemma.lemmatize(word, pos="n") for word in self.tokens
            ]
Code Example #7
    def __init__(self,
                 mask=None,
                 size=900,
                 stop_words_addr=default_stop_words_path,
                 mask_addr=None):
        self.hazm_normalizer = hazm.Normalizer()
        self.parsivar_normalizer = parsivar.Normalizer()
        self.stemmer = hazm.Stemmer()
        self.lemmatizer = hazm.Lemmatizer()
        self.stop_words = set(hazm.stopwords_list(stop_words_addr))
        # NOTE: the `mask` argument is ignored here; the mask is always
        # rebuilt from `mask_addr` (or left as None).
        mask = np.array(Image.open(mask_addr)) if mask_addr is not None else None
        self.generator = WordCloud(width=size,
                                   height=size,
                                   include_numbers=False,
                                   persian_normalize=False,
                                   collocations=True,
                                   mask=mask,
                                   background_color='white')
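
A brief usage sketch for this word-cloud builder; the class name WordCloudBuilder, the input text, and the output path are assumptions, while generate() and to_file() are standard WordCloud methods:

# Hypothetical usage of the class whose __init__ is shown above:
builder = WordCloudBuilder(size=900)
cloud = builder.generator.generate('متنی فارسی برای ساختن ابر واژگان')
cloud.to_file('wordcloud.png')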
Code Example #8
    def create_emotion_embedding(word_emotion_path, token_field, embedding_dim):
        """
        create_emotion_embedding method is written for create emotional embedding matrix
        :param word_emotion_path: address of emotional dictionary
        :param token_field: token_field
        :param embedding_dim: dimension of emotional embedding
        :return:
            weight_matrix: emotional embedding
        """
        lematizer = hazm.Lemmatizer()
        # load pickle dictionary of emotional embedding
        with open(word_emotion_path, "rb") as file:
            word_emotion_dict = pkl.load(file)

        # create weight_matrix as zero matrix
        weight_matrix = np.zeros((len(token_field.vocab), embedding_dim))

        for token, idx in token_field.vocab.stoi.items():
            lemma_token = lemmatizer.lemmatize(token)
            emotion_embedding = word_emotion_dict.get(lemma_token)
            if emotion_embedding is not None:
                weight_matrix[idx] = emotion_embedding
        return weight_matrix
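
One way the returned matrix could be plugged into a model, assuming a PyTorch stack (the torch import and the nn.Embedding usage are an assumption, not part of the original project):

import torch
import torch.nn as nn

# `weight_matrix` is the numpy array returned by create_emotion_embedding above.
emotion_embedding = nn.Embedding.from_pretrained(
    torch.tensor(weight_matrix, dtype=torch.float),
    freeze=True,  # keep the emotion vectors fixed during training
)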
Code Example #9
# hazm.sent_tokenize('ما هم برای وصل کردن آمدیم! ولی برای پردازش، جدا بهتر نیست؟')
# ['ما هم برای وصل کردن آمدیم!', 'ولی برای پردازش، جدا بهتر نیست؟']

# word tokenizer
# hazm.word_tokenize('ولی برای پردازش، جدا بهتر نیست؟')
# ['ولی', 'برای', 'پردازش', '،', 'جدا', 'بهتر', 'نیست', '؟']


# Stemmer
stemmer = hazm.Stemmer()
# stemmer.stem('کتاب‌ها')
# 'کتاب'


# Lemmatizer
lemmatizer = hazm.Lemmatizer()
# lemmatizer.lemmatize('می‌روم')
# 'رفت#رو'


# Tagger
# tagger = hazm.POSTagger(model='resources/postagger.model')
# tagger.tag(hazm.word_tokenize('ما بسیار کتاب می‌خوانیم'))
# [('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')]


# Chunker
# chunker = hazm.Chunker(model='resources/chunker.model')
# tagged = tagger.tag(word_tokenize('کتاب خواندن را دوست داریم'))
# tree2brackets(chunker.parse(tagged))
# '[کتاب خواندن NP] [را POSTP] [دوست داریم VP]'
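
As the Lemmatizer line above shows, verb lemmas come back in the form 'past_stem#present_stem'. A small helper for extracting one side (the present_stem name and the choice of the second part are assumptions):

def present_stem(word):
    # hazm's Lemmatizer returns 'past_stem#present_stem' for verbs;
    # keep the part after '#', otherwise return the lemma unchanged.
    lemma = lemmatizer.lemmatize(word)
    return lemma.split('#')[1] if '#' in lemma else lemma

# present_stem('می‌روم')  ->  'رو'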
Code Example #10
# Modules
import hazm as hz
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import xml.etree.ElementTree as et
from os import listdir
from os.path import isfile, join
from collections import defaultdict

# Parameters
normalizer = hz.Normalizer()
tagger = hz.POSTagger(model='resources/postagger.model')
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()

lexicon_file_name = 'final_lexi'
data_path = './data/'

lexicon = None


# Make bag_of_words
def bow(text):
    global normalizer
    global tagger
    global stemmer
    global lemmatizer

    text = hz.sent_tokenize(normalizer.normalize(text))
Code Example #11
from __future__ import unicode_literals
import hazm as Hazm
import sys
from StopWords import stop_words
import re
import json
from wordfreq import zipf_frequency

if len(sys.argv) < 2:
    print('error')
    sys.exit()

raw_text = str(sys.argv[1])

normalizer_instance = Hazm.Normalizer()
lemmatizer_instance = Hazm.Lemmatizer()
stem_finder_instance = Hazm.Stemmer()
remove_non_persian_regex = re.compile('[^آ-ی]')
# Replace all non-Persian characters with spaces.
raw_text = remove_non_persian_regex.sub(' ', raw_text)
normalized_text = normalizer_instance.normalize(raw_text)
sentences = Hazm.sent_tokenize(normalized_text)

result_tokens = list()
less_accurate_tokens = list()


def add_to_tokens_if_not_exists(parsed_token):
    exists = False
    for result_token in result_tokens:
        if parsed_token == result_token:
Code Example #12
File: lemmatizer.py  Project: sarb9/KhabarYaab
    def __init__(self):
        self.lemmatizer = hazm.Lemmatizer()