def stem(self):
     """
     :return:
     """
     print('hi')
     stemmer = FindStems()
     for words in self.words:
         temp = []
         for word in words:
             temp.append(stemmer.convert_to_stem(str(word)))
         self.stem_words.append(temp)
     return self.stem_words
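The only parsivar call this method relies on is convert_to_stem. A minimal standalone sketch of its behaviour, assuming parsivar is installed (outputs are indicative): for verbs it can return two stems joined by '&', which is why several of the later examples split on that character.

from parsivar import FindStems

stemmer = FindStems()
print(stemmer.convert_to_stem("کتاب‌ها"))   # noun: a plain stem, e.g. کتاب
print(stemmer.convert_to_stem("می‌رفتم"))   # verb: past & present stems, e.g. رفت&رو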
Example No. 2
    def __init__(self):
        self.sw = pd.read_csv("data/stop_words/stpwrd.csv")
        # Select the stop-word column from the file and return it as a list of strings
        self.sw = self.sw["StopWord"].astype(str).values.tolist()

        self.correction = pd.read_csv("data/Vocab_dataset_1.csv",
                                      index_col=0,
                                      header=None,
                                      squeeze=True).to_dict()

        self._normalizer = Normalizer(statistical_space_correction=True,
                                      date_normalizing_needed=True)
        self._normalizer1 = HazmNormal()
        self._tokenizer1 = HazmTokenizer(join_verb_parts=False,
                                         replace_hashtags=True,
                                         replace_numbers=True,
                                         separate_emoji=True)
        self._tokenizer2 = HazmTokenizer(join_verb_parts=False,
                                         replace_hashtags=True,
                                         replace_numbers=False,
                                         separate_emoji=True)
        self._stemmer = FindStems()

        # region Regular Expression
        # Pattern that splits apart letters and digits that are stuck together
        self.persianmixRE = re.compile(
            "(([\u0600-\u06FF]+)([0-9]+)|([0-9]+)([\u0600-\u06FF]+)|([a-zA-Z]+)([0-9]+)|([0-9]+)([a-zA-Z]+))"
        )
        # Used to detect whether the text contains any digit
        self.numRE = re.compile(r'\d')
        self.removeIrritateRE = re.compile(r'(.)\1{2,}', re.IGNORECASE)
        # Emoji
        self.emojiRE = re.compile(
            pattern="["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+",
            flags=re.UNICODE)
        # endregion
        self._emojiList = list(emoji.UNICODE_EMOJI.keys())
        #self.w2vModel = Word2Vec.load('P:/pkl/newSentence.bin')

        print("\n ** Persian Text PreParation by Naghme Jamali ** \n")
Example No. 3
from parsivar import FindStems
import pickle
import re

stopwords = ["و", "از", "در", "برای", "چون"]

DEBUG_MODE = False
my_stemmer = FindStems()

postlist = {}
with open('InformationRetrieval/objs.pkl', 'rb') as f:
    postlist = pickle.load(f)


def search_term(term):  # returns list
    #     if term in STOP_WORDS:
    #         term = "$$$"
    if term in postlist:
        return postlist[term].keys()
    else:
        return False


def search_phrase(phrase):  # phrase is a list of words .  returns set
    print("im in")
    print(phrase)
    answers = []
    for word in phrase:
        if word not in postlist:
            return False
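The listing is cut off at this point. A generic way to finish the document-level part of a phrase query, shown purely as an illustration and not as the original continuation, assuming (as search_term suggests) that postlist[word] maps document ids to position lists:

def candidate_docs(phrase):
    # documents that contain every word of the phrase
    doc_sets = [set(postlist[w].keys()) for w in phrase if w in postlist]
    if not doc_sets or len(doc_sets) < len(phrase):
        return set()
    return set.intersection(*doc_sets)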
Example No. 4
from random import shuffle
from parsivar import Normalizer
from parsivar import SpellCheck
from parsivar import FindStems
from parsivar import Tokenizer
from hazm import Lemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

datelist = []
my_tokenizer = Tokenizer()
lemmatizer = Lemmatizer()
my_stemmer = FindStems()
myspell_checker = SpellCheck()
my_normalizer = Normalizer()
with open("stopwords1.txt", "r") as file:
    f = file.read()
StopWords = f.split("\n")
vectorizer = CountVectorizer()


def CleanText(InputText):
    WordsList = my_tokenizer.tokenize_words(my_normalizer.normalize(InputText))
    for i in range(len(WordsList) - 1, -1, -1):
        if (WordsList[i] in StopWords):
            del WordsList[i]
            continue  # keep scanning; the remaining words still need stop-word removal and lemmatization

        WordsList[i] = lemmatizer.lemmatize(WordsList[i]).split("#")[-1]
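The script stops in the middle of CleanText, but the imports make the intent clear: a bag-of-words / TF-IDF / logistic-regression text classifier. A hedged sketch of how the pieces above are usually wired together; texts and labels are hypothetical variables, and CleanText is assumed to return the cleaned string once the function is completed:

cleaned = [CleanText(t) for t in texts]            # texts: raw documents (assumed to exist)
counts = vectorizer.fit_transform(cleaned)
tfidf = TfidfTransformer().fit_transform(counts)
X_train, X_test, y_train, y_test = train_test_split(tfidf, labels, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("accuracy:", clf.score(X_test, y_test))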
Example No. 5
class PreParation():
    _tokenizer = Tokenizer()
    sw = None  # Stop Words
    correction = None  # Correction dictionary, e.g. ازتمیخوام -> ازت میخوام
    _normalizer = None  # Normalizer, e.g. ي --> ی
    _normalizer1 = None  # Hazm Normalizer
    _tokenizer1 = None  # Hazm Tokenizer
    _tokenizer2 = None  # Hazm Tokenizer
    _stemmer = None  # Stemmer, e.g. گفت --> گو

    extraChar1 = ["؛", "؟", ",", ";", "!", "?", ".", ":", "،"]
    extraChar2 = [
        "'", '"', "+", "{", "}", "-", "(", ")", "$", "#", '/', "\\", "@", "*",
        "٪", "÷", "¿", "[", "]", "«", "»", "^", "`", "|", "¡", "˘", "¤", "£",
        "<", ">", "¯", "°", "٭", "٫"
    ]
    _emojiList = None

    # Regular Expression
    persianmixRE = None
    numRE = None
    removeIrritateRE = None
    emojiRE = None

    # Embedding
    w2vModel = None

    # This constructor performs the initial setup of all resources
    def __init__(self):
        self.sw = pd.read_csv("data/stop_words/stpwrd.csv")
        # Select the stop-word column from the file and return it as a list of strings
        self.sw = self.sw["StopWord"].astype(str).values.tolist()

        self.correction = pd.read_csv("data/Vocab_dataset_1.csv",
                                      index_col=0,
                                      header=None,
                                      squeeze=True).to_dict()

        self._normalizer = Normalizer(statistical_space_correction=True,
                                      date_normalizing_needed=True)
        self._normalizer1 = HazmNormal()
        self._tokenizer1 = HazmTokenizer(join_verb_parts=False,
                                         replace_hashtags=True,
                                         replace_numbers=True,
                                         separate_emoji=True)
        self._tokenizer2 = HazmTokenizer(join_verb_parts=False,
                                         replace_hashtags=True,
                                         replace_numbers=False,
                                         separate_emoji=True)
        self._stemmer = FindStems()

        # region Regular Expression
        # Pattern that splits apart letters and digits that are stuck together
        self.persianmixRE = re.compile(
            "(([\u0600-\u06FF]+)([0-9]+)|([0-9]+)([\u0600-\u06FF]+)|([a-zA-Z]+)([0-9]+)|([0-9]+)([a-zA-Z]+))"
        )
        # Used to detect whether the text contains any digit
        self.numRE = re.compile(r'\d')
        self.removeIrritateRE = re.compile(r'(.)\1{2,}', re.IGNORECASE)
        # Emoji
        self.emojiRE = re.compile(
            pattern="["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+",
            flags=re.UNICODE)
        # endregion
        self._emojiList = list(emoji.UNICODE_EMOJI.keys())
        #self.w2vModel = Word2Vec.load('P:/pkl/newSentence.bin')

        print("\n ** Persian Text PreParation by Naghme Jamali ** \n")

    # Remove Multiple Space - "Salam   khobi?" -> "Salam Khobi?"
    def RemoveMultipleSpace(self, txt):
        return re.sub(' +', ' ', txt)

    # Replace each emoji with the placeholder token EMOJI
    def RemoveEmoji(self, txt):
        for emj in self._emojiList:  # avoid shadowing the imported emoji module
            txt = txt.replace(emj, ' EMOJI ')
        return txt
        # return self.RemoveMultipleSpace(self.emojiRE.sub(r' ', txt))

    def RemoveExtraChar1(self, txt):
        for i in self.extraChar1:
            txt = txt.replace(i, " ")
        return txt

    def RemoveExtraChar2(self, txt):
        for i in self.extraChar2:
            txt = txt.replace(i, " ")
        return txt

    # Convert Persian/Arabic digits to Latin digits
    def NumberEN(self, input):
        return input.replace("۰", "0").replace("۱", "1").replace(
            "۲", "2").replace("۳", "3").replace("۴", "4").replace(
                "۵", "5").replace("٥", "5").replace("۶", "6").replace(
                    "٧", "7").replace("۷", "7").replace("۸", "8").replace("۹", "9")

    # Remove Persian stop words
    def stop_word(self, data):
        text = self.RemoveMultipleSpace(data)
        text = self.RemoveMultipleSpace(' '.join(
            [word for word in text.split() if word not in self.sw]))
        return text.strip()
        # if text != " " and text != "":
        #     return ' '.join([word for word in text.split() if word not in self.sw]).strip()
        # else:
        #     return ''

    # Split apart digits and letters that are stuck together
    def splitnumber(self, txt):
        if self.numRE.search(txt) is not None:
            match = self.persianmixRE.match(txt)
            if match is None:  # digits only, or no letter/digit boundary: nothing to split
                return txt
            res = match.groups()
            return " ".join([word for word in res[1:] if word is not None])
        return txt

    # Return the stem of a word.
    # If the tool returns several stems joined by '&', keep the first one;
    # if no stem is found, the word itself is returned.
    def Stem(self, txt):
        _txt = self._stemmer.convert_to_stem(txt).split('&')
        return _txt[0]

    # Collapse characters repeated three or more times,
    # e.g. خوووووبی becomes خوبی
    def removeIrritate(self, txt):
        return self.removeIrritateRE.sub(r'\1', txt)

    def CorrectionText(self, texts):
        _texts = []
        for _text in self.wordToken(texts):
            if _text in self.correction:
                _texts.append(self.correction[_text])
            else:
                _texts.append(_text)
        return ' '.join(_texts)

    # Clean the text:
    # normalize the data and strip extra characters and punctuation.
    # A flag on the call controls whether stop words are removed as well.
    def cleanText(self, txt, stopword=False, isSplitNumber=True):
        #txt = txt.replace("\u200c", " ")
        txt = txt.replace("آ", "ا")
        if stopword:  # should stop words be removed?
            txt = self.stop_word(txt)

        txt = self.removeIrritate(txt)  # remove repeated characters
        txt = self.RemoveEmoji(txt)
        txt = self._normalizer1.normalize(txt)

        txt = self.RemoveExtraChar2(txt)
        txt = self.RemoveMultipleSpace(txt)
        txt = self.NumberEN(txt)

        txt1 = []
        for t in self.wordToken(txt):
            if isSplitNumber:
                try:
                    t = self.splitnumber(t)  # split digits from letters inside a single token
                except Exception:
                    pass
            for _t in t.split():
                w1 = self.Stem(_t)
                txt1.append(w1)
        if stopword:
            return self.stop_word(' '.join(txt1)).strip()
        return (' '.join(txt1)).strip()

    # Second pass of removing marks and signs.
    # All of these word and sign replacements exist because the available
    # libraries do not react to such symbols and misspellings; this step
    # corrects the words and restores their proper written form.
    def cleanText2(self, txt, stopword=False, isTokenize=False):
        _txt = txt
        if stopword:
            _txt = self.stop_word(txt)

        # Convert the half-space (ZWNJ) inserted by the parsivar and hazm tools back into a regular space
        #_txt = _txt.replace("\u200c", " ")
        _txt = ' '.join(self.wordToken(_txt, replaceNumber=True))
        _txt = self.CorrectionText(_txt)

        _txt = self._normalizer1.normalize(_txt)
        _txt = self.NumberEN(_txt)
        if stopword:
            _txt = self.stop_word(_txt).strip()
        return _txt

    # Sentence-level tokenization
    def token(self, txt):
        _sents = self._tokenizer.tokenize_sentences(txt)
        sents = []
        for _txt in _sents:
            _txt = self.RemoveExtraChar1(_txt)
            _txt = self.RemoveMultipleSpace(_txt).strip()
            sents.append(_txt)
        return sents

    # Replace numbers with a length token. Example: "5 عدد" --> "NUM1 عدد"
    # Hazm cannot detect some numbers, so this function covers those cases.
    def ReplaceNumber(self, input1):
        if input1.isnumeric():
            return "NUM" + str(len(input1))
        else:
            return input1

    def wordToken(self,
                  txt,
                  replaceNumber=False,
                  removeExtra=False,
                  stopword=False):
        if removeExtra:
            txt = self.RemoveExtraChar1(txt)
        txt = self.RemoveMultipleSpace(txt).strip()

        _words = []
        if replaceNumber:
            _words.extend(
                self.ReplaceNumber(x) for x in self._tokenizer1.tokenize(txt))
        else:
            _words.extend(self._tokenizer2.tokenize(txt))

        if stopword:
            return [word for word in _words if word not in self.sw]
        else:
            return _words
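A minimal usage sketch for the class above, assuming the data/ CSV files referenced in __init__ exist; the sample sentence is arbitrary and the exact output depends on the installed parsivar and hazm versions:

prep = PreParation()
sample = "سلام   خوووووبی؟ من ۵تا کتاب دارم"
print(prep.cleanText(sample, stopword=True))       # stemmed, stop words removed, digits split off
print(prep.wordToken(sample, replaceNumber=True))  # tokens with numbers replaced by NUM<length>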
Example No. 6
def get_unique_words(text_path):
    '''
    This function takes the path of a text corpus as input and returns the set of unique words it contains.

    Parameters
    ----------
    text_path : str
        Path to the directory where the corpus files live (must be in .txt format).

    Raises
    ------
    Exception
        If the spell-check package of parsivar is not found.

    Returns
    -------
    words_set : set
        Set of unique words from the corpus.

    '''
    # Create a list from all text
    text_names = glob(join_path(text_path, '*.txt'))
    sentences = []
    for name in text_names:
        with open(name, 'r') as text:
            text_sentences = tuple(text.readlines())
        for sentence in text_sentences:
            sentence = sentence.strip('\n')
            sentences.append(sentence[sentence.find('|') + 1:])

    # Specify signs and digits so that words containing them do not enter the final set
    signs = [
        '،', '«', '»', '.', ')', '(', '"', ':', ';', '%', '-', '?', ',', '؛',
        "'", '_'
    ]
    numbers = [f'{i}' for i in range(10)]

    # Create Set of all words in corpus
    try:
        spell = SpellCheck()
    except:
        raise Exception(
            'Please download spell.zip from https://www.dropbox.com/s/tlyvnzv1ha9y1kl/spell.zip?dl=0 and extract to path to parsivar/resource.'
        )
    normal = Normalizer()
    token = Tokenizer()
    stemm = FindStems()
    words_set = set()
    print('\n Start to extract and clean words from sentences! \n')
    with progressbar.ProgressBar(max_value=len(sentences),
                                 redirect_stdout=True) as bar:
        for i, sentence in enumerate(sentences):
            sentence = normal.normalize(spell.spell_corrector(sentence))
            sentence = sentence.replace(u'\u200c', ' ')
            words = token.tokenize_words(sentence)
            for word in words:
                word = stemm.convert_to_stem(word)
                if '&' in word:  # this pattern was found manually in the text
                    word = word[:word.find('&')]
                if word in signs:  # Ignore signs
                    bar.update(i)
                    continue
                if any(let in numbers for let in word):  # Ignore words that contain digits
                    bar.update(i)
                    continue
                if len(word) <= 1:  # Ignore strings of one character or fewer
                    bar.update(i)
                    continue
                words_set.add(word)
                bar.update(i)
    return words_set
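A short usage sketch; the corpus directory is a placeholder path. Note that the function drops everything up to and including the first '|' of each line, so the .txt files are expected to carry an id|sentence layout:

unique_words = get_unique_words('data/corpus')   # placeholder path
print(len(unique_words), 'unique stems collected')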
Example No. 7
import pickle
from parsivar import FindStems
from hazm import stopwords_list, Lemmatizer
import numpy as np

from constants import given_doc_root_path, document_root_path, limit_index
from LP_toolkits import normalizer

lemmatizer = Lemmatizer()  # keep the instance name distinct from the hazm class
Stemmer = FindStems()
stopwords = set(stopwords_list())


# define stemmer function.
def stemmer(email):
    """
    :param email: a string of email text
    :return: a string of the input in which each verb is replaced by its root
    """
    tokens = ''
    for word in email.split():
        token = lemmatizer.lemmatize(word)
        if '#' in token:
            token = token.split('#')
            if word in token[0]:
                token = token[0]
            else:
                token = token[1]
        else:
            token = Stemmer.convert_to_stem(word)
            if '&' in token:
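                # The listing is cut off here. An illustrative completion (not
                # the original code), mirroring the '&' handling used in the
                # other examples: keep only the first of the '&'-joined stems.
                token = token.split('&')[0]
        tokens += token + ' '
    return tokens.strip()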
Example No. 8
def Evaluate_lemmatizer(inputs, labels, lib='hazm'):
    predicted_labels_with_pos = []
    predicted_labels_no_pos = []

    if lib == 'hazm':
        lemmatizer = Lemmatizer()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []

            for (word, pos) in sentence:
                if pos == 'ADJ':
                    pos = 'AJ'
                sent_labels_with_pos.append(lemmatizer.lemmatize(word, pos))
                sent_labels_no_pos.append(lemmatizer.lemmatize(word))

            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)

    elif lib == 'parsivar':
        stemmer = FindStems()
        for sentence in inputs:
            sent_labels_with_pos = []
            sent_labels_no_pos = []

            for (word, pos) in sentence:
                sent_labels_with_pos.append(stemmer.convert_to_stem(word, pos))
                sent_labels_no_pos.append(stemmer.convert_to_stem(word))

            for i in range(len(sentence)):
                if sentence[i][1] == 'V':
                    sent_labels_with_pos[i] = re.sub(r"&", r"#",
                                                     sent_labels_with_pos[i])
                    sent_labels_no_pos[i] = re.sub(r"&", r"#",
                                                   sent_labels_no_pos[i])

            predicted_labels_with_pos.append(sent_labels_with_pos)
            predicted_labels_no_pos.append(sent_labels_no_pos)

    precisions_with_pos = []
    precisions_no_pos = []
    all_truly_labeled_with_pos = []

    for i in range(len(labels)):
        truly_labeled_with_pos = [
            predicted_labels_with_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        all_truly_labeled_with_pos.append(truly_labeled_with_pos)
        num_truly_labeled_with_pos = sum(truly_labeled_with_pos)
        truly_labeled_no_pos = [
            predicted_labels_no_pos[i][j] == labels[i][j]
            for j in range(len(labels[i]))
        ]
        num_truly_labeled_no_pos = sum(truly_labeled_no_pos)

        precision_with_pos = num_truly_labeled_with_pos / len(labels[i])
        precision_no_pos = num_truly_labeled_no_pos / len(labels[i])
        precisions_with_pos.append(precision_with_pos)
        precisions_no_pos.append(precision_no_pos)

    per_pos = {}
    detailed_analyze = {}
    for i in range(len(inputs)):
        for j in range(len(inputs[i])):

            if inputs[i][j][1] not in per_pos.keys():
                per_pos[inputs[i][j][1]] = {'true': 0, 'false': 0}

            if all_truly_labeled_with_pos[i][j]:
                per_pos[inputs[i][j][1]]['true'] += 1
            else:
                per_pos[inputs[i][j][1]]['false'] += 1

            if inputs[i][j][1] not in detailed_analyze.keys():
                detailed_analyze[inputs[i][j][1]] = {'true': [], 'false': []}

            # detailed_analyze[inputs[i][j][1]]['gold'].append(labels[i][j])
            if all_truly_labeled_with_pos[i][j]:
                detailed_analyze[inputs[i][j][1]]['true'].append(
                    inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['false'].append('NONE')
            else:
                detailed_analyze[inputs[i][j][1]]['false'].append(
                    inputs[i][j][0])
                # detailed_analyze[inputs[i][j][1]]['true'].append('NONE')

    accuracy_per_pos = {
        k: v['true'] / (v['true'] + v['false'])
        for k, v in per_pos.items()
    }
    for k, v in detailed_analyze.items():
        v['true'] = set(v['true'])
        v['false'] = set(v['false'])
    precision_with_pos = sum(precisions_with_pos) / len(precisions_with_pos)
    precision_no_pos = sum(precisions_no_pos) / len(precisions_no_pos)
    return precision_with_pos, precision_no_pos, accuracy_per_pos, detailed_analyze
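A tiny usage sketch for the evaluator above; the tagged sentence and gold lemmas are made up for illustration (real data would come from an annotated corpus), and the gold verb lemma follows the past#present convention that the function itself assumes:

inputs = [[("کتاب‌ها", "N"), ("را", "P"), ("خواندم", "V")]]
labels = [["کتاب", "را", "خواند#خوان"]]
p_with_pos, p_no_pos, acc_per_pos, details = Evaluate_lemmatizer(inputs, labels, lib='hazm')
print(p_with_pos, p_no_pos, acc_per_pos)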
Example No. 9
from parsivar import Normalizer
from parsivar import Tokenizer
from parsivar import FindStems
from parsivar import POSTagger

my_normalizer = Normalizer()
my_tokenizer = Tokenizer()
my_stemmer = FindStems()
my_tagger = POSTagger(
    tagging_model="stanford")  # tagging_model = "wapiti" or "stanford"

tokens = []
ignore = ['،', '؟']

f = open("zaban-e_atash.txt", "r")
for line in f:
    tmp_txt = line
    words = my_tokenizer.tokenize_words(my_normalizer.normalize(tmp_txt))
    for word in words:
        if word not in tokens:
            if word not in ignore:
                tokens.append(word)

f.close()

print("<table>")
print("<td>Stem</td><td>Token & POS</td>")

for token in tokens:
    stem = my_stemmer.convert_to_stem(token)
    token_pos = my_tagger.parse(my_tokenizer.tokenize_words(token))
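    # The listing ends here. An illustrative continuation (not the original
    # code): parse() typically returns (word, tag) pairs, so one way to
    # finish the table row is:
    print("<tr><td>%s</td><td>%s</td></tr>" % (stem, token_pos))

print("</table>")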