def __init__(self):
    self.preprocessed_docs = []
    self.normalizer = hazm.Normalizer()
    self.word_tokenizer = hazm.WordTokenizer()
    self.stemmer = hazm.Stemmer()
    self.stop_words = hazm.stopwords_list()
    self.persian_garbage = {
        u'÷': u'', u'ٰ': u'', u'،': ' ', u'؟': ' ', u'؛': '',
        u'َ': '', u'ُ': '', u'ِ': '', u'ّ': '', u'ٌ': '', u'ٍ': '',
        u'ئ': u'ی', u'ي': u'ی', u'ة': u'ه', u'ء': u'', u'ك': u'ک',
        u'ْ': u'', u'أ': u'ا', u'إ': u'ا', u'ؤ': u'و',
        u'×': u'', u'٪': u'', u'٬': u'', u'آ': u'ا', u'●': u''
    }
def similar(s1, s2):
    normalizer = hazm.Normalizer()
    s1 = normalizer.normalize(s1)
    s2 = normalizer.normalize(s2)
    stop_words = hazm.stopwords_list()
    list_s1 = [word for word in s1.split(" ") if word not in stop_words]
    list_s2 = [word for word in s2.split(" ") if word not in stop_words]
    stemmer = hazm.Stemmer()
    stem_s1 = [stemmer.stem(word) for word in list_s1]
    stem_s2 = [stemmer.stem(word) for word in list_s2]
    # Count the stems the two sentences have in common.
    same_words = set.intersection(set(stem_s1), set(stem_s2))
    return len(same_words)
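# --- Usage sketch (illustrative, not part of the original snippet) ---
# `similar` scores two Persian sentences by the number of shared stemmed
# content words; the example strings below are assumptions for demonstration.
print(similar("این کتاب بسیار خوب است", "کتاب خوبی خواندم"))  # prints the overlap count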
def prepare_text(text):
    # Relies on module-level `punctuations`, `normalizer` (hazm.Normalizer),
    # `stemmer` (hazm.Stemmer) and hazm's word_tokenize / stopwords_list.
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))
    # Drop every character that is not a Persian letter or whitespace.
    text = ' '.join(
        re.sub(r'[^ضصثقفغعهخحجچشسیبلاتنمکگظطزرذدپوئژآؤ \n]', ' ', text).split())
    text = text.strip()
    normalized_text = normalizer.normalize(text)
    words = word_tokenize(normalized_text)
    words = [w for w in words if w != '.']
    words = [w for w in words if w not in stopwords_list()]
    words = [stemmer.stem(w) for w in words]
    return words
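# --- Usage sketch (illustrative, not part of the original snippet) ---
# prepare_text expects the module to define `punctuations` (a string),
# `normalizer` and `stemmer`; the definitions below are assumptions so the
# example can run on its own.
import re
import hazm
from hazm import word_tokenize, stopwords_list

punctuations = '.,!?;:«»،؛؟'
normalizer = hazm.Normalizer()
stemmer = hazm.Stemmer()

print(prepare_text("کتاب‌های خوبی را در سال 2020 خواندم!"))  # list of stemmed content words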
def __init__(self):
    self.punctuations = [
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-',
        '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '`',
        '{', '|', '}', '~', '£', '¤', '§', '©', '«', '®', '°', '±', '²',
        '´', '¸', '»', '¼', '½', '¾', '×', '÷', 'ˈ', '˜', '˝', '٪', '٫',
        '٬', '‐', '–', '—', '‘', '’', '“', '”', '„', '…', '″', '‹', '›',
        '™', '↑', '→', '↓', '⋅', '⌘', '▪', '◄', '○', '♫', '✓', '❤', '《',
        '》', '爆', '者', '被', '\uf020', '\uf04f', '\uf05f', '\uf076',
        '\uf0a7', '\uf0fc', '﴾', '﴿', ':', '�', '?', '؟', '.', '،', '؛',
        '•', '●'
    ]
    # Combining diacritics and related marks to strip from the text.
    self.diacritics_pattern = re.compile(
        "[\u064B-\u065e\u0670\u0674\u06c3\u06d4-\u06ed]")
    self.emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+",
        flags=re.UNICODE)
    self.latin_characters_pattern = re.compile(
        "["
        "\u0041-\u007a"
        "\u00c0-\u036f"
        "\u0400-\u050f"
        "\u0342-\u03ff"
        "]")
    self.numbers_pattern = re.compile("[0-9]")
    # (pattern, replacement) pairs mapping exotic spaces and control
    # characters to a plain space, a zero-width non-joiner, or nothing.
    self.space_patterns = [
        (re.compile("[\u202c\u2005\u2009\u2029\u2066\u3000\ufe0f]"), ' '),
        (re.compile("[\f\r\t\n]"), ' '),
        (re.compile("[\u001f\u009d\u200a\u200e\u200f\u206d\xa0\xad]"),
         '\u200c'),
        (re.compile(
            "[\u007f\u0085\u061c\u200b\u200d\u202a\u202b\u206f\u2003"
            "\u2028\u2060\u2063\u2067\u2069\ufeff\ufffc\x18]"), ''),
    ]
    # First 200 hazm stopwords plus common clitics, copulas and number words.
    self.stopwords = hazm.stopwords_list()[:200] + [
        'ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
        'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود', 'شد',
        'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه', 'چهار',
        'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم', 'هست',
        'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم', 'نیستید',
        'نیستند'
    ]
    self.normalizer = parsivar.Normalizer()
    self.stemmer = parsivar.FindStems()
    self.lemmatizer = hazm.Lemmatizer()
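# --- Hypothetical helper (not part of the original snippet) ---
# Sketch of how the (pattern, replacement) pairs in `space_patterns` are
# typically used: each compiled regex is applied, in order, to the input text.
def apply_space_patterns(space_patterns, text):
    for pattern, replacement in space_patterns:
        text = pattern.sub(replacement, text)
    return text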
def __init__(self, feature_set, orientations=None, language='english'):
    self.language = language
    # Per-language lookup tables (assumed to be instance-level dicts).
    self.normalizer = {}
    self.stopwords = {}
    self.regex_words = {}
    self.hash_dictionary = {}
    self.normalizer[language] = hazm.Normalizer()
    if language == 'persian':
        self.stopwords[language] = hazm.stopwords_list()
        self.regex_words[language] = r"[\w']+|[.,!?;،؟؛]"
    else:
        self.stopwords[language] = set(stopwords.words('english'))
        self.regex_words[language] = r"[\w']+|[.,!?;]"
    if orientations:
        self.orientations = orientations
    self.feature_set = feature_set
    self.weights = {}
    self.hash_dictionary[self.language] = {}
def clean_fa(self, data):
    data.text = self.fa_normalize(data.text)
    data.text = self.tokenizer(data.text)
    stemmer = hazm.Stemmer()
    lemmatizer = hazm.Lemmatizer()
    stopwords = hazm.stopwords_list()
    alphabet = set(list("ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"))
    data.text = data.apply(
        lambda row: self.stemLemmaStopWord(
            stemmer, lemmatizer, stopwords, alphabet, row.text
        ),
        axis=1,
    )
    return data
def __init__(self, mask=None, size=900,
             stop_words_addr=default_stop_words_path, mask_addr=None):
    self.hazm_normalizer = hazm.Normalizer()
    self.parsivar_normalizer = parsivar.Normalizer()
    self.stemmer = hazm.Stemmer()
    self.lemmatizer = hazm.Lemmatizer()
    self.stop_words = set(hazm.stopwords_list(stop_words_addr))
    mask = np.array(Image.open(mask_addr)) if mask_addr is not None else None
    self.generator = WordCloud(width=size,
                               height=size,
                               include_numbers=False,
                               persian_normalize=False,
                               collocations=True,
                               mask=mask,
                               background_color='white')
def search_wikipedia(cls, word):
    page = requests.get("https://fa.wikipedia.org/wiki/" + word)
    soup = BeautifulSoup(page.content, features="html.parser")
    word_tokenized = []
    try:
        content = soup.find_all('p')
        for paragraph in content:
            word_tokenized.append(word_tokenize(paragraph.get_text()))
        # Keep alphabetic, non-stopword tokens from every paragraph.
        filtered_words = []
        for tokens in word_tokenized:
            for token in tokens:
                if token not in stopwords_list() and token.isalpha():
                    filtered_words.append(token)
        most_common_words = collections.Counter(filtered_words).most_common(10)
        return most_common_words
    except Exception:
        return "error"
from hazm import stopwords_list


def get_stopwords():
    with open("stopwords.txt", 'r') as f:
        results = f.readline().split()
    return results


stopwords = stopwords_list()
punctuation = get_stopwords()
all_stopwords = punctuation + stopwords + ["NUM"] + [
    'آقا', 'آور', 'افزا', 'باش', 'بردار', 'بست', 'بند', 'توان', 'توانست',
    'دارا', 'دان', 'ده', 'رس', 'ریخت', 'ریز', 'سال', 'سو', 'شخص', 'شو',
    'هست', 'وقت', 'کس', 'کن', 'گذار', 'گذاشت', 'گرد', 'گشت', 'گو', 'گیر',
    'یاب'
] + ['بس']
import pickle

from hazm import Normalizer, Stemmer, Lemmatizer, sent_tokenize, word_tokenize, stopwords_list

stops = set(stopwords_list())


def load_dataset(file_name, column_name='question'):
    data = pickle.load(open(file_name, "rb"))
    statements = []
    for i in range(len(data)):
        statements.append(data[i][column_name])
    return statements


def statement_pre_processing(input_statement):
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    input_statement = normalizer.normalize(input_statement)
    input_statement = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(input_statement)
        if word not in stops
    ]
    return input_statement


def dataset_cleaner(dataset):
    statements = []
    normalizer = Normalizer()
    lemmatizer = Lemmatizer()
    for i in range(len(dataset)):
zoomitComments.dtypes
zoomitComments.describe()
zoomitComments.columns
zoomitComments.head()

zoomitComments = zoomitComments.drop(
    ["ParentCommentid", "UpdateDate2", "CreateDate2", "UpdatedByUserId",
     "Name", "Email"], axis=1)
zoomitComments['Message'] = zoomitComments['Message'].astype(str)
# Strip <br /> tags left over from the HTML source.
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: reg.sub(r'<br\s*/?>', ' ', x))
zoomitComments['wordCount'] = zoomitComments["Message"].agg(
    lambda x: len(x.split(" ")))
zoomitComments['charCount'] = zoomitComments["Message"].agg(lambda x: len(x))
# Collapse runs of whitespace into a single space.
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: reg.sub(r'\s+', ' ', x))
# zoomitComments['Message']=zoomitComments['Message'].agg(lambda x: (' ').join(reg.sub('.','',[w for w in x.split() if reg.match('([\w]+\.)+[\w]+(?=[\s]|$)',w)]))

stopWords = hm.stopwords_list()
zoomitComments['#_of_StopWords'] = zoomitComments['Message'].agg(
    lambda x: len([w for w in x.split() if w in stopWords]))
stemWords = hm.Stemmer()
zoomitComments['Message'] = zoomitComments['Message'].agg(
    lambda x: (' ').join([stemWords.stem(w) for w in x.split()]))

pubComment = zoomitComments.loc[zoomitComments['Status'] == 1, :].loc[:, ['Message']]
unpubComment = zoomitComments.loc[zoomitComments['Status'] == 0, :].loc[:, ['Message']]
len(unpubComment)
zoomitComments['Status'].unique()

import matplotlib.pyplot as pPlot
from PIL import Image
import hazm as hz
from PersianStemmer import PersianStemmer
import search_engine.words_lists as wl
from itertools import combinations
import re
import string
import search_engine.configurations as config

ps = PersianStemmer()
stop_words = hz.stopwords_list()
normalizer = hz.Normalizer()
stemmer = hz.Stemmer()
lemmatizer = hz.Lemmatizer()


def process_single_document(doc_content):
    # 1: remove html tags and irrelevant contents
    cleaned_content_from_tag = remove_tags(doc_content)
    # 2: normalize text
    normalize_text = normalizer.normalize(cleaned_content_from_tag)
    # 3: tokenize
    words_token = hz.word_tokenize(normalize_text)
    config.pure_number_tokens += len(words_token)
    return words_token


def preprocess_single_word_in_query(word):
    # 1: normalized
    word = normalizer.normalize(word)
    # 2: lemmatized and stemmer
    word_lemmatized = lemmatizer.lemmatize(word)
def __init__(self):
    self.Normalizer = hazm.Normalizer()
    self.stopwords_list = hazm.stopwords_list()
    self.Stemmer = hazm.Stemmer()
import re
from typing import List

import emoji
from hazm import Normalizer, Stemmer, Lemmatizer, WordTokenizer, stopwords_list


class Preprocessor:
    normalizer = Normalizer()
    stemmer = Stemmer()
    lemmatizer = Lemmatizer()
    tokenizer = WordTokenizer()
    stop_words = stopwords_list()

    @staticmethod
    def remove_noise(text: str) -> str:
        return Preprocessor.__remove_punctuation(
            Preprocessor.__remove_emojis(text))

    @staticmethod
    def remove_stop_words(tokens: List[str]) -> List[str]:
        return [t for t in tokens if t not in Preprocessor.stop_words]

    @staticmethod
    def __remove_emojis(text: str):
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            "]+",
            flags=re.UNICODE)
        first_cleaned_text = emoji_pattern.sub(r'', text)  # no emoji
        # get_emoji_regexp() is provided by older versions of the emoji package.
        return emoji.get_emoji_regexp().sub(r'', first_cleaned_text)

    @staticmethod
    def __remove_punctuation(text: str):
        try:
            return re.sub(
                r'[\.\?\!\,\:\;\،\(\)\؛\#\%\^\&\$\~\'\"\×\-\_\*\>\<\+\=\\\/]',
                '', text)
        except TypeError as e:
            print(e, text)

    @staticmethod
    def normalize(text: str) -> str:
        return Preprocessor.normalizer.normalize(text)

    @staticmethod
    def stem(word: str) -> str:
        return Preprocessor.stemmer.stem(word)

    @staticmethod
    def lemmatize(word: str) -> str:
        return Preprocessor.lemmatizer.lemmatize(word)

    @staticmethod
    def tokenize(text: str) -> List[str]:
        return Preprocessor.tokenizer.tokenize(text)

    @staticmethod
    def preprocess(text: str) -> str:
        cleaned_text = Preprocessor.remove_noise(str(text))
        normalized_text = Preprocessor.normalize(cleaned_text)
        tokens = Preprocessor.tokenize(normalized_text)
        none_stop_words = Preprocessor.remove_stop_words(tokens)
        # stems = [Preprocessor.stem(w) for w in tokens]
        lemmatized = [Preprocessor.lemmatize(w) for w in none_stop_words]
        return ' '.join(lemmatized)
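# --- Usage sketch (illustrative, not part of the original snippet) ---
# Preprocessor.preprocess chains noise removal, normalization, tokenization,
# stopword filtering and lemmatization; the sample comment is an assumption.
print(Preprocessor.preprocess("این فیلم را خیلی دوست داشتم! 😍"))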
# Create label/category-id lookup dictionaries.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Label']].values)

# Plot how often each emoji label occurs.
fig = plt.figure(figsize=(8, 6))
df2.groupby('Label').comment.count().sort_values().plot.bar(
    ylim=0, title='Term Frequency of each Emoji \n')
plt.xlabel('\n Number of occurrences', fontsize=10)
plt.show()

# Clean the comments.
normalizer = hazm.Normalizer()
tokenizer = hazm.SentenceTokenizer()
tokens = hazm.word_tokenize
S_Words = list(hazm.stopwords_list())

# Text representation.
tfidf = TfidfVectorizer(lowercase=False,
                        preprocessor=normalizer.normalize,
                        tokenizer=tokens,
                        ngram_range=(1, 2),
                        stop_words=S_Words)
comments = df2.comment
features = tfidf.fit_transform(comments).toarray()
labels = df2.category_id

# Compare all the models.
models = [
    MultinomialNB(),
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0),
punctuations_list = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation


def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)


words = remove_punctuations(words)
# Remove URLs and @mentions.
words = re.sub(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    '', words, flags=re.MULTILINE)
words = re.sub(r"@(\w+)", ' ', words, flags=re.MULTILINE)

wordcloud = WordCloudFa(persian_normalize=True,
                        stopwords=list(STOPWORDS) + hazm.stopwords_list(),
                        include_numbers=False,
                        background_color='white',
                        width=700,
                        height=500)
frequencies = wordcloud.process_text(words)
wc = wordcloud.generate_from_frequencies(frequencies)
image = wc.to_image()
st.image(image)

# Dataframe
st.subheader('**Data**')
st.write(data)

# Random Tweet
col1, col2 = st.beta_columns(2)
with col1:
import hazm

stopwords = ['ام', 'م', 'ات', 'ای', 'ی', 'ت', 'اش', 'ش', 'مان', 'یم', 'ایم',
             'تان', 'ید', 'اید', 'شان', 'ند', 'اند', 'است', 'هست', 'بود',
             'شد', 'شو', 'باش', 'خواه', 'ها', 'های', 'ان', 'یک', 'دو', 'سه',
             'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'هستم', 'هستم',
             'هست', 'هستید', 'هستیم', 'نیستم', 'نیستی', 'نیست', 'نیستیم',
             'نیستید', 'نیستند'] + hazm.stopwords_list()[:200]

punctuations = ['.', '،', '!', '؟', '?', ':', '؛', '(', ')', '{', '}', '[',
                ']', '«', '»', '-', '/', '٪', '%', '"', "'", '،', '_', '=',
                '<', '>', '+', '@', '*', ',', ';', '&', '#', '٬', '`', '|',
                ',']

diacritics = ['\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650',
              '\u0651', '\u0652', '\u0653', '\u0654', '\u0655']

character_mapping = {
    'ا': ['ا', 'إ', 'أ', 'آ', 'ٱ'],
    'و': ['و', 'ؤ'],
    'ی': ['ی', 'ي', 'ئ'],
    'ک': ['ک', 'ك'],
    'ه': ['ه', 'ة', 'ۀ'],
}

english_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
half_spaces = ['\u200C', '\u200f', '\xa0']

multi_words_token = ['چنان چه', 'بنا بر این', 'مع ذلک', 'فی مابین',
                     'فوق العاده', 'بی شک', 'در خصوص', 'این که', 'به دلیل',
                     'به خاطر', 'بر اساس', 'از جمله', 'با توجه به',
                     'اشاره به', 'بین الملل', 'در راستا', 'در اختیار',
                     'خاطر نشان', 'ما فوق', 'بدین شکل']

digit_characters = '1234567890۱۲۳۴۵۶۷۸۹۰١٢٣٤٥٦٧٨٩'

# Heaps Law: y = 0.49 * x + 1.60
# Zipf Law: y = -1.73 * x + 8.36 ?????
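# --- Usage sketch (illustrative, not part of the original snippet) ---
# character_mapping keys are the canonical Persian letters; a flat
# variant -> canonical table can be built from it and applied with str.translate.
translation_table = str.maketrans(
    {variant: canonical
     for canonical, variants in character_mapping.items()
     for variant in variants})
print("كتاب إيران".translate(translation_table))  # -> کتاب ایران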
for j in dd['title'][i]:
    temp = j.split('\u200c')
    for q in temp:
        temp2.append(q)
dd['title'][i] = copy.deepcopy(temp2)


# ## Remove stopwords and Special Chars

# Because they are useless to the meaning of sentences and, as a result, to prices.

# In[9]:

special_chars = [
    '!', '"', '#', '(', ')', '*', ',', '-', '.', '/', '\'', '«', '»', '،',
    '؛', '؟', '.', '…', '$'
]
stopwords = hazm.stopwords_list()

# Drop stopwords and special characters from the description and title tokens.
for i in range(len(dd2)):
    for j in dd2['desc'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd2['desc'][i]):
                dd2['desc'][i].remove(j)
    for j in dd2['title'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd2['title'][i]):
                dd2['title'][i].remove(j)

for i in range(len(dd)):
    for j in dd['desc'][i]:
        if (j in special_chars) or (j in stopwords):
            while (j in dd['desc'][i]):