Example 1
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from snowballstemmer import TurkishStemmer


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a string of stemmed tokens joined by spaces
    """
    stemmer = TurkishStemmer()
    stopwords_turkish = stopwords.words('turkish')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_turkish and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stemWord(word)  # stem the word
            tweets_clean.append(stem_word)

    return ' '.join(tweets_clean)
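A minimal usage sketch for the function above, assuming NLTK's Turkish stopword data has already been downloaded and using a made-up tweet:

import nltk
nltk.download('stopwords')  # needed once for stopwords.words('turkish')

# Hypothetical input; the result is a single space-joined string of Turkish stems.
sample = "RT @haber: #ekonomi piyasalar bugün yükselişe geçti https://t.co/abc"
print(process_tweet(sample))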
Example 2
def my_form_post():
    get_article = request.form['text']
    snow = TurkishStemmer()
    get_article = get_article.lower()
    cleanr = re.compile('<.*?>')
    get_article = re.sub(cleanr, ' ', get_article)
    get_article = re.sub(r'[?|!|:|´|\'|"|#]', r'', get_article)
    get_article = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', get_article)

    words = [
        snow.stemWord(word) for word in get_article.split()
        if word not in set(stopwords.words('turkish'))
    ]  # Stemming and removing stopwords
    get_article = ' '.join(words)
    predict = (model.predict([get_article]))
    predicted = predict[0]
    predicted = predicted.upper()
    predicted = predicted.replace("_", " ")

    return '''
        <html>
            <head>
            <link rel="stylesheet" type="text/css" href="/static/mainstyle3.css">
                <title>Tahmin Zamanı</title>
            </head>
            <body>
            <div class="container">
                <h1>Haber başlığın şununla ilgili olabilir</h1>
                <h2 class="rainbow">{}</h2>
            </div>
            </body>
        </html>'''.format(predicted)
Example 3
import nltk  # requires the 'punkt' tokenizer data: nltk.download('punkt')
from snowballstemmer import TurkishStemmer


def run():
    turkStem = TurkishStemmer()
    input_data = input("Lütfen sorunuzu girin.")  # "Please enter your question."

    words = nltk.word_tokenize(input_data)
    words = [word.lower() for word in words if word.isalpha()]
    after_stem = [turkStem.stemWord(word) for word in words]
    print("AFTER SNOWBALL STEMMER: ", after_stem)

    ##print(after_stem)
    ## print("after stem",turkStem.stemWord(a))
    ## print(turkStem.stemWord("ilişkilendiremediklerimiz, selam, gözlük , gözlem"))

    return after_stem
Example 4
from nltk.tokenize import WordPunctTokenizer
from snowballstemmer import TurkishStemmer


def stemming_words(text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)

    turkishStemmer = TurkishStemmer()

    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join(stemmed_words)

#     print (stemmed_words)

    return text
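A quick, hypothetical call for reference; the exact stems depend on the snowballstemmer version:

print(stemming_words("kitapları okuyan insanlar"))  # prints the stemmed tokens joined by spaces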
Example 5
def _make_stem(job):
    global df
    df_str = df["stem"].astype(str)
    turk_stemmer = TurkishStemmer()
    length = df.shape[0]
    for index in range(length):
        _print_progress_bar(index,
                            length,
                            job=job,
                            prefix=f"{job} Progress:",
                            length=50)
        words = df_str[index].split()
        words = " ".join(turk_stemmer.stemWords(words))
        df["stem"][index] = words
Example 6
 def stemming_words(self, text):
     wpt = WordPunctTokenizer()
     words = wpt.tokenize(text)
     turkishStemmer = TurkishStemmer()
     stemmed_words = []
     for word in words:
         stemmed_words.append(turkishStemmer.stemWord(word))
         # try:
         #     # stemmed_words.append(turkishStemmer.stemWord(word))
         #     stemmed_words.append(word[0:5])
         # except:
         #     # stemmed_words.append(turkishStemmer.stemWord(word))
         #     stemmed_words.append(word)
     text = ' '.join([str(word) for word in stemmed_words])
     return text
Example 7
def pam_turkish(input_file):
    """ 
    This function runs PAM algorithm for the specified Turkish text, extracts n-grams, returns topics with subtopics of the text file.
    """
    print("importing...")
    from snowballstemmer import TurkishStemmer
    from nltk.corpus import stopwords
    stemmer = TurkishStemmer()
    stops = set(stopwords.words('turkish'))
    print("preparing corpus...")
    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer._stem()),
                                   stopwords=lambda x: len(x) <= 2 or x in stops)

    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    train_corpus.process(open(input_file, encoding='utf-8'))

    # make PA model and train
    print("training model...")
    mdl = tp.PAModel(k1=5, k2=25, min_cf=10, min_df=1, corpus=train_corpus, seed=42)
    for i in range(0, 100, 10):  # increase 100 for more accurate results, but it will take more time
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    # mdl.summary()
    # save pam for reuse
    mdl.save('trained_pam.bin')
    # for loading use mdl.load('trained_pam.bin')

    # Creating ngrams, max_len determines bigram or trigram, 3 means trigram
    ngrams = train_corpus.extract_ngrams(min_cf=2, max_len=3)
    for c in ngrams:
        if len(c.words) == 3:
            print(c.words[0], c.words[1], c.words[2], sep='\t')  # ngram words
    topic_result = []
    for k in range(mdl.k1):
        print("== Topic #{} ==".format(k))
        subs = []
        subs_prob = []
        sub_topics = mdl.get_sub_topics(k, top_n=5)
        for subtopic, probability in sub_topics:
            for word, p in mdl.get_topic_words(subtopic, top_n=1):
                subs.append(word)
            subs_prob.append(probability)
        for word, prob in mdl.get_topic_words(k, top_n=1):
            print(word, prob, sep='\t')
            topic_result.append({"topic": word, "prob": prob, "subs": subs, "subs_prob": subs_prob})

    return topic_result
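A hedged usage sketch for pam_turkish; "haberler.txt" is a hypothetical UTF-8 file with one document per line, and tomotopy plus NLTK's Turkish stopwords are assumed to be installed:

topics = pam_turkish("haberler.txt")
for t in topics:
    print(t["topic"], t["prob"], t["subs"], t["subs_prob"])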
Example 8
from snowballstemmer import TurkishStemmer


class Infer(object):

    turkish_stemmer = TurkishStemmer()

    def load_wordIdx_txt(dict_dir):
        import json
        with open(dict_dir, "r") as json_file:
            return json.load(json_file)

    # built once at class-definition time; load_wordIdx_txt is defined above
    # so it can be called here without raising a NameError
    word2idx = load_wordIdx_txt(
        "D:/Users/gurkan.sahin/Desktop/NLP/cnn_text_class/word2idx.txt")

    def __init__(self):
        pass

    def get_pred_class(doc):
        words = Infer.stemming(doc)
        print(words)
        """
        numeric_doc = [Infer.word2idx[word] for word in words]
        print(numeric_doc, len(numeric_doc))
        """

    def stemming(doc):
        words = doc.split()
        for idx in range(len(words)):
            words[idx] = Infer.turkish_stemmer.stemWord(words[idx])
        return words
Example 9
 def FileOrder(self):
     kelime = TurkishStemmer()
     for i in self.fullText:
         if (i == "" or i == "\n"):
             pass
         else:
             self.parsText.append(i)
     for i in self.parsText:
         if (kelime.stemWord(i.lower()) == "kaynak"):
             self.source_indis = self.number
         if (kelime.stemWord(i.lower()) == "önsöz"):
             self.Onsoz = self.number
         if (kelime.stemWord(i.lower()) == "ekler"):
             self.IndısEk = self.number
         else:
             self.number += 1
     print("\t Toplam Boşluk Karakteri Sayısı: ", len(self.fullText) - self.number)
     print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.number)
     print("\t Kaynakca Başlangıç indisi: ", self.source_indis)
     print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
     print("\t Toplam Yapılan Atıf: ", (self.number - self.source_indis))
Example 10
 def DosyaDuzenle(self):
     kelime = TurkishStemmer()
     for i in self.allText:
         if (i == "" or i == "\n"):
             pass
         else:
             self.parsText.append(i)
     for i in self.parsText:
         if (kelime.stemWord(i.lower()) == "kaynak"):
             self.kaynak_indis = self.sayac
         if (kelime.stemWord(i.lower()) == "önsöz"):
             self.Onsoz = self.sayac
         if (kelime.stemWord(i.lower()) == "ekler"):
             self.IndısEk = self.sayac
         else:
             self.sayac += 1
     print("\t Toplam Boşluk Karakteri Sayısı: ",
           len(self.allText) - self.sayac)
     print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.sayac)
     print("\t Kaynakca Başlangıç indisi: ", self.kaynak_indis)
     print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
     print("\t Toplam Yapılan Atıf: ", (self.sayac - self.kaynak_indis))
Example 11
    'hem', 'milyon', 'kez', 'otuz', 'beş', 'elli', 'bizi', 'da', 'sekiz', 've',
    'çok', 'bu', 'veya', 'ya', 'kırk', 'onların', 'ona', 'bana', 'yetmiş',
    'milyar', 'şunu', 'senden', 'birşeyi', 'dokuz', 'yani', 'kimi', 'şeyler',
    'kim', 'neden', 'senin', 'yedi', 'niye', 'üç', 'şey', 'mı', 'tüm',
    'onlari', 'bunda', 'ise', 'şundan', 'hep', 'şuna', 'bin', 'ben', 'ondan',
    'kimden', 'bazı', 'belki', 'ne', 'bundan', 'gibi', 'de', 'onlardan',
    'sizi', 'sizin', 'daha', 'niçin', 'şunda', 'bunu', 'beni', 'ile', 'şu',
    'şeyi', 'sizden', 'defa', 'biz', 'için', 'dahi', 'siz', 'nerde', 'kime',
    'birşey', 'birkez', 'her', 'biri', 'on', 'mü', 'diye', 'acaba', 'sen',
    'en', 'hepsi', 'bir', 'bizden', 'sanki', 'benim', 'nerede', 'onu',
    'benden', 'yüz', 'birkaç', 'çünkü', 'nasıl', 'hiç', 'katrilyon'
]
stopwords.extend(newStop)

temp = []
snow = TurkishStemmer()
for eachNew in all_news:
    eachNew.title = eachNew.title.lower()
    eachNew.content = eachNew.content.lower()  # Converting to lowercase
    cleanr = re.compile('<.*?>')
    eachNew.title = re.sub(cleanr, ' ', eachNew.title)
    eachNew.content = re.sub(cleanr, ' ',
                             eachNew.content)  # Removing HTML tags
    eachNew.title = re.sub(r'[?|!|:|´|\'|"|#]', r'', eachNew.title)
    eachNew.content = re.sub(r'[?|!|´|:|\'|"|#]', r'', eachNew.content)
    eachNew.title = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', eachNew.title)
    eachNew.content = re.sub(r'[.|:|´|,|)|(|\|/]', r' ',
                             eachNew.content)  # Removing Punctuations

    words = [
        snow.stemWord(word) for word in eachNew.title.split()
Example 12
from snowballstemmer import TurkishStemmer

snow = TurkishStemmer()

s1 = "değiştir"

s2 = "istemiyorum"

print(snow.stemWord(s1))

print(snow.stemWord(s2))
import multiprocessing
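For comparison, stemWords accepts a list of tokens and returns a list of stems; a small sketch reusing the same stemmer instance:

tokens = "istemiyorum değiştirdiler gözlükler".split()
print(snow.stemWords(tokens))  # one stem per input token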
Example 13
from snowballstemmer import TurkishStemmer

tr_stemmer = TurkishStemmer()

text = "Merhaba selam alperen"

stemmed_words = tr_stemmer.stemWords(text)

my_data = [" ".join(a) for a in stemmed_words]

print(my_data)



Example 14
# -*- coding: utf-8 -*-
"""NLP_TR_LDA_MULTICORE.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/116UKkvGAYKyopDukBoSInwrBhJ5wLHh5
"""

#https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925
"""# Turkish Stemmer Test"""

from snowballstemmer import TurkishStemmer
turkStem = TurkishStemmer()
turkStem.stemWord("gelmişti")
"""# Import Libraries and Load Data"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import warnings
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
WPT = nltk.WordPunctTokenizer()
stop_word_list = nltk.corpus.stopwords.words('turkish')
Example 15
import json  # our datasets will be JSON text files.
import numpy as np
import random
import pickle  # the models will be saved as pickle files.
from tensorflow.keras.models import Sequential  # we keep a linear stack of the layers in our model.
from tensorflow.keras.layers import Dense, Embedding, Dropout, Activation, GlobalAveragePooling1D  # building blocks needed for our layers.
from tensorflow.keras.optimizers import SGD  # used for gradient descent optimization.
import nltk  # our language-processing library.
from snowballstemmer import TurkishStemmer  # we will extract word stems with Turkish support.

nltk.download("punkt")  # download the nltk data needed to split sentences into words.

with open("dataset.json") as file:  # open our dataset file.
    intents = json.load(file)  # the JSON file is loaded into the intents variable.

stemmer = TurkishStemmer()  # stemming is done with Turkish support.

words = []  # list that will hold the extracted words.
classes = []  # list that will hold the tags from our JSON file.
documents = []  # list that will hold the tag/pattern pairs from our JSON file.
ignore_letters = ["!", "'", "?", ",", "."]  # punctuation marks to skip inside sentences.

for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        word = nltk.word_tokenize(pattern)  # split the pattern sentences from the JSON file into words.
        words.extend(word)  # add the tokenized words to the list.
        print(words)
        documents.append((word, intent["tag"]))  # add the token list together with the tag it belongs to.
        if intent["tag"] not in classes:
            classes.append(intent["tag"])  # add the tag to the list.
Example 16
!pip install gensim
import gensim
from gensim.utils import simple_preprocess
#from gensim.parsing.preprocessing import STOPWORDS           #Does not support Turkish yet.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

import nltk
nltk.download('wordnet')

import pandas as pd
from snowballstemmer import TurkishStemmer
stemmer=TurkishStemmer()

"""# Tokenizing and Stemmin Functions"""

def lemmatize_stemming(text):
    return stemmer.stemWord(text)       #lemmitize was removed because it is not working in turkish

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in stop_word_list and len(token) > 3:
            result.append(lemmatize_stemming(token))
     
    return result
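A hedged usage sketch for preprocess; stop_word_list is assumed to be the NLTK Turkish stopword list that the notebook defines elsewhere:

import nltk
nltk.download('stopwords')
stop_word_list = nltk.corpus.stopwords.words('turkish')

print(preprocess("Piyasalardaki dalgalanma yatırımcıları endişelendiriyor"))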
Example 17
import pandas as pd
import numpy as np
from snowballstemmer import TurkishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

###################################################################

turkStem = TurkishStemmer()

df = pd.read_excel("istanbul_sozlesmesi_prep.xlsx")
df.drop(["fav_count"], axis=1, inplace=True)

# Vectorize the texts with the Bag of Words approach
cv = CountVectorizer()
word_vector = cv.fit_transform(df["text"].apply(
    lambda x: " ".join([turkStem.stemWord(i) for i in x.split()])))

# Assign the 600 labeled rows to X and y for use as the training and test sets
X = word_vector[:600, :]
y = df["category"].head(600)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Example 18
stopwords = list(stopwords.words('turkish'))
newStop = [
    'bir', 'ol', 'ola', 'belki', 'olur', 'bugün', 'yarın', 'şimdi', 'mu', 'onlar',
    'seksen', 'ama', 'trilyon', 'buna', 'bizim', 'şeyden', 'yirmi', 'altı', 'iki',
    'seni', 'doksan', 'dört', 'bunun', 'ki', 'nereye', 'altmış', 'hem', 'milyon',
    'kez', 'otuz', 'beş', 'elli', 'bizi', 'da', 'sekiz', 've', 'çok', 'bu', 'veya',
    'ya', 'kırk', 'onların', 'ona', 'bana', 'yetmiş', 'milyar', 'şunu', 'senden',
    'birşeyi', 'dokuz', 'yani', 'kimi', 'şeyler', 'kim', 'neden', 'senin', 'yedi',
    'niye', 'üç', 'şey', 'mı', 'tüm', 'onlari', 'bunda', 'ise', 'şundan', 'hep',
    'şuna', 'bin', 'ben', 'ondan', 'kimden', 'bazı', 'belki', 'ne', 'bundan',
    'gibi', 'de', 'onlardan', 'sizi', 'sizin', 'daha', 'niçin', 'şunda', 'bunu',
    'beni', 'ile', 'şu', 'şeyi', 'sizden', 'defa', 'biz', 'için', 'dahi', 'siz',
    'nerde', 'kime', 'birşey', 'birkez', 'her', 'biri', 'on', 'mü', 'diye',
    'acaba', 'sen', 'en', 'hepsi', 'bir', 'bizden', 'sanki', 'benim', 'nerede',
    'onu', 'benden', 'yüz', 'birkaç', 'çünkü', 'nasıl', 'hiç', 'katrilyon'
]
stopwords.extend(newStop)
while True:
    print("Enter article:")
    X = input()
    if (X == '0'):
        break
    snow = TurkishStemmer()
    X = X.lower()
    cleanr = re.compile('<.*?>')
    X = re.sub(cleanr, ' ', X)
    X = re.sub(r'[?|!|:|´|\'|"|#]', r'', X)
    X = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', X)
    words = [snow.stemWord(word) for word in X.split() if
             word not in stopwords]  # Stemming and removing stopwords
    X = ' '.join(words)

    text = X
    # Create and generate a word cloud image:
    wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(text)
    # Display the generated image:
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
Example 19
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import mysql.connector
import numpy as np
import re
from pandas import DataFrame
from snowballstemmer import TurkishStemmer
turkStem = TurkishStemmer()
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import pickle

# and later you can load it
#with open('filename.pkl', 'rb') as f:
# clf = pickle.load(f)

mydb = mysql.connector.connect(
    host="localhost",
    user="******",
    passwd="123hal123",
    database="comments",
)

mycursor = mydb.cursor()
Example 20
import tensorflow as tf
from snowballstemmer import TurkishStemmer
import numpy as np
import random
import json
import nltk
#import requests
#import bs4

# nltk.download('punkt')

# Load the Covid-19 text dataset stored as a JSON file
with open(r"covidDataset.json", encoding="utf8") as file:
    data = json.load(file)

# Variable definitions
stemmer = TurkishStemmer()
words = []
labels = []
docs_x = []
docs_y = []
tag = " "
global cevap
# Split the sentences into words and collect their tags
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
Example 21
import re
import warnings

import nltk
from pandas import DataFrame
from nltk.corpus import stopwords as stop
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from snowballstemmer import TurkishStemmer

if False:
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
warnings.filterwarnings(action='ignore')

wpt = nltk.WordPunctTokenizer()
PorterStemmer = PorterStemmer()
SnowballStemmer = TurkishStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stop.words('turkish'))


def remove_hyperlink(sentence: str) -> str:
    """
    This method remove hyperlinks & emails & mentions  from given sentence

    Args:
         sentence: input sentence file, :type str
    Returns:
        hyperlink removed sentence
    """
    sentence = re.sub(r"\S*@\S*\s?", " ", sentence)
    sentence = re.sub(r"www\S+", " ", sentence)
Example 22
class Preprocess(object):
    
    vocab = []
    word2idx = {}
    idx2word = {}
    turkish_stemmer = TurkishStemmer()
    
    
    def __init__(self):
        self.corpus = []
        self.label = []
        
        
        
    def read_corpus(self, corpus_dir, label_dir, first_k_char_stem=0):
        with open(corpus_dir, "r") as sentences:
            for __sentence in sentences:
                #stemmed_line = Preprocess.stemming(__sentence, first_k_char_stem) #first_k_char stemming
                stemmed_line = Preprocess.snowball_stemmer(__sentence)
                self.corpus.append(stemmed_line)
                [self.add_vocab(word) for word in stemmed_line]
        
        with open(label_dir, "r") as labels:
            for __label in labels:
                self.label.append(int(__label.strip()))
        
        


    def snowball_stemmer(sentence):
        words = sentence.split()
        for idx in range(len(words)):
            words[idx] = Preprocess.turkish_stemmer.stemWord(words[idx])
        return words
    



    def stemming(sentence, first_k_char_stem):
        words = sentence.split()
        if first_k_char_stem != 0:
            for idx in range(len(words)):
                words[idx] = words[idx][:first_k_char_stem]
                
        return words

    
    
    
    def add_vocab(self, word):
        if word not in Preprocess.vocab:
            Preprocess.vocab.append(word)
            """
            0 index for padding word
            """
            Preprocess.word2idx[word] = len(Preprocess.vocab)
            Preprocess.idx2word[len(Preprocess.vocab)] = word
    
    
    
    def get_vocab():
        return Preprocess.vocab
    
    def get_corpus(self):
        return self.corpus
    
    def get_label(self):
        return self.label
    
    def get_word2idx():
        return Preprocess.word2idx
    
    def get_idx2word():
        return Preprocess.idx2word
Example 23
clean_text = " ".join(list_without_punct)
logging.info('Preprocessing has finished')

print('unique word count: ', len(set(clean_text.split())))
print('whole word count: ', len(clean_text.split()))

logging.info('Tokenize words')
words = tokenize.word_tokenize(clean_text)

nltk.download('stopwords')
stop_word_list = nltk.corpus.stopwords.words('turkish')
filtered_words = [token for token in words if token not in stop_word_list]

logging.info('Stemming words')
from snowballstemmer import TurkishStemmer
turkStem = TurkishStemmer()
stemmed_clean_text = []
for w in filtered_words:
    stemmed_clean_text.append(turkStem.stemWord(w))

logging.info('Convert list into list of list for word2Vec')
list_of_list = [[x] for x in stemmed_clean_text]

#CBOW Model
logging.info('Cbow Model will be trained')
cbowModel = gensim.models.Word2Vec(list_of_list,
                                   size=100,
                                   window=2,
                                   min_count=1,
                                   workers=4,
                                   sg=0)
Example 24
# =============================================================================
# If a word's frequency is 1 across the whole dataset, it should be removed as part of feature selection
# =============================================================================

#pd.Series(" ".join(df["News"]).split()).value_counts()

# =============================================================================
# Tokenizing
# =============================================================================
df = df.apply(lambda x: TextBlob(x).words)

# =============================================================================
# Stemming
# =============================================================================
stemmer = TurkishStemmer()
df = df.apply(lambda x: " ".join(stemmer.stemWord(word) for word in x))

# =============================================================================
# Adding class labels:   0 ekonomi     1 magazin     2 saglik     3 spor
# =============================================================================
Category = ["ekonomi" for i in range(150)]
Category.extend(["magazin" for i in range(150)])
Category.extend(["saglik" for i in range(150)])
Category.extend(["spor" for i in range(150)])
Category.extend(["ekonomi" for i in range(80)])
Category.extend(["magazin" for i in range(80)])
Category.extend(["saglik" for i in range(80)])
Category.extend(["spor" for i in range(80)])

dframe = pd.DataFrame(df, columns=["News"])
Example 25
!pip install gensim
import gensim
from gensim.utils import simple_preprocess
#from gensim.parsing.preprocessing import STOPWORDS           #Does not support Turkish yet.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

import nltk
nltk.download('wordnet')

"""**Checking Stemmer**"""

import pandas as pd
from snowballstemmer import TurkishStemmer
stemmer=TurkishStemmer()
original_words = ['Başarılı', 'insanlar', 'adamlar', 'öldüler', 'içindekiler','kapısındaki', 'yiyecekler,', 'çıkaranlar', 
           'lahanalar', 'takımların','sırası', 'futbolcuların', 'yedikleri']
singles = [stemmer.stemWord(plural) for plural in original_words]

pd.DataFrame(data={'original word':original_words, 'stemmed':singles })

"""**Stemming and Tokenizing Functions**"""

def lemmatize_stemming(text):       # Lemmatizing is removed because it is not appropriate for Turkish
    return stemmer.stemWord(text)

# Tokenize and stem
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :