Code example #1
from snowballstemmer import TurkishStemmer
import json


class Infer(object):

    turkish_stemmer = TurkishStemmer()
    word2idx = {}  # filled in right after the class definition, see below

    def __init__(self):
        pass

    @staticmethod
    def load_wordIdx_txt(dict_dir):
        # the word-to-index mapping is stored on disk as a JSON dictionary
        with open(dict_dir, "r") as json_file:
            return json.load(json_file)

    @staticmethod
    def get_pred_class(doc):
        words = Infer.stemming(doc)
        print(words)
        """
        numeric_doc = [Infer.word2idx[word] for word in words]
        print(numeric_doc, len(numeric_doc))
        """

    @staticmethod
    def stemming(doc):
        # stem every whitespace-separated token with the Snowball Turkish stemmer
        words = doc.split()
        for idx in range(len(words)):
            words[idx] = Infer.turkish_stemmer.stemWord(words[idx])
        return words


# the class name cannot be referenced inside its own body, so the mapping is loaded here
Infer.word2idx = Infer.load_wordIdx_txt(
    "D:/Users/gurkan.sahin/Desktop/NLP/cnn_text_class/word2idx.txt")
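
# Usage sketch (the input sentence is invented; the word2idx.txt path above must exist):
if __name__ == "__main__":
    Infer.get_pred_class("kitapları okudum ve çok beğendim")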
Code example #2
File: utils.py  Project: ekrembal/CyberGuard
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from snowballstemmer import TurkishStemmer


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        a single string with the cleaned, stemmed tweet
    """
    stemmer = TurkishStemmer()
    stopwords_turkish = stopwords.words('turkish')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags (only the # sign; the word itself is kept)
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_turkish and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stemWord(word)  # stem the word
            tweets_clean.append(stem_word)

    return ' '.join(tweets_clean)
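
# Usage sketch (the tweet is invented; NLTK's 'stopwords' corpus must already be
# downloaded, e.g. via nltk.download('stopwords')):
print(process_tweet("RT @haberci Dolar bugün yine yükseldi! https://t.co/abc #ekonomi"))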
Code example #3
def my_form_post():
    get_article = request.form['text']
    snow = TurkishStemmer()
    get_article = get_article.lower()
    cleanr = re.compile('<.*?>')
    get_article = re.sub(cleanr, ' ', get_article)
    # inside [...] every character is already an alternative, so no '|' separators are needed
    get_article = re.sub(r'[?!:´\'"#]', r'', get_article)
    get_article = re.sub(r'[.,)´:(\\/]', r' ', get_article)

    stop_words = set(stopwords.words('turkish'))  # build the stopword set once
    words = [
        snow.stemWord(word) for word in get_article.split()
        if word not in stop_words
    ]  # Stemming and removing stopwords
    get_article = ' '.join(words)
    predicted = model.predict([get_article])[0]
    predicted = predicted.upper()
    predicted = predicted.replace("_", " ")

    return '''
        <html>
            <head>
            <link rel="stylesheet" type="text/css" href="/static/mainstyle3.css">
                <title>Tahmin Zamanı</title>
            </head>
            <body>
            <div class="container">
                <h1>Haber başlığın şununla ilgili olabilir</h1>
                <h2 class="rainbow">{}</h2>
            </div>
            </body>
        </html>'''.format(predicted)
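
# The Flask wiring around this handler is not part of this excerpt. A minimal sketch of
# the assumed setup (names are illustrative, not taken from the original project):
# from flask import Flask, request
# app = Flask(__name__)
# my_form_post = app.route('/', methods=['POST'])(my_form_post)
# 'model' is a previously trained text-classification pipeline loaded elsewhere.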
Code example #4
def _make_stem(job):
    """Stem every row of the global df["stem"] column in place.

    `df` and `_print_progress_bar` are defined elsewhere in the same module.
    """
    global df
    df_str = df["stem"].astype(str)
    turk_stemmer = TurkishStemmer()
    length = df.shape[0]
    for index in range(length):
        _print_progress_bar(index,
                            length,
                            job=job,
                            prefix=f"{job} Progress:",
                            length=50)
        words = df_str[index].split()
        words = " ".join(turk_stemmer.stemWords(words))
        # .at avoids the chained-assignment warning raised by df["stem"][index] = ...
        df.at[index, "stem"] = words
Code example #5
File: helpers.py  Project: 5l1v3r1/ML-Project
from nltk.tokenize import WordPunctTokenizer
from snowballstemmer import TurkishStemmer


def stemming_words(text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)

    turkishStemmer = TurkishStemmer()

    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))

    return ' '.join(stemmed_words)
Code example #6
import nltk  # requires the 'punkt' tokenizer models: nltk.download('punkt')
from snowballstemmer import TurkishStemmer


def run():
    turkStem = TurkishStemmer()
    input_data = input("Lütfen sorunuzu girin.")

    words = nltk.word_tokenize(input_data)
    words = [word.lower() for word in words if word.isalpha()]
    after_stem = [turkStem.stemWord(word) for word in words]
    print("AFTER SNOWBALL STEMMER: ", after_stem)

    return after_stem
Code example #7
 def stemming_words(self, text):
     wpt = WordPunctTokenizer()
     words = wpt.tokenize(text)
     turkishStemmer = TurkishStemmer()
     stemmed_words = []
     for word in words:
         stemmed_words.append(turkishStemmer.stemWord(word))
     text = ' '.join(stemmed_words)
     return text
Code example #8
def pam_turkish(input_file):
    """
    Run the PAM (Pachinko Allocation) algorithm on the given Turkish text file,
    extract n-grams, and return the topics of the file together with their subtopics.
    """
    print("importing...")
    import tomotopy as tp
    from snowballstemmer import TurkishStemmer
    from nltk.corpus import stopwords
    stemmer = TurkishStemmer()
    stops = set(stopwords.words('turkish'))
    print("preparing corpus...")
    # pass the stemming callable itself (stemmer.stemWord), not the result of calling it
    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stemWord),
                                   stopwords=lambda x: len(x) <= 2 or x in stops)

    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    with open(input_file, encoding='utf-8') as corpus_file:
        train_corpus.process(corpus_file)

    # make the PA model and train it
    print("training model...")
    mdl = tp.PAModel(k1=5, k2=25, min_cf=10, min_df=1, corpus=train_corpus, seed=42)
    for i in range(0, 100, 10):  # increase 100 for more accurate results, at the cost of more time
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    # mdl.summary()
    # save the trained model for reuse; load it later with mdl.load('trained_pam.bin')
    mdl.save('trained_pam.bin')

    # extract n-grams; max_len determines bigram or trigram (3 means trigram)
    ngrams = train_corpus.extract_ngrams(min_cf=2, max_len=3)
    for c in ngrams:
        if len(c.words) == 3:
            print(c.words[0], c.words[1], c.words[2], sep='\t')  # ngram words
    topic_result = []
    for k in range(mdl.k1):
        print("== Topic #{} ==".format(k))
        subs = []
        subs_prob = []
        sub_topics = mdl.get_sub_topics(k, top_n=5)
        for subtopic, probability in sub_topics:
            for word, p in mdl.get_topic_words(subtopic, top_n=1):
                subs.append(word)
            subs_prob.append(probability)
        for word, prob in mdl.get_topic_words(k, top_n=1):
            print(word, prob, sep='\t')
            topic_result.append({"topic": word, "prob": prob, "subs": subs, "subs_prob": subs_prob})

    return topic_result
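
# Usage sketch ("turkish_news.txt" is a hypothetical UTF-8 corpus, one document per line):
# topics = pam_turkish("turkish_news.txt")
# for t in topics:
#     print(t["topic"], t["prob"], t["subs"])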
Code example #9
 def FileOrder(self):
     kelime = TurkishStemmer()
     for i in self.fullText:
         if (i == "" or i == "\n"):
             pass
         else:
             self.parsText.append(i)
     for i in self.parsText:
         if (kelime.stemWord(i.lower()) == "kaynak"):
             self.source_indis = self.number
         if (kelime.stemWord(i.lower()) == "önsöz"):
             self.Onsoz = self.number
         if (kelime.stemWord(i.lower()) == "ekler"):
             self.IndısEk = self.number
         else:
             self.number += 1
     print("\t Toplam Boşluk Karakteri Sayısı: ", len(self.fullText) - self.number)
     print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.number)
     print("\t Kaynakca Başlangıç indisi: ", self.source_indis)
     print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
     print("\t Toplam Yapılan Atıf: ", (self.number - self.source_indis))
Code example #10
 def DosyaDuzenle(self):
     kelime = TurkishStemmer()
     for i in self.allText:
         if (i == "" or i == "\n"):
             pass
         else:
             self.parsText.append(i)
     for i in self.parsText:
         if (kelime.stemWord(i.lower()) == "kaynak"):
             self.kaynak_indis = self.sayac
         if (kelime.stemWord(i.lower()) == "önsöz"):
             self.Onsoz = self.sayac
         if (kelime.stemWord(i.lower()) == "ekler"):
             self.IndısEk = self.sayac
         else:
             self.sayac += 1
     print("\t Toplam Boşluk Karakteri Sayısı: ",
           len(self.allText) - self.sayac)
     print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.sayac)
     print("\t Kaynakca Başlangıç indisi: ", self.kaynak_indis)
     print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
     print("\t Toplam Yapılan Atıf: ", (self.sayac - self.kaynak_indis))
Code example #11
File: HomeWork_Spyder.py  Project: HSKayman/DEU_CSC
# =============================================================================
# Words whose frequency is 1 across the whole dataset are removed as part of
# feature selection
# =============================================================================

#pd.Series(" ".join(df["News"]).split()).value_counts()
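# A minimal sketch of that filtering step, kept commented out like the line above
# (it assumes df["News"] still holds the raw article strings at this point):
#word_freq = pd.Series(" ".join(df["News"]).split()).value_counts()
#rare_words = set(word_freq[word_freq == 1].index)
#df["News"] = df["News"].apply(lambda t: " ".join(w for w in t.split() if w not in rare_words))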

# =============================================================================
# Tokenizing
# =============================================================================
df = df.apply(lambda x: TextBlob(x).words)

# =============================================================================
# Stemming
# =============================================================================
stemmer = TurkishStemmer()
df = df.apply(lambda x: " ".join(stemmer.stemWord(word) for word in x))

# =============================================================================
# AddingClass   0 ekonomi     1 magazin     2 saglik     3 spor
# =============================================================================
Category = ["ekonomi" for i in range(150)]
Category.extend(["magazin" for i in range(150)])
Category.extend(["saglik" for i in range(150)])
Category.extend(["spor" for i in range(150)])
Category.extend(["ekonomi" for i in range(80)])
Category.extend(["magazin" for i in range(80)])
Category.extend(["saglik" for i in range(80)])
Category.extend(["spor" for i in range(80)])

dframe = pd.DataFrame(df, columns=["News"])
Code example #12
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import mysql.connector
import numpy as np
import re
from pandas import DataFrame
from snowballstemmer import TurkishStemmer
turkStem = TurkishStemmer()
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import pickle

# and later you can load it
#with open('filename.pkl', 'rb') as f:
# clf = pickle.load(f)

mydb = mysql.connector.connect(
    host="localhost",
    user="******",
    passwd="123hal123",
    database="comments",
)

mycursor = mydb.cursor()
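
# Sketch of the step that typically follows (table and column names here are
# hypothetical, not taken from the original script):
# mycursor.execute("SELECT comment_text, label FROM comments_table")
# df = DataFrame(mycursor.fetchall(), columns=["comment_text", "label"])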
Code example #13
File: app.py  Project: HolyPickle/machine_learning
newStop = [
    'hem', 'milyon', 'kez', 'otuz', 'beş', 'elli', 'bizi', 'da', 'sekiz', 've',
    'çok', 'bu', 'veya', 'ya', 'kırk', 'onların', 'ona', 'bana', 'yetmiş',
    'milyar', 'şunu', 'senden', 'birşeyi', 'dokuz', 'yani', 'kimi', 'şeyler',
    'kim', 'neden', 'senin', 'yedi', 'niye', 'üç', 'şey', 'mı', 'tüm',
    'onlari', 'bunda', 'ise', 'şundan', 'hep', 'şuna', 'bin', 'ben', 'ondan',
    'kimden', 'bazı', 'belki', 'ne', 'bundan', 'gibi', 'de', 'onlardan',
    'sizi', 'sizin', 'daha', 'niçin', 'şunda', 'bunu', 'beni', 'ile', 'şu',
    'şeyi', 'sizden', 'defa', 'biz', 'için', 'dahi', 'siz', 'nerde', 'kime',
    'birşey', 'birkez', 'her', 'biri', 'on', 'mü', 'diye', 'acaba', 'sen',
    'en', 'hepsi', 'bir', 'bizden', 'sanki', 'benim', 'nerede', 'onu',
    'benden', 'yüz', 'birkaç', 'çünkü', 'nasıl', 'hiç', 'katrilyon'
]
stopwords.extend(newStop)

temp = []
snow = TurkishStemmer()
for eachNew in all_news:
    eachNew.title = eachNew.title.lower()
    eachNew.content = eachNew.content.lower()  # Converting to lowercase
    cleanr = re.compile('<.*?>')
    eachNew.title = re.sub(cleanr, ' ', eachNew.title)
    eachNew.content = re.sub(cleanr, ' ',
                             eachNew.content)  # Removing HTML tags
    # inside [...] every character is already an alternative, so no '|' separators are needed
    eachNew.title = re.sub(r'[?!:´\'"#]', r'', eachNew.title)
    eachNew.content = re.sub(r'[?!´:\'"#]', r'', eachNew.content)
    eachNew.title = re.sub(r'[.,)´:(\\/]', r' ', eachNew.title)
    eachNew.content = re.sub(r'[.:´,)(\\/]', r' ',
                             eachNew.content)  # Removing Punctuations

    words = [
        snow.stemWord(word) for word in eachNew.title.split()
Code example #14
import json  # the datasets will be JSON text files.
import numpy as np
import random
import pickle  # the models will be saved as pickle files.
from tensorflow.keras.models import Sequential  # holds a linear stack of the layers in our models.
from tensorflow.keras.layers import Dense, Embedding, Dropout, Activation, GlobalAveragePooling1D  # building blocks for our layers.
from tensorflow.keras.optimizers import SGD  # used for gradient descent optimization.
import nltk  # our language processing library.
from snowballstemmer import TurkishStemmer  # extracts word stems with Turkish support.

nltk.download("punkt")  # download the nltk tokenizer models needed to split sentences into words.

with open("dataset.json") as file:  # open our dataset file.
    intents = json.load(file)  # the JSON file is loaded into the intents variable.

stemmer = TurkishStemmer()  # stemming is done with Turkish support.

words = []  # list that will hold the extracted words.
classes = []  # list that will hold the tags from our JSON file.
documents = []  # list that will hold the (pattern, tag) pairs from our JSON file.
ignore_letters = ["!", "'", "?", ",", "."]  # these punctuation marks are skipped inside sentences.

for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        word = nltk.word_tokenize(pattern)  # split the pattern sentences from our JSON file into words.
        words.extend(word)  # add the tokenized words to the list.
        print(words)
        documents.append((word, intent["tag"]))  # store the tokenized words together with their tag.
        if intent["tag"] not in classes:
            classes.append(intent["tag"])  # add the tag to the list.
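
# Sketch of the step that usually follows in this kind of chatbot pipeline (not part of
# the original excerpt): lowercase, stem and deduplicate the vocabulary, skipping the
# ignored punctuation marks, so it can later be turned into a bag-of-words.
words = sorted(set(stemmer.stemWord(w.lower()) for w in words if w not in ignore_letters))
classes = sorted(set(classes))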
Code example #15
from snowballstemmer import TurkishStemmer


class Preprocess(object):

    vocab = []
    word2idx = {}
    idx2word = {}
    turkish_stemmer = TurkishStemmer()

    def __init__(self):
        self.corpus = []
        self.label = []

    def read_corpus(self, corpus_dir, label_dir, first_k_char_stem=0):
        with open(corpus_dir, "r") as sentences:
            for __sentence in sentences:
                #stemmed_line = Preprocess.stemming(__sentence, first_k_char_stem) #first_k_char stemming
                stemmed_line = Preprocess.snowball_stemmer(__sentence)
                self.corpus.append(stemmed_line)
                for word in stemmed_line:
                    self.add_vocab(word)

        with open(label_dir, "r") as labels:
            for __label in labels:
                self.label.append(int(__label.strip()))

    @staticmethod
    def snowball_stemmer(sentence):
        # stem every whitespace-separated token with the Snowball Turkish stemmer
        words = sentence.split()
        for idx in range(len(words)):
            words[idx] = Preprocess.turkish_stemmer.stemWord(words[idx])
        return words

    @staticmethod
    def stemming(sentence, first_k_char_stem):
        # naive baseline: keep only the first k characters of each token
        words = sentence.split()
        if first_k_char_stem != 0:
            for idx in range(len(words)):
                words[idx] = words[idx][:first_k_char_stem]
        return words

    def add_vocab(self, word):
        if word not in Preprocess.vocab:
            Preprocess.vocab.append(word)
            # index 0 is reserved for the padding word
            Preprocess.word2idx[word] = len(Preprocess.vocab)
            Preprocess.idx2word[len(Preprocess.vocab)] = word

    @staticmethod
    def get_vocab():
        return Preprocess.vocab

    def get_corpus(self):
        return self.corpus

    def get_label(self):
        return self.label

    @staticmethod
    def get_word2idx():
        return Preprocess.word2idx

    @staticmethod
    def get_idx2word():
        return Preprocess.idx2word
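
# Usage sketch (the corpus and label paths are hypothetical placeholders):
# pre = Preprocess()
# pre.read_corpus("corpus.txt", "labels.txt")
# print(len(Preprocess.get_vocab()), "unique stems in the vocabulary")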
Code example #16
import re
import warnings

import nltk
from pandas import DataFrame
from nltk.corpus import stopwords as stop
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from snowballstemmer import TurkishStemmer

if False:
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
warnings.filterwarnings(action='ignore')

wpt = nltk.WordPunctTokenizer()
PorterStemmer = PorterStemmer()
SnowballStemmer = TurkishStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stop.words('turkish'))


def remove_hyperlink(sentence: str) -> str:
    """
    This method remove hyperlinks & emails & mentions  from given sentence

    Args:
         sentence: input sentence file, :type str
    Returns:
        hyperlink removed sentence
    """
    sentence = re.sub(r"\S*@\S*\s?", " ", sentence)
    sentence = re.sub(r"www\S+", " ", sentence)