class Infer(object):
    turkish_stemmer = TurkishStemmer()
    word2idx = None  # populated right after the class body, see below

    def __init__(self):
        pass

    def get_pred_class(doc):
        words = Infer.stemming(doc)
        print(words)
        """
        numeric_doc = [Infer.word2idx[word] for word in words]
        print(numeric_doc, len(numeric_doc))
        """

    def load_wordIdx_txt(dict_dir):
        import json
        with open(dict_dir, "r") as json_file:
            return json.load(json_file)

    def stemming(doc):
        words = doc.split()
        for idx in range(len(words)):
            words[idx] = Infer.turkish_stemmer.stemWord(words[idx])
        return words


# The name Infer is not bound while its own class body executes, so the
# word-to-index mapping is loaded once the class object exists.
Infer.word2idx = Infer.load_wordIdx_txt(
    "D:/Users/gurkan.sahin/Desktop/NLP/cnn_text_class/word2idx.txt")
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        a single string of the cleaned, stemmed tweet tokens joined by spaces
    """
    stemmer = TurkishStemmer()
    stopwords_turkish = stopwords.words('turkish')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags (only the hash # sign, the word itself is kept)
    tweet = re.sub(r'#', '', tweet)
    # tokenize the tweet
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_turkish and      # remove stopwords
                word not in string.punctuation):   # remove punctuation
            stem_word = stemmer.stemWord(word)     # stem the word
            tweets_clean.append(stem_word)

    return ' '.join(tweets_clean)
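# A minimal, hypothetical call of process_tweet; it assumes the same imports as
# the function itself (re, string, nltk's TweetTokenizer and Turkish stopwords,
# snowballstemmer's TurkishStemmer) plus the downloaded NLTK stopwords corpus.
# The tweet text is made up, and the exact stems depend on the stemmer version.
sample_tweet = "RT @haber: Dolar bugün yükseldi! https://example.com #ekonomi"
print(process_tweet(sample_tweet))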
def my_form_post():
    get_article = request.form['text']
    snow = TurkishStemmer()
    get_article = get_article.lower()
    cleanr = re.compile('<.*?>')
    get_article = re.sub(cleanr, ' ', get_article)
    get_article = re.sub(r'[?|!|:|´|\'|"|#]', r'', get_article)
    get_article = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', get_article)
    words = [
        snow.stemWord(word) for word in get_article.split()
        if word not in set(stopwords.words('turkish'))
    ]  # stemming and removing stopwords
    get_article = ' '.join(words)
    predict = model.predict([get_article])
    predicted = predict[0]
    predicted = predicted.upper()
    predicted = predicted.replace("_", " ")
    return '''
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="/static/mainstyle3.css">
            <title>Tahmin Zamanı</title>
        </head>
        <body>
            <div class="container">
                <h1>Haber başlığın şununla ilgili olabilir</h1>
                <h2 class="rainbow">{}</h2>
            </div>
        </body>
    </html>'''.format(predicted)
def _make_stem(job):
    global df
    df_str = df["stem"].astype(str)
    turk_stemmer = TurkishStemmer()
    length = df.shape[0]
    for index in range(length):
        _print_progress_bar(index, length, job=job, prefix=f"{job} Progress:", length=50)
        words = df_str[index].split()
        words = " ".join(turk_stemmer.stemWords(words))
        # .loc avoids pandas' chained-assignment warning when writing back
        df.loc[index, "stem"] = words
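# Quick illustrative check of the two snowballstemmer calls used above: stemWord
# stems a single token, stemWords stems a list of tokens. The sample words are
# arbitrary; the printed stems depend on the stemmer's rules.
from snowballstemmer import TurkishStemmer

_stemmer = TurkishStemmer()
print(_stemmer.stemWord("kitaplar"))                # one token  -> its stem
print(_stemmer.stemWords(["kitaplar", "evlerde"]))  # token list -> list of stems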
def stemming_words(text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join([str(word) for word in stemmed_words])
    # print(stemmed_words)
    return text
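# Hypothetical one-line call of stemming_words (assumes nltk's WordPunctTokenizer
# and snowballstemmer's TurkishStemmer are imported as in the surrounding module);
# the sample sentence is made up.
print(stemming_words("Çocuklar bahçede oyun oynuyorlar"))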
def run():
    turkStem = TurkishStemmer()
    input_data = input("Lütfen sorunuzu girin.")
    words = nltk.word_tokenize(input_data)
    words = [word.lower() for word in words if word.isalpha()]
    after_stem = [turkStem.stemWord(word) for word in words]
    print("AFTER SNOWBALL STEMMER: ", after_stem)
    ## print(after_stem)
    ## print("after stem", turkStem.stemWord(a))
    ## print(turkStem.stemWord("ilişkilendiremediklerimiz, selam, gözlük , gözlem"))
    return after_stem
def stemming_words(self, text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
        # try:
        #     # stemmed_words.append(turkishStemmer.stemWord(word))
        #     stemmed_words.append(word[0:5])
        # except:
        #     # stemmed_words.append(turkishStemmer.stemWord(word))
        #     stemmed_words.append(word)
    text = ' '.join([str(word) for word in stemmed_words])
    return text
def pam_turkish(input_file):
    """
    Runs the PAM algorithm on the given Turkish text file, extracts n-grams,
    and returns the topics and subtopics of the text.
    """
    print("importing...")
    from snowballstemmer import TurkishStemmer
    from nltk.corpus import stopwords
    stemmer = TurkishStemmer()
    stops = set(stopwords.words('turkish'))

    print("preparing corpus...")
    # SimpleTokenizer expects a callable stemmer, so the bound stemWord method is passed
    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stemWord),
                                   stopwords=lambda x: len(x) <= 2 or x in stops)
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    train_corpus.process(open(input_file, encoding='utf-8'))

    # make the PA model and train it
    print("training model...")
    mdl = tp.PAModel(k1=5, k2=25, min_cf=10, min_df=1, corpus=train_corpus, seed=42)
    for i in range(0, 100, 10):  # increase 100 for more accurate results, but it will take more time
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    # mdl.summary()

    # save the PAM model for reuse; load it later with mdl.load('trained_pam.bin')
    mdl.save('trained_pam.bin')

    # create n-grams; max_len determines bigram or trigram (3 means trigram)
    ngrams = train_corpus.extract_ngrams(min_cf=2, max_len=3)
    for c in ngrams:
        if len(c.words) == 3:
            print(c.words[0], c.words[1], c.words[2], sep='\t')  # ngram words

    topic_result = []
    for k in range(mdl.k1):
        print("== Topic #{} ==".format(k))
        subs = []
        subs_prob = []
        sub_topics = mdl.get_sub_topics(k, top_n=5)
        for subtopic, probability in sub_topics:
            for word, p in mdl.get_topic_words(subtopic, top_n=1):
                subs.append(word)
            subs_prob.append(probability)
        for word, prob in mdl.get_topic_words(k, top_n=1):
            print(word, prob, sep='\t')
            topic_result.append({"topic": word, "prob": prob, "subs": subs, "subs_prob": subs_prob})
    return topic_result
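# A minimal sketch of calling pam_turkish, assuming tomotopy is imported as tp
# in the same module (as the function body requires) and that "haberler.txt" is
# a hypothetical UTF-8 file with one Turkish document per line.
import tomotopy as tp

topics = pam_turkish("haberler.txt")
for topic in topics:
    print(topic["topic"], topic["prob"], topic["subs"])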
def FileOrder(self):
    kelime = TurkishStemmer()
    for i in self.fullText:
        if (i == "" or i == "\n"):
            pass
        else:
            self.parsText.append(i)
    for i in self.parsText:
        if (kelime.stemWord(i.lower()) == "kaynak"):
            self.source_indis = self.number
        if (kelime.stemWord(i.lower()) == "önsöz"):
            self.Onsoz = self.number
        if (kelime.stemWord(i.lower()) == "ekler"):
            self.IndısEk = self.number
        else:
            self.number += 1
    print("\t Toplam Boşluk Karakteri Sayısı: ", len(self.fullText) - self.number)
    print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.number)
    print("\t Kaynakca Başlangıç indisi: ", self.source_indis)
    print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
    print("\t Toplam Yapılan Atıf: ", (self.number - self.source_indis))
def DosyaDuzenle(self):
    kelime = TurkishStemmer()
    for i in self.allText:
        if (i == "" or i == "\n"):
            pass
        else:
            self.parsText.append(i)
    for i in self.parsText:
        if (kelime.stemWord(i.lower()) == "kaynak"):
            self.kaynak_indis = self.sayac
        if (kelime.stemWord(i.lower()) == "önsöz"):
            self.Onsoz = self.sayac
        if (kelime.stemWord(i.lower()) == "ekler"):
            self.IndısEk = self.sayac
        else:
            self.sayac += 1
    print("\t Toplam Boşluk Karakteri Sayısı: ", len(self.allText) - self.sayac)
    print("\t Boşluk karakteri olmadan toplam satır sayısı: ", self.sayac)
    print("\t Kaynakca Başlangıç indisi: ", self.kaynak_indis)
    print("\t Onsoz Başlangıç indisi: ", self.Onsoz)
    print("\t Toplam Yapılan Atıf: ", (self.sayac - self.kaynak_indis))
# =============================================================================
# Words whose frequency across the whole dataset is 1 should be removed as part
# of feature selection
# =============================================================================
# pd.Series(" ".join(df["News"]).split()).value_counts()

# =============================================================================
# Tokenizing
# =============================================================================
df = df.apply(lambda x: TextBlob(x).words)

# =============================================================================
# Stemming
# =============================================================================
stemmer = TurkishStemmer()
df = df.apply(lambda x: " ".join(stemmer.stemWord(word) for word in x))

# =============================================================================
# Adding class labels: 0 ekonomi, 1 magazin, 2 saglik, 3 spor
# =============================================================================
Category = ["ekonomi" for i in range(150)]
Category.extend(["magazin" for i in range(150)])
Category.extend(["saglik" for i in range(150)])
Category.extend(["spor" for i in range(150)])
Category.extend(["ekonomi" for i in range(80)])
Category.extend(["magazin" for i in range(80)])
Category.extend(["saglik" for i in range(80)])
Category.extend(["spor" for i in range(80)])

dframe = pd.DataFrame(df, columns=["News"])
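# Self-contained toy version of the tokenize -> stem pipeline above, on a
# made-up two-headline Series (pandas, TextBlob and snowballstemmer as in the
# original script; TextBlob's .words may need the NLTK punkt data installed).
import pandas as pd
from textblob import TextBlob
from snowballstemmer import TurkishStemmer

toy = pd.Series(["Borsa bugün yükseldi", "Takım deplasmanda kazandı"])
toy = toy.apply(lambda x: TextBlob(x).words)                          # tokenize
_stemmer = TurkishStemmer()
toy = toy.apply(lambda x: " ".join(_stemmer.stemWord(w) for w in x))  # stem and re-join
print(toy)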
import pandas as pd
import mysql.connector
import numpy as np
import re
from pandas import DataFrame
from snowballstemmer import TurkishStemmer

turkStem = TurkishStemmer()

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import pickle

# and later you can load it
# with open('filename.pkl', 'rb') as f:
#     clf = pickle.load(f)

mydb = mysql.connector.connect(
    host="localhost",
    user="******",
    passwd="123hal123",
    database="comments",
)
mycursor = mydb.cursor()
    'hem', 'milyon', 'kez', 'otuz', 'beş', 'elli', 'bizi', 'da', 'sekiz', 've',
    'çok', 'bu', 'veya', 'ya', 'kırk', 'onların', 'ona', 'bana', 'yetmiş',
    'milyar', 'şunu', 'senden', 'birşeyi', 'dokuz', 'yani', 'kimi', 'şeyler',
    'kim', 'neden', 'senin', 'yedi', 'niye', 'üç', 'şey', 'mı', 'tüm', 'onlari',
    'bunda', 'ise', 'şundan', 'hep', 'şuna', 'bin', 'ben', 'ondan', 'kimden',
    'bazı', 'belki', 'ne', 'bundan', 'gibi', 'de', 'onlardan', 'sizi', 'sizin',
    'daha', 'niçin', 'şunda', 'bunu', 'beni', 'ile', 'şu', 'şeyi', 'sizden',
    'defa', 'biz', 'için', 'dahi', 'siz', 'nerde', 'kime', 'birşey', 'birkez',
    'her', 'biri', 'on', 'mü', 'diye', 'acaba', 'sen', 'en', 'hepsi', 'bir',
    'bizden', 'sanki', 'benim', 'nerede', 'onu', 'benden', 'yüz', 'birkaç',
    'çünkü', 'nasıl', 'hiç', 'katrilyon'
]
stopwords.extend(newStop)

temp = []
snow = TurkishStemmer()
for eachNew in all_news:
    eachNew.title = eachNew.title.lower()
    eachNew.content = eachNew.content.lower()               # convert to lowercase
    cleanr = re.compile('<.*?>')
    eachNew.title = re.sub(cleanr, ' ', eachNew.title)
    eachNew.content = re.sub(cleanr, ' ', eachNew.content)  # remove HTML tags
    eachNew.title = re.sub(r'[?|!|:|´|\'|"|#]', r'', eachNew.title)
    eachNew.content = re.sub(r'[?|!|´|:|\'|"|#]', r'', eachNew.content)
    eachNew.title = re.sub(r'[.|,|)|´|:|(|\|/]', r' ', eachNew.title)
    eachNew.content = re.sub(r'[.|:|´|,|)|(|\|/]', r' ', eachNew.content)  # remove punctuation
    words = [
        snow.stemWord(word) for word in eachNew.title.split()
import json    # our datasets will be JSON text files
import numpy as np
import random
import pickle  # trained models will be saved as pickle files
from tensorflow.keras.models import Sequential  # holds a linear stack of the layers in our models
from tensorflow.keras.layers import Dense, Embedding, Dropout, Activation, GlobalAveragePooling1D  # building blocks for our layers
from tensorflow.keras.optimizers import SGD    # used for gradient-descent optimization
import nltk                                    # our language-processing library
from snowballstemmer import TurkishStemmer     # extracts word stems with Turkish support

nltk.download("punkt")  # download the nltk tokenizer data needed to split sentences into words

with open("dataset.json") as file:  # open our dataset file
    intents = json.load(file)       # the JSON file is loaded into the intents variable

stemmer = TurkishStemmer()  # stemming is done with Turkish support

words = []      # list that will hold the extracted words
classes = []    # list that will hold the tags from the JSON file
documents = []  # list that will hold (pattern words, tag) pairs from the JSON file
ignore_letters = ["!", "'", "?", ",", "."]  # punctuation marks to skip inside sentences

for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        word = nltk.word_tokenize(pattern)  # split the pattern sentences into words
        words.extend(word)                  # add the tokens to the word list
        print(words)
        documents.append((word, intent["tag"]))  # store the token list together with its tag
        if intent["tag"] not in classes:
            classes.append(intent["tag"])        # add the tag to the class list
class Preprocess(object):
    vocab = []
    word2idx = {}
    idx2word = {}
    turkish_stemmer = TurkishStemmer()

    def __init__(self):
        self.corpus = []
        self.label = []

    def read_corpus(self, corpus_dir, label_dir, first_k_char_stem=0):
        with open(corpus_dir, "r") as sentences:
            for __sentence in sentences:
                # stemmed_line = Preprocess.stemming(__sentence, first_k_char_stem)  # first_k_char stemming
                stemmed_line = Preprocess.snowball_stemmer(__sentence)
                self.corpus.append(stemmed_line)
                [self.add_vocab(word) for word in stemmed_line]
        with open(label_dir, "r") as labels:
            for __label in labels:
                self.label.append(int(__label.strip()))

    def snowball_stemmer(sentence):
        words = sentence.split()
        for idx in range(len(words)):
            words[idx] = Preprocess.turkish_stemmer.stemWord(words[idx])
        return words

    def stemming(sentence, first_k_char_stem):
        words = sentence.split()
        if first_k_char_stem != 0:
            for idx in range(len(words)):
                words[idx] = words[idx][:first_k_char_stem]
        return words

    def add_vocab(self, word):
        if word not in Preprocess.vocab:
            Preprocess.vocab.append(word)
            # index 0 is reserved for the padding word
            Preprocess.word2idx[word] = len(Preprocess.vocab)
            Preprocess.idx2word[len(Preprocess.vocab)] = word

    def get_vocab():
        return Preprocess.vocab

    def get_corpus(self):
        return self.corpus

    def get_label(self):
        return self.label

    def get_word2idx():
        return Preprocess.word2idx

    def get_idx2word():
        return Preprocess.idx2word
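# Hypothetical usage sketch for the Preprocess class above; "corpus.txt" and
# "labels.txt" (one sentence / one integer label per line) are made-up file names.
pre = Preprocess()
pre.read_corpus("corpus.txt", "labels.txt")
print(len(Preprocess.get_vocab()), "unique words in the vocabulary")
print(pre.get_corpus()[0], pre.get_label()[0])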
import warnings
from pandas import DataFrame
from nltk.corpus import stopwords as stop
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

if False:
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

warnings.filterwarnings(action='ignore')

wpt = nltk.WordPunctTokenizer()
PorterStemmer = PorterStemmer()
SnowballStemmer = TurkishStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stop.words('turkish'))


def remove_hyperlink(sentence: str) -> str:
    """
    Removes hyperlinks, e-mail addresses and mentions from the given sentence.

    Args:
        sentence: input sentence, :type str
    Returns:
        sentence with hyperlinks removed
    """
    sentence = re.sub(r"\S*@\S*\s?", " ", sentence)
    sentence = re.sub(r"www\S+", " ", sentence)