def get_predictions(text):
    load_model()
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"[0-9]", " ", text)
    text = text.lower()  # convert to lowercase
    text = nltk.word_tokenize(text)
    text = [word for word in text
            if word not in set(stopwords.words("turkish"))]
    kokbul = TurkishStemmer()
    text = [kokbul.stem(word) for word in text]
    text = " ".join(text)
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=100)
    # get the prediction
    with graph.as_default():
        prediction = model.predict(sequence)[0]
    print(prediction)
    # one-hot encoded vector, revert using np.argmax
    return int2label[np.argmax(prediction)]

def get_named_entities(mdl, tokens):
    stemmer = TurkishStemmer()
    res = mdl.analyze(tokens)
    entities = []
    for entity in res["entities"]:
        for entity2 in entity["text"].split(", "):
            ne = stemmer.stem(entity2).split("'")[0]
            entities.append((entity["type"], ne, entity["score"]))
    return entities

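# A minimal smoke test for get_named_entities() above. The stub model and
# its analyze() output shape are assumptions inferred from the keys the
# function reads ("entities", "text", "type", "score"); the real model's
# output format may differ.
class _StubModel:
    def analyze(self, tokens):
        return {"entities": [
            {"text": "Ankara, İstanbul", "type": "LOC", "score": 0.9},
        ]}

print(get_named_entities(_StubModel(), "Ankara ve İstanbul"))
# -> a list of (type, stemmed_name, score) tuples, e.g. ('LOC', ..., 0.9)
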
def prep(word_list):
    '''Returns preprocessed word_list'''
    stemmer = TurkishStemmer()
    word_list = [stemmer.stem(x) for x in word_list]
    word_list = [
        x.replace('ğ', 'g').replace('ı', 'i').replace('ç', 'c')
         .replace('ş', 's').replace('ü', 'u').replace('ö', 'o')
        for x in word_list
    ]
    return word_list

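# Usage sketch for prep() above (assumes TurkishStemmer is installed and
# prep() is in scope); the exact stems depend on the stemmer version.
tokens = prep(["kitaplar", "gülüşü", "çiçekçi"])
print(tokens)  # stems with ğ/ı/ç/ş/ü/ö folded to their ASCII lookalikes
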
def tokenize_and_stem(sentence, token_min_len=2, token_max_len=50, lower=True):
    '''Tokenizes the given sentence and applies stemmer on each token'''
    stemmer = TurkishStemmer()
    tokenizer = TweetTokenizer()
    sentence = utils.to_unicode(sentence.lower())
    tokens = tokenizer.tokenize(sentence)
    tokens = [stemmer.stem(x.strip(bad_chars))
              for x in tokens if x != '' and not x.startswith('@')]
    tokens = [x.replace('ğ', 'g').replace('ı', 'i').replace('ç', 'c')
               .replace('ş', 's').replace('ü', 'u').replace('ö', 'o')
              for x in tokens]
    return tokens

def getUrlKeyWord(url):
    data, text = getUrlContent(url)
    turkStem = TurkishStemmer()
    kelimeler = []
    for w in data:
        kelimeler.append(turkStem.stem(w))
    ds = pd.Series(kelimeler)
    val = ds.value_counts()
    return val, text

def cleaning(self, doc):
    stemmer = TurkishStemmer()
    doc = doc.lower()
    table = str.maketrans('', '', string.punctuation)
    stripped = doc.translate(table)
    clean_text = []
    for i in stripped.split():
        clean_text.append(stemmer.stem(i))
    return ' '.join(clean_text)

def trstemmer_process(self, words):
    """Method to extract stems"""
    words_trstemmer = dict.fromkeys(self.labels, "")
    find_stem2 = TurkishStemmer()
    for label in self.labels:
        tmp = []
        for w in words[label]:
            tmp.append(find_stem2.stem(w))
        words_trstemmer[label] = tmp
    return words_trstemmer

def cleaning(self, doc):
    stemmer = TurkishStemmer()
    doc = doc.lower()
    filter_punch = str.maketrans('', '', string.punctuation)
    stripped = doc.translate(filter_punch)
    clean_text = []
    for i in stripped.split():
        if i not in self.stop_words:
            clean_text.append(stemmer.stem(i))
    return ' '.join(clean_text)

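# Standalone illustration of the str.maketrans punctuation stripping used
# by cleaning() above: the third argument lists characters to delete.
import string

table = str.maketrans('', '', string.punctuation)
print("merhaba, dünya!".translate(table))  # -> "merhaba dünya"
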
def preprocess_review(review):
    '''
    Takes a single review as an input.
    Returns a processed, clean (tokenized, punctuation/stop words
    removed and stemmed) sentence.
    '''
    tokenized = word_tokenize(review.lower(), language='turkish')
    no_punc = [t for t in tokenized if t.isalpha()]
    no_stop = [t for t in no_punc if t not in stopwords.words('turkish')]
    stemmer = TurkishStemmer()
    review_cleaned = [stemmer.stem(t) for t in no_stop]
    return review_cleaned

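# Quick check for preprocess_review() above; nltk needs its tokenizer and
# stopword data fetched once. The exact stems depend on TurkishStemmer.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
print(preprocess_review("Bu film çok güzeldi, bayıldım!"))
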
def dump_best_model():
    global stopwords_path
    global data_path
    stemmer = TurkishStemmer()
    stoplist = fill_stopword(stopwords_path)
    model = LinearSVC()
    tokens, data_id, data_labels, all_sentences = read_all_file(
        data_path, stoplist, stemmer, remove_vowels=False)
    model, count_vect = get_tfidf_model(model, all_sentences, data_labels)
    model_list = [model, count_vect]
    dumping('model.plk', model_list)

def pad_seq(seq, lookup, maxlen):
    indices = []
    stemmer = TurkishStemmer()
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            # fall back to the stem before giving up and using UNK
            word = stemmer.stem(word)
            if word in lookup:
                indices.append(lookup[word])
            else:
                indices.append(lookup[UNK])
    return indices + [0] * (maxlen - len(seq))

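# Toy example for pad_seq() above, assuming UNK is the module-level key
# reserved for out-of-vocabulary words in the lookup table.
UNK = '<unk>'
lookup = {'<unk>': 1, 'kitap': 2}
print(pad_seq(['kitap', 'bilinmeyen'], lookup, 4))
# -> [2, 1, 0, 0]  (the unknown word falls back to its stem, then to UNK)
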
def word_stemmer(self, text):
    # assumes self.word_tokenize returns a list of token lists
    # (one per sentence), hence the nested iteration
    words = self.word_tokenize(text)
    stopword = stopwords.words('turkish')
    word_stems = []
    ts = TurkishStemmer()
    for sentence in words:
        for w in sentence:
            if w not in stopword:
                word_stems.append(ts.stem(w))
    return word_stems

def get_stem(user_input):
    stemmer = TurkishStemmer()
    # the user message is tokenized by the word tokenizer
    word_tokenize = nltk.word_tokenize(user_input)
    word_tokenize_stemmed = []
    # print("word list:", word_tokenize)
    for i in word_tokenize:
        word_tokenize_stemmed.append(stemmer.stem(i))
    stemmed_user_input = " ".join(word_tokenize_stemmed)
    return stemmed_user_input

def tokenize(text):
    wordstoken = word_tokenize(text)
    stopWords = set(stopwords.words('turkish'))
    filtered_sentence = [w for w in wordstoken if w not in stopWords]
    stemmer = TurkishStemmer()
    stemmerText = []
    for i in filtered_sentence:
        s = stemmer.stem(i)
        stemmerText.append(s)
    return stemmerText

def normalization_data(self):
    import nltk
    nltk.download('stopwords')
    import re
    WPT = nltk.WordPunctTokenizer()
    stop_word_list = nltk.corpus.stopwords.words('turkish')
    from TurkishStemmer import TurkishStemmer
    stemmer = TurkishStemmer()
    yorumlar = []
    for i in range(0, len(self.df)):
        # drop everything except (Turkish) letters and apostrophes
        yorum = re.sub(
            "[^AaBbCcÇçDdEeFfGgĞğHhİiIıJjKkLlMmNnOoÖöPpRrSsŞşTtUuÜüVvYyZz']",
            ' ', self.df['text'][i])
        yorum = re.sub("[']", '', yorum)  # then drop the apostrophes too
        yorum = yorum.lower()
        yorum = yorum.strip()
        yorum = yorum.split()
        yorum = [stemmer.stem(word) for word in yorum
                 if word not in stop_word_list]
        yorum = ' '.join(yorum)
        yorumlar.append(yorum)
    # print(yorumlar)
    return yorumlar

def __init__(self, index_dir: str):
    ts = TurkishStemmer()
    self.__schema = fields.Schema(
        message=fields.TEXT(
            stored=True, field_boost=1.5,
            analyzer=analysis.StemmingAnalyzer()
            | analysis.NgramFilter(minsize=2, maxsize=5)),
        meta_content=fields.TEXT(
            stored=True,
            analyzer=analysis.StemmingAnalyzer()
            | analysis.NgramFilter(minsize=2, maxsize=5)),
        message_id=fields.NUMERIC(stored=True, bits=64),
        chat_id=fields.NUMERIC(stored=True, bits=64),
        message_tr=fields.TEXT(
            stored=False, field_boost=1.5,
            analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                               stoplist=STOP_WORDS_TR)
            | analysis.NgramFilter(minsize=2, maxsize=5)),
        meta_content_tr=fields.TEXT(
            stored=False,
            analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                               stoplist=STOP_WORDS_TR)
            | analysis.NgramFilter(minsize=2, maxsize=5)),
    )
    if not os.path.isdir(index_dir):
        os.mkdir(index_dir)
        self.__index = index.create_in(index_dir, self.__schema)
    else:
        self.__index = index.open_dir(index_dir)

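# A hedged usage sketch for the schema above: querying the Turkish-stemmed
# field with whoosh's QueryParser. Since self.__index is private, a real
# class would expose a search method along these lines; the function name
# search_tr is an assumption for this sketch.
from whoosh.qparser import QueryParser

def search_tr(ix, text):
    with ix.searcher() as searcher:
        query = QueryParser("message_tr", ix.schema).parse(text)
        # message_tr is not stored, so return the stored message field
        return [hit["message"] for hit in searcher.search(query, limit=10)]
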
def preprocess(self, dt):
    # Turkish-aware lowercasing: 'I' -> 'ı' and 'İ' -> 'i'
    lower_map = {
        ord(u'I'): u'ı',
        ord(u'İ'): u'i',
    }
    dt = dt.translate(lower_map).lower()
    dt = dt.replace("'", " ")
    tokens = word_tokenize(dt)
    # remove all tokens that are not alphabetic
    words = [self.clean_alpha_num(word) for word in tokens
             if len(self.clean_alpha_num(word)) > 0]
    # print(words)
    # remove stopwords
    from nltk.corpus import stopwords
    stop_words = stopwords.words('turkish')
    stop_words = np.concatenate((stop_words, stopwords.words('english')))
    stop_words2 = [
        "acaba", "adeta", "ait", "altı", "ama", "ancak", "artık", "aslında",
        "asıl", "ayrıca", "bazen", "başka", "belki", "ben", "beri", "beş",
        "bide", "bir", "biraz", "birkaç", "birçok", "biz", "bu", "bura",
        "böyle", "cuma", "cumartesi", "da", "dahil", "dair", "de", "defa",
        "diye", "diğer", "dokuz", "dolayı", "dört", "en", "et", "eğer",
        "fakat", "falan", "filan", "galiba", "gel", "gene", "gibi", "göre",
        "hadi", "hangi", "hem", "herhalde", "herhangi", "iki", "ile", "için",
        "kere", "kez", "kim", "kimi", "lakin", "lütfen", "mesela", "mi",
        "mü", "mı", "new", "niye", "ol", "on", "oysa", "pazar", "pazartesi",
        "pek", "perşembe", "rağmen", "resmen", "salı", "sekiz", "sen",
        "seni", "siz", "sırf", "tabi", "tabii", "tane", "the", "un", "vala",
        "var", "veya", "yada", "yahu", "yaklaşık", "yani", "yap", "yedi",
        "yoksa", "zaten", "çarşamba", "çok", "çünkü", "üzere", "üç", "şey",
        "şu"
    ]
    stop_words = np.unique(np.concatenate((stop_words, stop_words2)))
    words = [word for word in words
             if word not in stop_words and len(word) > 1]
    # stem words
    porter = TurkishStemmer()
    words = [porter.stem(word) for word in words]
    return words

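# Standalone illustration of the Turkish-aware lowercasing above: plain
# str.lower() maps 'I' to 'i', but Turkish requires 'I' -> 'ı' while
# dotted 'İ' -> 'i'.
lower_map = {ord(u'I'): u'ı', ord(u'İ'): u'i'}
print("ISPARTA İzmir".translate(lower_map).lower())  # -> "ısparta izmir"
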
def get_prediction_from_dumping_model(model_list, test_path):
    global stopwords_path
    global data_path
    stemmer = TurkishStemmer()
    stoplist = fill_stopword(stopwords_path)
    test_sentences = read_test_file(test_path, stoplist, stemmer)
    train_tfidf_model_prediction(model_list[0], model_list[1], test_sentences)

def __init__(self, en_w2v, tr_w2v, similarity_threshold=0.75,
             cluster_nsamples=5, vector_size=300):
    self.en_w2v = en_w2v
    self.tr_w2v = tr_w2v
    self.vector_size = vector_size
    self.turkish_stemmer = TurkishStemmer()
    self.similarity_threshold = similarity_threshold
    self.cluster_nsamples = cluster_nsamples
    self.clusters = dict()

def StemingOfStringTurkish(Str1):
    stopword = stopwords.words('turkish')
    words = word_tokenize(Str1)
    stemmer = TurkishStemmer()
    Str2 = ""
    for word in words:
        word = word.lower()
        # keep only ASCII and Turkish letters
        word = re.sub(r'[^a-zA-ZğĞıİöÖüÜşŞçÇ]', '', word)
        try:
            if word not in stopword:
                word = stemmer.stem(word)
                Str2 += word
                Str2 += " "
        except UnicodeWarning:
            print("!!!!! " + word)
    return Str2

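# Usage sketch for StemingOfStringTurkish() above; assumes nltk stopword
# and punkt data are available. The stems depend on TurkishStemmer.
print(StemingOfStringTurkish("Kitapları okudum ve çok beğendim."))
# -> a single space-joined string of stemmed, stopword-filtered tokens
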
def get_prediction_from_best_model(test_path):
    global stopwords_path
    global data_path
    stemmer = TurkishStemmer()
    stoplist = fill_stopword(stopwords_path)
    model = LinearSVC()
    tokens, data_id, data_labels, all_sentences = read_all_file(
        data_path, stoplist, stemmer, remove_vowels=False)
    model, count_vect = get_tfidf_model(model, all_sentences, data_labels)
    test_sentences = read_test_file(test_path, stoplist, stemmer)
    train_tfidf_model_prediction(model, count_vect, test_sentences)

class Stemmer:
    def __init__(self):
        from TurkishStemmer import TurkishStemmer
        self.stemmer = TurkishStemmer()

    def stem(self, text):
        words = text.split(" ")
        stemmed = ""
        for w in words:
            stemmed += self.stemmer.stem(w) + " "
        return stemmed.rstrip()

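# Usage example for the Stemmer wrapper above; it stems every
# space-separated word and rejoins them into one string.
stemmer = Stemmer()
print(stemmer.stem("kitaplar ve kalemler"))
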
def get_predictions(text):
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"[0-9]", " ", text)
    text = text.lower()  # convert to lowercase
    text = nltk.word_tokenize(text)
    text = [word for word in text
            if word not in set(stopwords.words("turkish"))]
    kokbul = TurkishStemmer()
    text = [kokbul.stem(word) for word in text]
    text = " ".join(text)
    print(text)
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = model.predict(sequence)[0]
    # one-hot encoded vector, revert using np.argmax
    a = prediction[0]
    return int2label[np.argmax(prediction)], a

def word2index(seq, lookup):
    indices = []
    stemmer = TurkishStemmer()
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            # fall back to the stem before giving up and using UNK
            word = stemmer.stem(word)
            if word in lookup:
                indices.append(lookup[word])
            else:
                indices.append(lookup[UNK])
    return indices

def gen_corpus(lan, doc_set):
    # list for tokenized documents in loop
    texts = []
    bar = Bar('Generating ' + lan + ' corpus', max=len(doc_set))
    if lan == 'CN':
        thu1 = thulac.thulac(seg_only=False)  # default mode
        with open("./stopword_cn.txt", encoding='UTF-8') as f:
            lines = f.read().splitlines()
        stopwords = sorted(set(lines))
        for i in doc_set:
            text = thu1.cut(i, text=False)  # segment the sentence into words
            l = [item[0] for item in text]
            stemmed = [i for i in l if i not in stopwords]
            texts.append(stemmed)
            bar.next()
        bar.finish()
    if lan in ['EN', 'RU', 'TR']:
        # loop through document list
        for i in doc_set:
            # clean and tokenize document string
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            # remove stop words from tokens
            stopped_tokens = [i for i in tokens
                              if i not in get_stop_words(lan.lower())]
            # pick a stemmer for the language
            if lan == 'EN':
                stemmer = PorterStemmer()
            if lan == 'RU':
                stemmer = SnowballStemmer("russian")
            if lan == 'TR':
                stemmer = TurkishStemmer()
            stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]
            # add tokens to list
            texts.append(stemmed_tokens)
            bar.next()
        bar.finish()
    return texts

def read3000tweet():
    stemmer = TurkishStemmer()

    def readData(folderPath, label, all_sentences):
        dirlist = glob.glob(folderPath + "/*.txt")
        stoplist = fill_stopword(stopwords_path)
        for dirr in dirlist:
            with open(dirr, "r", encoding='latin-1') as fp:
                sentences = fp.readline()
                while sentences:
                    # strip mentions, picture links and apostrophe suffixes
                    sentences = re.sub(r'@\S*', ' ', sentences)
                    sentences = re.sub(r'pic\..*(\s)+', ' ', sentences)
                    sentences = re.sub(r"’(\S)*(\s)", ' ', sentences)
                    sentences = re.sub(r"'(\S)*(\s)", ' ', sentences)
                    # print(sentences)
                    tokenized_sents = [word_tokenize(trlower(sent))
                                       for sent in sent_tokenize(sentences)]
                    temp = []
                    for each_sentence in tokenized_sents:
                        temp += [''.join(c for c in s
                                         if c not in string.punctuation
                                         and c.isalpha())
                                 for s in each_sentence if s not in stoplist]
                    temp = [convert_turkish_char(stemmer.stem(s)) for s in temp]
                    temp = [s for s in temp if len(s) > 1 and s not in stoplist]
                    one_sentence = ' '
                    for i in temp:
                        one_sentence += ' ' + i
                    tokens.append(temp)
                    data_labels.append(label)
                    all_sentences.append(one_sentence)
                    sentences = fp.readline()
        return data_labels, tokens, all_sentences

    all_sentences = []
    data_labels = []
    tokens = []
    pathToDataset = "3000tweet"
    positives = readData(pathToDataset + "/1", 1, all_sentences)
    negatives = readData(pathToDataset + "/-1", -1, all_sentences)
    neutrals = readData(pathToDataset + "/0", 0, all_sentences)
    return all_sentences, data_labels, tokens

def __init__(self, normalized=True, classes=None, stemmed=True):
    if classes is None:
        classes = ["positive", "negative", "notr"]
    self.x = []
    self.y = []
    self.tokenizer = Tokenizer()
    self.stemmer = TurkishStemmer()
    self.word2vec = None
    self.cachefile = ("data/data"
                      + ("_normalized" if normalized else "")
                      + ("_stemmed" if stemmed else "")
                      + "_" + ("_".join(classes)) + ".pickle")
    if os.path.isfile(self.cachefile):
        with open(self.cachefile, 'rb') as cache:
            self.x, self.y = pickle.load(cache)
    else:
        for cls in classes:
            self._append_data(cls, normalized, stemmed)
        with open(self.cachefile, 'wb') as cache:
            pickle.dump((self.x, self.y), cache)

def __init__(self, lang="english", lower=True, digits=True, link=True,
             punc=True, stem=True, stop_words=True, min_length_count=2):
    self.lower = lower
    self.digits = digits
    self.link = link
    self.punc = punc
    self.stem = stem
    self.stop_words = stop_words
    self.min_length_count = min_length_count
    self.stopwords = stopwords
    self.lang = lang
    if self.lang == "turkish":
        self.stemmer = TurkishStemmer()
    else:
        self.stemmer = PorterStemmer()

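# Hypothetical instantiation of the preprocessor above, showing how the
# lang flag switches between TurkishStemmer and PorterStemmer; the class
# name Preprocessor is an assumption for this sketch.
tr_prep = Preprocessor(lang="turkish")  # uses TurkishStemmer
en_prep = Preprocessor(lang="english")  # falls back to PorterStemmer
print(type(tr_prep.stemmer).__name__, type(en_prep.stemmer).__name__)
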
allText += content
allBlogs.append(content)
classes.append(folderName)

allBlogs = np.array(allBlogs)  # creating a rank-1 array
# Creating a data structure with labeled axes (data, rows and columns).
df_allBlogs = pd.DataFrame({'Blog': allBlogs, 'Class': classes})
df_allBlogs = df_allBlogs[['Blog', 'Class']]
print(df_allBlogs)

# NOTE: map ASCII-typed words such as "soyle" back to their Turkish
# spellings such as "söyle".
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/PY3/turkish.pickle')
nltk.download('stopwords')
turkishStopWords = set(stopwords.words('turkish'))
stemmer = TurkishStemmer()
# typo correction
turkishNLPObject = detector.TurkishNLP()
turkishNLPObject.download()
turkishNLPObject.create_word_set()

number = 0
numberTokenized = 0
numberTypoAndStopWords = 0
numberAllAndStemmed = 0

def preprocessBlogs(blog):
    # Convert to lowercase and strip leading/trailing whitespace.
    blog = blog.lower()
    blog = blog.strip()
    global number

class MachineLearningClass():
    ts = TurkishStemmer()
    sentenceDict = {}
    sentenceDictLen = {}
    sentenceDictLabel = {}
    no_of_inputs = 10   # number of training files
    no_of_testSet = 10  # number of test files

    for x in range(1, no_of_inputs + 1):
        sentenceList = ''
        sentenceListSum = ''
        fileName = 'Veriseti/Text' + str(x) + ".txt"
        fileNameSummary = 'Veriseti/Summary' + str(x) + ".txt"
        openText = open(fileName, "r")
        readText = openText.read()
        openSummary = open(fileNameSummary, "r")
        readSummary = openSummary.read()
        sentenceList = PreprocessingClass().filter_sentences(readText)
        sentenceListSum = PreprocessingClass().filter_sentences(readSummary)
        no_of_sentence = len(sentenceList)
        for y in range(0, no_of_sentence):
            sentId = str(x) + '.' + str(y)
            sentenceDict[sentId] = sentenceList[y]
            # label a sentence 1 if it also appears in the summary
            if sentenceDict[sentId] in sentenceListSum:
                sentenceDictLabel[sentId] = 1
            else:
                sentenceDictLabel[sentId] = 0
            stems = PreprocessingClass().word_stemmer(sentenceDict[sentId])
            sentenceDict[sentId] = ' '.join(stems)

    df = pd.DataFrame(sentenceDict.items(), columns=['SentenceId', 'Sentence'])
    dfSum = pd.DataFrame(sentenceDictLabel.items(),
                         columns=['SentenceId', 'Label'])
    label = dfSum.iloc[:, -1].values
    sentence = df.iloc[:, -1].values

    with open('X.pickle', 'wb') as f:
        pickle.dump(sentence, f)
    with open('y.pickle', 'wb') as f:
        pickle.dump(label, f)

    dfLabel = pd.DataFrame(data=label, index=range(len(sentence)),
                           columns=["Label"])
    dfData = pd.concat([df, dfLabel], axis=1)

    vectorizer = CountVectorizer(max_features=50)
    X = vectorizer.fit_transform(sentence).toarray()
    transformer = TfidfTransformer()
    X = transformer.fit_transform(X).toarray()
    y = dfSum.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=0)
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    with open('classifier.pickle', 'wb') as f:
        pickle.dump(gnb, f)
    with open('tfidfmodel.pickle', 'wb') as f:
        pickle.dump(transformer, f)
    with open('vectorize.pickle', 'wb') as f:
        pickle.dump(vectorizer, f)

    def newPred(self, text):
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(text)
        print(len(sentences))
        predList = PreprocessingClass().filter_sentences(text)
        with open('classifier.pickle', 'rb') as f:
            clf = pickle.load(f)
        with open('vectorize.pickle', 'rb') as f:
            vect = pickle.load(f)
        with open('tfidfmodel.pickle', 'rb') as f:
            tfidf = pickle.load(f)
        # use transform (not fit_transform) so new text is mapped onto
        # the vocabulary learned during training
        predText = vect.transform(predList).toarray()
        predText = tfidf.transform(predText).toarray()
        new_pred = clf.predict(predText)
        finalSum = []
        for i, j in enumerate(new_pred):
            if j == 1:
                finalSum.append(sentences[i])
        return finalSum

# -*- coding: utf-8 -*-
from TurkishStemmer import TurkishStemmer

tstem = TurkishStemmer()
print(tstem.stem("çürüklerimi"))