Example #1
def get_predictions(text):
    load_model()

    text = re.sub("\W", " ", text)
    text = re.sub("[0-9]", " ", text)
    text = text.lower()  # buyuk harftan kucuk harfe cevirme
    text = nltk.word_tokenize(text)
    turkish_stopwords = set(stopwords.words("turkish"))
    text = [word for word in text if word not in turkish_stopwords]
    kokbul = TurkishStemmer()
    text = [kokbul.stem(word) for word in text]
    text = " ".join(text)

    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=100)
    # get the prediction

    with graph.as_default():
        prediction = model.predict(sequence)[0]

    print(prediction)
    # one-hot encoded vector, revert using np.argmax
    return int2label[np.argmax(prediction)]
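This function leans on several globals defined elsewhere in its project (load_model, tokenizer, graph, model, int2label). A minimal sketch of how they might be wired up with Keras is below; every name and path is an illustrative assumption, not taken from the original code, and the graph.as_default() call is a TensorFlow 1.x threading detail that the sketch leaves aside.

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model as keras_load_model

int2label = {0: "negative", 1: "positive"}   # assumed label mapping
tokenizer = Tokenizer(num_words=10000)       # must be fit_on_texts() on the training corpus
model = None

def load_model():
    # Load the trained classifier once; "model.h5" is a placeholder path.
    global model
    if model is None:
        model = keras_load_model("model.h5")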
Example #2
def get_named_entities(mdl, tokens):
    stemmer = TurkishStemmer()
    res = mdl.analyze(tokens)
    entities = []
    for entity in res["entities"]:
        for entity2 in entity["text"].split(", "):
            ne = stemmer.stem(entity2).split("'")[0]
            entities.append((entity["type"], ne, entity["score"]))
    return entities
Example #3
def prep(word_list):
    '''Returns preprocessed word_list'''
    stemmer = TurkishStemmer()
    word_list = [stemmer.stem(x) for x in word_list]
    word_list = [
        x.replace('ğ', 'g').replace('ı', 'i').replace('ç', 'c').replace(
            'ş', 's').replace('ü', 'u').replace('ö', 'o') for x in word_list
    ]
    return word_list
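A quick way to exercise prep, assuming the same TurkishStemmer import the example relies on; the exact stems depend on the stemmer implementation:

words = ["kitapları", "çiçeğin", "gözlükler"]
print(prep(words))  # stems each word, then folds ğ/ı/ç/ş/ü/ö to ASCII g/i/c/s/u/o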
Example #4
def tokenize_and_stem(sentence,token_min_len=2,token_max_len=50,lower=True):
	'''Tokenizes the given sentence and applies stemmer on each token'''
	stemmer = TurkishStemmer()
	tokenizer = TweetTokenizer()
	sentence = utils.to_unicode(sentence.lower())
	tokens = tokenizer.tokenize(sentence)
	tokens = [stemmer.stem(x.strip(bad_chars)) for x in tokens if x != '' and not x.startswith('@')]
	tokens = [x.replace('ğ','g').replace('ı','i').replace('ç','c').replace('ş','s').replace('ü','u').replace('ö','o') for x in tokens]
	return tokens
Example #5
def getUrlKeyWord(url):
    data, text = getUrlContent(url)
    turkStem = TurkishStemmer()
    kelimeler = []
    for w in data:
        kelimeler.append(turkStem.stem(w))

    ds = pd.Series(kelimeler)
    val = ds.value_counts()
    return val, text
Example #6
    def cleaning(self, doc):
        stemmer = TurkishStemmer()
        doc = doc.lower()
        table = str.maketrans('', '', string.punctuation)
        stripped = doc.translate(table)
        clean_text = []
        for i in stripped.split():
            clean_text.append(stemmer.stem(i))

        return ' '.join(clean_text)
Example #7
    def trstemmer_process(self, words):
        """Method to extract stems"""

        words_trstemmer = dict.fromkeys(self.labels, "")
        find_stem2 = TurkishStemmer()
        for label in self.labels:
            tmp = []
            for w in words[label]:
                tmp.append(find_stem2.stem(w))
            words_trstemmer[label] = tmp

        return words_trstemmer
Example #8
    def cleaning(self, doc):
        stemmer = TurkishStemmer()
        doc = doc.lower()
        filter_punch = str.maketrans('', '', string.punctuation)
        stripped = doc.translate(filter_punch)

        clean_text = []
        for i in stripped.split():
            if i not in self.stop_words:
                clean_text.append(stemmer.stem(i))

        return ' '.join(clean_text)
Example #9
def preprocess_review(review):
    '''
    Takes a single review as an input
    Returns a processed, clean (tokenized, punctuation/stop words removed and stemmed) sentence
    '''
    tokenized = word_tokenize(review.lower(), language='turkish')
    no_punc = [t for t in tokenized if t.isalpha()]
    no_stop = [t for t in no_punc if t not in stopwords.words('turkish')]

    stemmer = TurkishStemmer()
    review_cleaned = [stemmer.stem(t) for t in no_stop]

    return review_cleaned
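preprocess_review needs NLTK's punkt models and stopword lists to be downloaded, plus the imports the example itself assumes (word_tokenize, stopwords, TurkishStemmer); a small usage sketch with an illustrative review string:

import nltk
nltk.download("punkt")
nltk.download("stopwords")

print(preprocess_review("Ürün çok güzel, kargo da hızlıydı!"))
# -> a list of stemmed tokens with punctuation and Turkish stopwords removed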
Example #10
def dump_best_model():
    global stopwords_path
    global data_path
    stemmer = TurkishStemmer()
    stoplist = fill_stopword(stopwords_path)
    
    model = LinearSVC()            
    tokens,data_id,data_labels,all_sentences = read_all_file(data_path,stoplist,stemmer,remove_vowels=False)
    model,count_vect = get_tfidf_model(model, all_sentences,data_labels)
    model_list = [model,count_vect]
    dumping('model.plk',model_list)    
Example #11
def pad_seq(seq, lookup, maxlen):
    indices = []
    stemmer = TurkishStemmer()
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            word = stemmer.stem(word)
            if word in lookup:
                indices.append(lookup[word])
            else:
                indices.append(lookup[UNK])
    return indices + [0]*(maxlen - len(seq))
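pad_seq falls back to the stem of an out-of-vocabulary word before resorting to the UNK index. A toy check with a made-up lookup table (UNK and the vocabulary below are assumptions for illustration):

UNK = "<unk>"                           # assumed sentinel key
lookup = {"kitap": 1, "ev": 2, UNK: 0}  # toy vocabulary
# If the stemmer reduces "kitaplar" to "kitap", it maps to index 1;
# "araba" stays unknown even after stemming, so it maps to the UNK index 0,
# and the result is padded with zeros up to maxlen.
print(pad_seq(["kitaplar", "araba"], lookup, maxlen=5))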
Example #12
    def word_stemmer(self, text):

        words = self.word_tokenize(text)
        stopword = stopwords.words('turkish')

        word_stems = []
        title_stems = []
        ts = TurkishStemmer()
        for word in words:
            for w in word:
                if w not in stopword:
                    word_stems.append(ts.stem(w))

        return word_stems
Example #13
def get_stem(user_input):
    stemmer=TurkishStemmer()

    #user message is tokenized by word tokenizer
    word_tokenize=nltk.word_tokenize(user_input)
    word_tokenize_stemmed=[]

    #print("kelime listesi:",word_tokenize)
    for i in word_tokenize:
        word_tokenize_stemmed.append(stemmer.stem(i))

 
    stemmed_user_input=" ".join(word_tokenize_stemmed)
    return stemmed_user_input
Example #14
def tokenize(text):
    wordstoken = (word_tokenize(text))
    stopWords = set(stopwords.words('turkish'))
    filtered_sentence = [w for w in wordstoken if w not in stopWords]
    stemmer = TurkishStemmer()
    stemmerText = []
    for i in filtered_sentence:
        s = stemmer.stem(i)
        stemmerText.append(s)
    return stemmerText
Example #15
    def normalization_data(self):
        import nltk
        nltk.download('stopwords')
        import re
        import numpy as np

        WPT = nltk.WordPunctTokenizer()
        stop_word_list = nltk.corpus.stopwords.words('turkish')
        from TurkishStemmer import TurkishStemmer
        stemmer = TurkishStemmer()

        yorumlar = []
        for i in range(0, len(self.df)):

            yorum = re.sub(
                "[^AaBbCcÇçDdEeFfGgĞğHhİiIıJjKkLlMmNnOoÖöPpRrSsŞşTtUuÜüVvYyZz']",
                ' ', self.df['text'][i])  # keep only Turkish letters and apostrophes
            yorum = re.sub("[']", '', yorum)  # drop the apostrophes
            yorum = yorum.lower()
            yorum = yorum.strip()
            yorum = yorum.split()

            yorum = [
                stemmer.stem(word) for word in yorum
                if word not in stop_word_list
            ]
            yorum = ' '.join(yorum)

            yorumlar.append(yorum)
        # print(yorumlar)
        return yorumlar
Example #16
    def __init__(self, index_dir: str):
        ts = TurkishStemmer()
        self.__schema = fields.Schema(
            message=fields.TEXT(
                stored=True,
                field_boost=1.5,
                analyzer=analysis.StemmingAnalyzer()
                | analysis.NgramFilter(minsize=2, maxsize=5)),
            meta_content=fields.TEXT(
                stored=True,
                analyzer=analysis.StemmingAnalyzer()
                | analysis.NgramFilter(minsize=2, maxsize=5)),
            message_id=fields.NUMERIC(stored=True, bits=64),
            chat_id=fields.NUMERIC(stored=True, bits=64),
            message_tr=fields.TEXT(
                stored=False,
                field_boost=1.5,
                analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem, stoplist=STOP_WORDS_TR)
                | analysis.NgramFilter(minsize=2, maxsize=5)),
            meta_content_tr=fields.TEXT(
                stored=False,
                analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem, stoplist=STOP_WORDS_TR)
                | analysis.NgramFilter(minsize=2, maxsize=5)),
        )
        if not os.path.isdir(index_dir):
            os.mkdir(index_dir)
            self.__index = index.create_in(index_dir, self.__schema)
        else:
            self.__index = index.open_dir(index_dir)
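The schema above is built for Whoosh; a rough sketch of adding and querying documents against an index created from it (the helper names are illustrative, not from the original class):

from whoosh.qparser import QueryParser

def add_message(ix, text, msg_id, chat_id):
    # Write one document into an index created with the schema above.
    writer = ix.writer()
    writer.add_document(message=text, message_tr=text,
                        message_id=msg_id, chat_id=chat_id)
    writer.commit()

def search_messages(ix, query_text):
    # Query the boosted "message" field and return the stored fields of each hit.
    with ix.searcher() as searcher:
        query = QueryParser("message", ix.schema).parse(query_text)
        return [hit.fields() for hit in searcher.search(query)]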
Example #17
    def preprocess(self, dt):

        lower_map = {
            ord(u'I'): u'ı',
            ord(u'İ'): u'i',
        }

        dt = dt.translate(lower_map).lower()
        dt = dt.replace("'", " ")
        tokens = word_tokenize(dt)
        # remove all tokens that are not alphabetic
        words = [
            self.clean_alpha_num(word) for word in tokens
            if len(self.clean_alpha_num(word)) > 0
        ]
        #print(words)
        # remove stopwords
        from nltk.corpus import stopwords
        stop_words = stopwords.words('turkish')
        stop_words = np.concatenate((stop_words, stopwords.words('english')))
        stop_words2 = [
            "acaba", "adeta", "ait", "altı", "ama", "ancak", "artık",
            "aslında", "asıl", "ayrıca", "bazen", "başka", "belki", "ben",
            "beri", "beş", "bide", "bir", "biraz", "birkaç", "birçok", "biz",
            "bu", "bura", "böyle", "cuma", "cumartesi", "da", "dahil", "dair",
            "de", "defa", "diye", "diğer", "dokuz", "dolayı", "dört", "en",
            "et", "eğer", "fakat", "falan", "filan", "galiba", "gel", "gene",
            "gibi", "göre", "hadi", "hangi", "hem", "herhalde", "herhangi",
            "iki", "ile", "için", "kere", "kez", "kim", "kimi", "lakin",
            "lütfen", "mesela", "mi", "mü", "mı", "new", "niye", "ol", "on",
            "oysa", "pazar", "pazartesi", "pek", "perşembe", "rağmen",
            "resmen", "salı", "sekiz", "sen", "seni", "siz", "sırf", "tabi",
            "tabii", "tane", "the", "un", "vala", "var", "veya", "yada",
            "yahu", "yaklaşık", "yani", "yap", "yedi", "yoksa", "zaten",
            "çarşamba", "çok", "çünkü", "üzere", "üç", "şey", "şu"
        ]
        stop_words = np.unique(np.concatenate((stop_words, stop_words2)))

        words = [
            word for word in words if word not in stop_words and len(word) > 1
        ]

        # Stem words
        porter = TurkishStemmer()
        words = [porter.stem(word) for word in words]

        return words
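The translate table at the top of preprocess handles the Turkish dotted/dotless I distinction before calling lower(), which plain str.lower() gets wrong; a standalone illustration using only the standard library:

lower_map = {ord(u'I'): u'ı', ord(u'İ'): u'i'}
print("ISPARTA İZMİR".translate(lower_map).lower())  # -> "ısparta izmir"
print("ISPARTA İZMİR".lower())                       # plain lower() loses the dotless ı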
Example #18
def get_prediction_from_dumping_model(model_list,test_path):
    global stopwords_path
    global data_path
    stemmer = TurkishStemmer()    
    stoplist=fill_stopword(stopwords_path)
    
    test_sentences = read_test_file(test_path,stoplist,stemmer)                    
    train_tfidf_model_prediction(model_list[0],model_list[1], test_sentences)        
Example #19
    def __init__(self, en_w2v, tr_w2v, similarity_threshold=0.75, cluster_nsamples=5, vector_size=300):
        self.en_w2v = en_w2v
        self.tr_w2v = tr_w2v
        self.vector_size = vector_size
        self.turkish_stemmer = TurkishStemmer()
        self.similarity_threshold = similarity_threshold
        self.cluster_nsamples = cluster_nsamples
        self.clusters = dict()
Example #20
def StemingOfStringTurkish(Str1):
    stopword = stopwords.words('turkish')
    ps = PorterStemmer()
    words = word_tokenize(Str1)
    stemmer = TurkishStemmer()
    Str2 = ""
    for word in words:
        word = word.lower()
        word = re.sub(r'[^a-zA-ZğĞıİöÖüÜşŞçÇ]', '', word)
        try:
            if word not in stopword:
                word = stemmer.stem(word)
                Str2 += word
                Str2 += " "
        except UnicodeWarning:
            print("!!!!! " + word)

    return Str2
Example #21
def get_prediction_from_best_model(test_path):
    global stopwords_path
    global data_path
    stemmer = TurkishStemmer()    
    stoplist=fill_stopword(stopwords_path)
    
    model = LinearSVC()            
    tokens,data_id,data_labels,all_sentences = read_all_file(data_path,stoplist,stemmer,remove_vowels=False)
    model,count_vect = get_tfidf_model(model, all_sentences,data_labels) 
    test_sentences = read_test_file(test_path,stoplist,stemmer)                    
    train_tfidf_model_prediction(model,count_vect, test_sentences)        
Example #22
class Stemmer:
    def __init__(self):
        from TurkishStemmer import TurkishStemmer
        self.stemmer = TurkishStemmer()

    def stem(self, text):
        words = text.split(" ")
        stemmed = ""
        for w in words:
            stemmed += self.stemmer.stem(w) + " "
        return stemmed.rstrip()
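Typical use of the wrapper above, assuming the TurkishStemmer package is installed; the exact stems depend on that library:

stemmer = Stemmer()
print(stemmer.stem("kitapları hızlıca okudum"))  # stems each whitespace-separated token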
Example #23
def get_predictions(text):
    text = re.sub("\W", " ", text)
    text = re.sub("[0-9]", " ", text)
    text = text.lower()  # buyuk harftan kucuk harfe cevirme
    text = nltk.word_tokenize(text)
    turkish_stopwords = set(stopwords.words("turkish"))
    text = [word for word in text if word not in turkish_stopwords]
    kokbul = TurkishStemmer()
    text = [kokbul.stem(word) for word in text]
    text = " ".join(text)
    print(text)
    sequence = tokenizer.texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = model.predict(sequence)[0]
    # one-hot encoded vector, revert using np.argmax
    a = ""
    a = prediction[0]
    return int2label[np.argmax(prediction)], a
Example #24
def word2index(seq, lookup):
    indices = []
    stemmer = TurkishStemmer()
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            word = stemmer.stem(word)
            if word in lookup:
                indices.append(lookup[word])
            else:
                indices.append(lookup[UNK])
    return indices
Example #25
def gen_corpus(lan, doc_set):
    # list for tokenized documents in loop
    texts = []
    bar = Bar('Generating ' + lan + ' corpus', max=len(doc_set))
    if lan == 'CN':
        thu1 = thulac.thulac(seg_only=False)  # default mode
        with open("./stopword_cn.txt", encoding='UTF-8') as f:
            lines = f.read().splitlines()
            stopwords = sorted(set(lines))
        for i in doc_set:
            text = thu1.cut(i, text=False)  # segment the sentence into words
            l = [item[0] for item in text]
            # print(l)
            # for i in b:
            #     print (i)
            stemmed = [i for i in l if i not in stopwords]
            # texts.append(l)
            texts.append(stemmed)
            bar.next()
        # print (stemmed)
        bar.finish()
    if lan in ['EN', 'RU', 'TR']:
        # loop through document list
        for i in doc_set:
            # clean and tokenize document string
            raw = i.lower()
            tokens = tokenizer.tokenize(raw)
            # remove stop words from tokens
            stopped_tokens = [
                i for i in tokens if not i in get_stop_words(lan.lower())
            ]
            # stem tokens
            stemmer = PorterStemmer()
            if lan == 'EN':
                stemmer = PorterStemmer()
            if lan == 'RU':
                stemmer = SnowballStemmer("russian")
            if lan == 'TR':
                stemmer = TurkishStemmer()
            stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]
            # add tokens to list
            texts.append(stemmed_tokens)
            bar.next()
        bar.finish()
    bar.finish()
    return texts
Example #26
def read3000tweet():
    stemmer = TurkishStemmer()
    def readData(folderPath,label,all_sentences):
        dirlist = glob.glob(folderPath+"/*.txt")
        stoplist=fill_stopword(stopwords_path) 
        for dirr in dirlist:
            with open(dirr, "r",encoding='latin-1') as fp:
                sentences=fp.readline()
                while sentences:  
                    sentences = re.sub(r'@\S*', ' ', sentences)
                    sentences = re.sub(r'pic\..*(\s)+', ' ', sentences)
                    sentences = re.sub(r"’(\S)*(\s)",' ',sentences)
                    sentences = re.sub(r"'(\S)*(\s)",' ',sentences)    
                    
                    #print(sentences)
                    tokenized_sents = [word_tokenize(trlower(sent))  for sent in sent_tokenize(sentences) ]        
                    temp=[]
                    
                    for each_sentence in tokenized_sents:
                        temp+=[''.join(c for c in s if c not in string.punctuation and  c.isalpha() ) for s in each_sentence if s not in stoplist]             
                        temp = [convert_turkish_char(stemmer.stem(s)) for s in temp ]
                        temp = [s for s in temp if len(s)>1 and s not in stoplist]
                        
                    one_sentence = ' '
                    for i in temp:
                        one_sentence += ' '+i
                            
                            
                    tokens.append(temp)
                    data_labels.append(label)
                    all_sentences.append(one_sentence)
        
                    sentences=fp.readline()
            fp.close()  
        return data_labels,tokens, all_sentences
             
    all_sentences=[]
    data_labels=[]
    tokens=[]
    pathToDataset = "3000tweet"
    positives = readData(pathToDataset+"/1",1,all_sentences)
    negatives = readData(pathToDataset+"/-1",-1,all_sentences)  
    neutrals = readData(pathToDataset+"/0",0,all_sentences)
    return all_sentences,data_labels,tokens
Example #27
    def __init__(self, normalized= True, classes= None, stemmed= True):
        if classes is None:
            classes = ["positive", "negative", "notr"]

        self.x = []
        self.y = []
        self.tokenizer = Tokenizer()
        self.stemmer = TurkishStemmer()
        self.word2vec = None

        self.cachefile = "data/data" + ("_normalized" if normalized else "") + ("_stemmed" if stemmed else "") + "_" + ("_".join(classes)) + ".pickle"
        if os.path.isfile(self.cachefile):
            with open(self.cachefile, 'rb') as cache:
                self.x, self.y = pickle.load(cache)
        else:
            for cls in classes:
                self._append_data(cls, normalized, stemmed)

            with open(self.cachefile, 'wb') as cache:
                pickle.dump((self.x, self.y), cache)
Example #28
    def __init__(self, lang="english", lower=True, digits=True, link=True,
                 punc=True, stem=True, stop_words=True, min_length_count=2):
        self.lower = lower
        self.digits = digits
        self.link = link
        self.punc = punc
        self.stem = stem
        self.stop_words = stop_words
        self.min_length_count = min_length_count
        self.stopwords = stopwords
        self.lang = lang
        if self.lang == "turkish":
            self.stemmer = TurkishStemmer()
        else:
            self.stemmer = PorterStemmer()
Example #29
      allText += content
      allBlogs.append(content)
      classes.append(folderName)

allBlogs = np.array(allBlogs)  #Creating rank 1 array
df_allBlogs = pd.DataFrame({'Blog': allBlogs, 'Class': classes})  #Creating data structure with labeled axes.(data, rows and columns)
df_allBlogs = df_allBlogs[['Blog', 'Class']]
print(df_allBlogs)

# NOTE: convert ASCII-typed words like "soyle" into their proper Turkish spellings ("söyle").

nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/PY3/turkish.pickle')
nltk.download('stopwords')
turkishStopWords = set(stopwords.words('turkish'))
stemmer = TurkishStemmer()
#Typos
turkishNLPObject = detector.TurkishNLP()
turkishNLPObject.download()
turkishNLPObject.create_word_set()

number=0
numberTokenized=0
numberTypoAndStopWords=0
numberAllAndStemmed=0

def preprocessBlogs(blog):
  #Converting to lowercase characters and removing leading and trailing whitespaces.
  blog = blog.lower()
  blog = blog.strip()
  global number
Example #30
class MachineLearningClass():

    ts = TurkishStemmer()
    sentenceDict = {}
    sentenceDictLen = {}
    sentenceDictLabel = {}

    no_of_inputs = 10  #Number of training files
    no_of_testSet = 10  #Number of test files

    for x in range(1, no_of_inputs + 1):
        sentenceList = ''
        sentenceListSum = ''
        fileName = 'Veriseti/Text' + str(x) + ".txt"
        fileNameSummary = 'Veriseti/Summary' + str(x) + ".txt"
        openText = open(fileName, "r")
        readText = openText.read()
        openSummary = open(fileNameSummary, "r")
        readSummary = openSummary.read()

        sentenceList = PreprocessingClass().filter_sentences(readText)

        sentenceListSum = PreprocessingClass().filter_sentences(readSummary)

        no_of_sentence = len(sentenceList)

        for y in range(0, no_of_sentence):
            sentId = str(x) + '.' + str(y)
            sentenceDict[sentId] = sentenceList[y]

            if (sentenceDict[sentId] in sentenceListSum):
                sentenceDictLabel[sentId] = 1
            else:
                sentenceDictLabel[sentId] = 0

            stems = PreprocessingClass().word_stemmer(sentenceDict[sentId])
            sentenceDict[sentId] = ' '.join(stems)

    df = pd.DataFrame(sentenceDict.items(), columns=['SentenceId', 'Sentence'])
    dfSum = pd.DataFrame(sentenceDictLabel.items(),
                         columns=['SentenceId', 'Label'])
    label = dfSum.iloc[:, -1].values
    sentence = df.iloc[:, -1].values

    with open('X.pickle', 'wb') as f:
        pickle.dump(sentence, f)

    with open('y.pickle', 'wb') as f:
        pickle.dump(label, f)

    dfLabel = pd.DataFrame(data=label,
                           index=range(len(sentence)),
                           columns=["Label"])
    dfData = pd.concat([df, dfLabel], axis=1)

    vectorizer = CountVectorizer(max_features=50)
    X = vectorizer.fit_transform(sentence).toarray()

    transformer = TfidfTransformer()
    X = transformer.fit_transform(X).toarray()
    y = dfSum.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)

    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    with open('classifier.pickle', 'wb') as f:
        pickle.dump(gnb, f)

    with open('tfidfmodel.pickle', 'wb') as f:
        pickle.dump(transformer, f)

    with open('vectorize.pickle', 'wb') as f:
        pickle.dump(vectorizer, f)

    def newPred(self, text):
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(text)
        print(len(sentences))
        predList = PreprocessingClass().filter_sentences(text)

        with open('classifier.pickle', 'rb') as f:
            clf = pickle.load(f)

        with open('vectorize.pickle', 'rb') as f:
            vect = pickle.load(f)

        with open('tfidfmodel.pickle', 'rb') as f:
            tfidf = pickle.load(f)

        predText = vect.transform(predList).toarray()
        predText = tfidf.transform(predText).toarray()
        new_pred = clf.predict(predText)
        finalSum = []
        for i, j in enumerate(new_pred):
            if j == 1:
                finalSum.append(sentences[i])

        return finalSum
#-*- coding:utf-8 -*-
from TurkishStemmer import TurkishStemmer
tstem=TurkishStemmer()
print(tstem.stem("çürüklerimi"))