def __init__(self):
    # Merge custom stopwords from file with Sastrawi's default list
    with open('./stopwords.txt') as f:
        more_stopword = f.read().split('\n')
    SWfactory = StopWordRemoverFactory()
    stopword_data = ArrayDictionary(more_stopword + SWfactory.get_stop_words())
    self.stopword = StopWordRemover(stopword_data)
def modif(kalimat):
    # nltk.download('punkt')
    kalimat = kalimat.translate(str.maketrans('', '', string.punctuation)).lower()  # case folding & punctuation removal
    tokens = nltk.tokenize.word_tokenize(kalimat)  # tokenization
    fac = StopWordRemoverFactory()  # set up the stopword list
    stop = fac.get_stop_words()
    stop.append("kak")     # add "kak" to the stopword list
    stop.remove("tidak")   # keep the word "tidak"
    stop.remove("boleh")   # keep the word "boleh"
    stop.remove("bisa")    # keep the word "bisa"
    stop.remove("dimana")  # keep the word "dimana"
    removed = []
    for t in tokens:
        if t not in stop:
            removed.append(t)  # stopword removal
    pat = ""
    for w in removed:
        pat += w + " "
    return pat
def stopwords_removal(self, text, stopwords, output_stopwords):
    with open(dataOutputPath, encoding='utf-8') as f:
        text = f.read()
    with open(stopwords, encoding='utf-8') as f:
        list_stopwords = f.read()
    stop_factory = StopWordRemoverFactory()
    more_stopwords = list_stopwords.split("\n")  # add new stopwords
    data = stop_factory.get_stop_words() + more_stopwords
    # Build the remover from the merged list so the extra stopwords are actually applied
    # (assumes StopWordRemover and ArrayDictionary are imported from Sastrawi)
    stopword = StopWordRemover(ArrayDictionary(data))
    remove_stopwords = stopword.remove(text)
    with open(pathStopwords, 'w', encoding='utf-8') as f:
        f.write(remove_stopwords)
    print("Stopwords Removal success!\nCount Words Frequency on process...")
    return remove_stopwords
def Preprocessing(data):
    print("Preprocessing")
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwordsFact = factory_stopwords.get_stop_words()
    stemmer = StemmerFactory().create_stemmer()
    count = 0
    for kalimat in data:
        removedHttp = re.sub(r"http\S+", '', kalimat)             # remove http links
        removedPic = re.sub(r"pic.twitter\S+", '', removedHttp)   # remove pic.twitter links
        lower = removedPic.lower()                                # case folding
        tokenized = tokenizer.tokenize(lower)                     # tokenization + punctuation removal
        stopwords = []                                            # stopword removal
        for kata in tokenized:
            if kata not in stopwordsFact:
                stopwords.append(kata)
        stemmed = []                                              # stemming
        for kata in stopwords:
            stemmed.append(stemmer.stem(kata))
        cleanData.append(stemmed)
        count += 1
        print(count)
    return cleanData
def word_tokenizer(text):
    # tokenizes and stems the text
    tokens = word_tokenize(text)
    fac2 = StemmerFactory()
    stemmer = fac2.create_stemmer()
    factory = StopWordRemoverFactory()
    stop_words = factory.get_stop_words()
    tokens = [stemmer.stem(t) for t in tokens if t not in stop_words]
    return tokens
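# Usage sketch (an assumption about intent, not part of the original snippet):
# word_tokenizer is the kind of callable that can be plugged into scikit-learn's
# TfidfVectorizer as a custom tokenizer, so stemming and stopword removal
# happen during vectorization.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=word_tokenizer)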
def generate_sastrawi_stopwords():
    # get Sastrawi stopwords as list
    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()
    # write to txt file
    with open(stopwords_list_path + '/sastrawi-stopwords.txt', 'w') as file:
        for word in stopwords:
            file.write(word + "\n")
def Stopword_removal(sentence):
    stopword_factory = StopWordRemoverFactory()
    stopwords = stopword_factory.get_stop_words()
    words = sentence.split()
    output = ""
    for word in words:
        if word not in stopwords:
            output = output + " " + word
    return output
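# Minimal usage sketch for Stopword_removal above (the sentence is a made-up
# example). Common Indonesian function words such as "saya" and "yang" should be
# dropped; note the leading space that results from the " " + word concatenation.
print(Stopword_removal("saya sedang membaca buku yang menarik"))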
def __init__(self, min_cut=0.1, max_cut=0.9):
    """
    Initialize the text summarizer.
    Words that have a term frequency lower than min_cut
    or higher than max_cut will be ignored.
    """
    factory = StopWordRemoverFactory()
    self._min_cut = min_cut
    self._max_cut = max_cut
    self._stopwords = set(factory.get_stop_words() + list(punctuation))
def bahasa_stopwords(additional_words=[]):
    stopword_a = pd.read_csv('Data/indonesia_tweet/stopwordbahasa.csv', names=['stopword'])
    stopword = stopword_a['stopword'].tolist()
    factory = StopWordRemoverFactory()
    stopword_b = factory.get_stop_words()
    stopword.extend(stopword_b)
    if not isinstance(additional_words, list):
        raise TypeError("additional_words must be a list")
    if len(additional_words) > 0:
        stopword.extend(additional_words)
    # remove duplicates while preserving order
    stopword = list(dict.fromkeys(stopword))
    return stopword
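# Hypothetical usage sketch for bahasa_stopwords above: extend the combined
# CSV + Sastrawi list with a few Twitter-specific tokens (these extra words are
# illustrative, not part of the original snippet).
stop_list = bahasa_stopwords(additional_words=['rt', 'yg', 'gak'])
print(len(stop_list))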
def cleanTweets(Tweets):
    factory = StopWordRemoverFactory()
    stopwords = set(factory.get_stop_words() + ['twitter', 'rt', 'pic', 'com', 'yg', 'ga', 'https'])
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for i, tweet in enumerate(tqdm(Tweets)):
        txt = tweet['fullTxt']
        # if you want to ignore retweets ==> if not re.match(r'^RT.*', txt):
        txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', txt)  # clean urls
        txt = txt.lower()  # Lowercase
        txt = Tokenizer.tokenize(txt)
        symbols = set(['@'])  # Add more if you want
        txt = [strip_non_ascii(t, symbols) for t in txt]  # remove all non ASCII characters
        txt = ' '.join([t for t in txt if len(t) > 1])
        Tweets[i]['cleanTxt'] = txt  # this is not a good Python practice, only for learning.
        txt = stemmer.stem(txt).split()
        Tweets[i]['nlp'] = ' '.join([t for t in txt if t not in stopwords])
    return Tweets
def pre_processing(text):
    stopwords = pd.read_csv('stopwordbahasa.csv', names=['stopword'])['stopword'].tolist()
    stem = StemmerFactory()
    stemmer = stem.create_stemmer()
    factory = StopWordRemoverFactory()
    stopword = StopWordRemover(ArrayDictionary(factory.get_stop_words() + stopwords))
    clean_str = text.lower()  # lowercase
    clean_str = re.sub(r"(?:\@|#|https?\://)\S+", " ", clean_str)  # eliminate usernames, urls, hashtags
    clean_str = re.sub(r'&', '', clean_str)  # remove stray '&' characters
    clean_str = re.sub(r'[^\w\s]', ' ', clean_str)  # remove punctuation
    clean_str = re.sub(r'[\s\n\t\r]+', ' ', clean_str)  # remove extra whitespace
    clean_str = clean_str.strip()  # trim
    clean_str = " ".join([stemmer.stem(word) for word in clean_str.split()])  # stem
    clean_str = stopword.remove(clean_str)  # remove stopwords
    return clean_str
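# A small illustrative call to pre_processing above; the tweet text is made up.
# Mentions, urls and hashtags are stripped, the rest is stemmed and filtered
# against the combined stopword list.
print(pre_processing("RT @user: Saya suka sekali belajar NLP! https://contoh.id #nlp"))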
def preprocess(data):
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwords = factory_stopwords.get_stop_words()
    count = 0
    for i in range(len(data)):
        lowerText = data[i].lower()  # Case folding
        tokenizedText = tokenizer.tokenize(lowerText)  # Punctuation removal and tokenization
        swRemovedText = []  # Stopwords removal
        for j in range(len(tokenizedText)):
            if tokenizedText[j] not in stopwords:
                swRemovedText.append(tokenizedText[j])
        cleanData.append(swRemovedText)
        count += 1
        print(count, "data cleaned")
    return cleanData
def stopwords_removal(self, list_stopwords, output_stopwords):
    with open(listStopwordsPath, 'r', encoding='utf-8') as f:
        list_stopwords = f.read()
    stop_factory = StopWordRemoverFactory()
    more_stopwords = list_stopwords.split("\n")
    data = stop_factory.get_stop_words() + more_stopwords
    # Build the remover from the merged list so the extra stopwords are actually applied
    # (assumes StopWordRemover and ArrayDictionary are imported from Sastrawi)
    stopwords = StopWordRemover(ArrayDictionary(data))
    remove_stopwords = stopwords.remove(self.text)
    with open(stopwordsRemovalPath, 'w', encoding='utf-8') as f:
        f.write(remove_stopwords)
    print("Done!")
    return remove_stopwords
def Preprocessing(data):  # Preprocessing
    cleanData = []
    tokenizer = RegexpTokenizer(r'\w+')
    factory_stopwords = StopWordRemoverFactory()
    stopwords = factory_stopwords.get_stop_words()
    factory_stemmer = StemmerFactory()
    stemmer = factory_stemmer.create_stemmer()
    for i in range(len(data)):
        lowerText = data[i].lower()  # Case folding
        tokenizedText = tokenizer.tokenize(lowerText)  # Punctuation removal and tokenization
        swRemovedText = []  # Stopwords removal
        for j in range(len(tokenizedText)):
            if tokenizedText[j] not in stopwords:
                swRemovedText.append(tokenizedText[j])
        stemmedText = []
        for k in range(len(swRemovedText)):  # Stemming
            stemmedText.append(stemmer.stem(swRemovedText[k]))
        cleanData.append(stemmedText)
    return cleanData
def mnb():
    factory = StopWordRemoverFactory()
    stop_word_list = factory.get_stop_words()
    stop = stop_word_list + list(punctuation)
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                            ngram_range=(1, 2), stop_words=stop)
    df = convert_to_tidf()
    X_train, X_test, y_train, y_test = train_test_split(df['questions'], df['labels'], random_state=0)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    feed = MultinomialNB().fit(X_train_tfidf, y_train)
    return feed, count_vect
import pandas
import pickle as pickle
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from dateutil.parser import parse
import numpy as np
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

factory = StopWordRemoverFactory()
a = list(factory.get_stop_words())
if "di" in a:
    a.remove("di")
if "adalah" in a:
    a.remove("adalah")
dictionary = ArrayDictionary(a)
stopwordId = StopWordRemover(dictionary)
sf = StemmerFactory()
stemmerId = sf.create_stemmer()

def date_detection(doc, fuzzy=True):
    try:
        parse(doc, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
    except:
        return False

def all_caps_detection(doc):
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
from collections import Counter
from modulenorm.modNormalize import normalize
from modulenorm.modTokenizing import tokenize
from rake_nltk import Rake

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stopword Removal
stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()
data = stop_factory.get_stop_words()

# Standard Word Checker
dictFile = os.path.dirname(os.path.realpath(__file__)) + "/improveDict.txt"
swChecker = SWChecker(dictFile)
prosaHelper = Prosa()
usenorm = normalize()

sentence = "q cinta lo tp lo kaga sejak weekend lalu"
# text_norm = usenorm.enterNormalize(sentence)       # normalize line breaks, one review per line
#
# text_norm = usenorm.lowerNormalize(text_norm)      # normalize uppercase to lowercase
# text_norm = usenorm.repeatcharNormalize(text_norm) # normalize repeated periods
# text_norm = usenorm.linkNormalize(text_norm)       # normalize links in the text
# text_norm = usenorm.spacecharNormalize(text_norm)  # normalize whitespace characters
# text_norm = usenorm.ellipsisNormalize(text_norm)   # normalize ellipses (…)
synonyms = [synonym.lower() for synonym in synonyms.split()]
for synonym in synonyms:
    table[synonym] = primary.lower()

spelling = []
for idx, value in enumerate(tokenisasi):
    temp = []
    for idy, value1 in enumerate(value):
        temp.append(''.join(
            table.get(word.lower(), word) for word in re.findall(r'(\W+|\w+)', value1)))
    spelling.append(temp)

# stopword
stop_factory = StopWordRemoverFactory()
data_stopword = stop_factory.get_stop_words()
stopword = stop_factory.create_stop_word_remover()
stopword_removal = []
for idx, value in enumerate(spelling):
    temp = []
    for idy, value1 in enumerate(value):
        temp.append(stopword.remove(value1))
    stopword_removal.append(temp)

# stemming:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stemming = []
for idx, value in enumerate(stopword_removal):
    temp = []
    for idy, value1 in enumerate(value):
# ABOUT : Simple Chatbot Itenas
# ==========================================================
import nltk
import string
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from colorama import Fore, Style

text = open("itenasDoc.txt", "r", errors='ignore').read()
text = text.lower()
factory = StopWordRemoverFactory()
stopwords_indonesia = factory.get_stop_words()
sents_tokenize = nltk.sent_tokenize(text)

# Initialize the phrases the bot will use
sapa_user = ['hello', 'hallo', 'hi', 'hallo itenas']
greet_user = [
    'Hallo juga :D',
    "Senang bertemu dengan mu, apa yang bisa aku bantu?",
    "Hiiiiii ^_^"
]
kelimat_perpisahan = ['selesai', 'quit', 'dadah', 'berhenti']

def tokenize_function(data):
    # Remove punctuation that will not be useful for the similarity search
    texts = [token for token in data if token not in string.punctuation]
# stopword removal
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()
getStopWord = factory.get_stop_words()
print(getStopWord)
#
# Performing stopword removal
# and extending the stoplist
# with Python Sastrawi
#

# import the StopWordRemoverFactory class
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()

# Add extra stopwords
more_stopword = ['dengan', 'saya']
data = factory.get_stop_words() + more_stopword
stopword = factory.create_stop_word_remover()

# Sentence
kalimat = 'dengan menggunakan python dan library sastrawi saya dapat melakukan proses stopword removal'
stop = stopword.remove(kalimat)
print(stop)
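# Note: in the snippet above, `data` (defaults + more_stopword) is built but the
# remover returned by create_stop_word_remover() only knows the default list.
# A minimal sketch of how the merged list could actually be applied, assuming
# StopWordRemover and ArrayDictionary are imported from the factory module as
# other snippets in this file do:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary

custom_stopword = StopWordRemover(ArrayDictionary(data))
print(custom_stopword.remove(kalimat))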
def get_stopwords():
    factory = StopWordRemoverFactory()
    stop_words = factory.get_stop_words()
    return stop_words
rawdata = []
for j in range(0, 8):
    x = open(str(j + 1) + '.txt', 'r').read()
    rawdata.append(x.replace('\n', ' '))

import nltk
from nltk.tokenize import word_tokenize as token
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string, numpy as np

ST = StemmerFactory()
stemmer = ST.create_stemmer()
SW = StopWordRemoverFactory()
stop_word = SW.get_stop_words()

# rawdata
print('rawdata')
print(rawdata)

doc = []
for i in rawdata:
    temp = []
    for j in token(i):
        word = stemmer.stem(str.lower(j))
        # if word not in stop_word and len(word) > 2 and not word.startswith(tuple(string.punctuation) + tuple([str(k) for k in range(10)]) + tuple('¿')):
        temp.append(word)
    doc.append(temp)

dictionary = []
for i in doc:
# df = convert_to_tidf()
# X_train, X_test, y_train, y_test = train_test_split(df['questions'], df['labels'], random_state=0)
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train)
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# feed = MultinomialNB().fit(X_train_tfidf, y_train)
# return feed, count_vect

# X_test.iloc[0]
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
factory = StopWordRemoverFactory()
stop_word_list = factory.get_stop_words()

# stopwords added
stopwords = stop_word_list + list(punctuation)

# create vectorizer
vect = TfidfVectorizer(sublinear_tf=True, min_df=3, norm='l2', encoding='id',
                       ngram_range=(1, 2), stop_words=stopwords)

# create data
df = convert_to_tidf()
features = vect.fit_transform(df.questions).toarray()
labels = df.id_label

# cross validation technique
X_train, X_test, y_train, y_test = train_test_split(df['questions'], df['labels'], random_state=0)

# import and instantiate CountVectorizer
# vect = CountVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='id', ngram_range=(1, 2), stop_words=stopwords)
    return replaced

def remove_stopwords(text):
    words = tokenizer.tokenize(text)
    filtered_words = [word for word in words if not word in stop_words]
    return " ".join(filtered_words)

def preprocess_text(text, abbreviation_dict):
    abbreviation_replaced = replace_abbreviation(text, abbreviation_dict)
    regex_filtered = regex_filter(abbreviation_replaced)
    stopwords_removed = remove_stopwords(regex_filtered)
    return stopwords_removed

tokenizer = WordPunctTokenizer()
factory = StopWordRemoverFactory()
stop_words = factory.get_stop_words()

remove_mentions = r'@[A-Za-z0-9_]+'
remove_links = r'https?://[A-Za-z0-9./]+'
remove_retweets = r'RT'
remove_hashtags = r'#[A-Za-z0-9_]+'
remove_pics = r'pic.twitter.com/[A-Za-z0-9]+'
letters_only = r'[^\w\s-]'
combined_pat = r'|'.join((remove_mentions, remove_links, remove_retweets, remove_hashtags, remove_pics, letters_only))

data_path = 'data/all_tweets.csv'
tweet_data = pd.read_csv(data_path)
abbreviation_dict = get_abbreviated_dict()
clean_tweets = tweet_data['tweet'].apply(lambda x: preprocess_text(x, abbreviation_dict))
def hoax_detection():
    # Reading data as pandas dataframe
    frame = pd.read_csv('MasterBeritaAfterCleanCombined.csv', error_bad_lines=False, encoding='latin1')
    frame2 = pd.read_csv('new_TestData.csv', error_bad_lines=False, encoding='latin1')
    # TODO: remove this line
    # frame = frame.head(5)
    berita = ''
    berita = stem(berita)
    data = {'no': ['1'], 'berita': [berita], 'tagging': ['Hoax']}
    # frame2 = pd.DataFrame(data, columns=['no','berita','tagging'])

    # Inspecting shape
    frame.shape
    frame2.shape
    # Inspecting top 5 rows
    frame.head()
    frame2.head()
    # Setting the DataFrame index (row labels) using one or more existing columns
    frame = frame.set_index("no")
    frame.head()
    frame2 = frame2.set_index("no")
    frame2.head()
    y = frame.tagging
    y.head()
    y2 = frame2.tagging
    frame.drop("tagging", axis=1)
    frame.head()
    frame2.drop("tagging", axis=1)
    # print(frame['berita'])
    # print(frame['berita'])
    X_train = frame['berita']
    y_train = y
    print(X_train.shape)
    print(y_train.shape)
    # print(X_train)
    # print(y_train)
    # print(len(X_train))
    # print(len(y_train))
    # uux_train, X_test, uuy_train, y_test = train_test_split(frame2['berita'], y2, test_size=0.33, random_state=53)
    X_test = frame2['berita']
    y_test = y2
    print(len(X_test))
    # stemming
    # print(frame['berita'][0])
    # print(frame2['berita'])
    X_train.head()
    y_train.head()
    X_train, X_test, y_train, y_test = train_test_split(frame['berita'], y, test_size=0.33, random_state=53)

    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()
    # count_vectorizer = case folding, tokenizing, remove stopwords
    # analyze = count_vectorizer.build_analyzer()
    # analyze("Saya mau MAKAN dimakan di tempat makan")
    # print(count_vectorizer)
    # count_vectorizer = CountVectorizer(lowercase=True, stop_words=frozenset(stopwords))
    # Fit and transform the training data.
    # count_train = count_vectorizer.fit_transform(X_train)
    # print(count_train)
    # Transform the test set
    # count_test = count_vectorizer.transform(X_test)

    # Initialize the `tfidf_vectorizer`
    tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words=frozenset(stopwords), max_df=0.7)
    # Fit and transform the training data
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    # Transform the test set
    tfidf_test = tfidf_vectorizer.transform(X_test)
    print(tfidf_test)
    print('separator')
    # Get the feature names of `tfidf_vectorizer`
    print(tfidf_vectorizer.get_feature_names()[-20:])
    tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
    # tfidf_df.to_excel('output-hoax-only.xlsx')
    # print(tfidf_df)
    # Get the feature names of `count_vectorizer`
    # print(count_vectorizer.get_feature_names()[0:10])

    import matplotlib.pyplot as plt

    def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
        """
        See full source and example:
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    start = timeit.default_timer()
    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)
    pred = clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    multinomialpred = pred
    print("#Result:#Multinomial#", pred)
    print("accuracy: %0.3f" % score)
    cm = confusion_matrix(y_test, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time Multinomial: ', stop - start)
    plot_confusion_matrix(cm, classes=['Hoax', 'Valid'],
                          title='MultinomialNB Confusion Matrix (Predict: Test)')
    # y_pred_prob = clf.predict_proba(tfidf_test)
    # print(y_pred_prob)
    # hoax_probs = y_pred_prob[:,1]
    #
    #
    # fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=hoax_probs, pos_label='spam')
    # # Plot
    # plt.plot(fpr,tpr, color='red')
    # plt.title('Receiver Operating Characteristic Curve', size=20)
    # plt.plot([0, 1], [0, 1], color='green', linestyle=':')
    # plt.xlabel('False Positive Rate', size=15)
    # plt.ylabel('True Positive Rate', size=15)
    # plt.show()

    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)
    pred = clf.predict(tfidf_train)
    score = accuracy_score(y_train, pred)
    multinomialpred = pred
    cm = confusion_matrix(y_train, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    plot_confusion_matrix(cm, classes=['Hoax', 'Valid'],
                          title='MultinomialNB Confusion Matrix (Predict: Training)')

    start = timeit.default_timer()
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    passiveaggressivepred = pred
    print("#Result:#PassiveAggressiveClassifier#", pred)
    print("accuracy: %0.3f" % score)
    cm = confusion_matrix(y_test, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(cm, classes=['Hoax', 'Valid'],
                          title='PassiveAggressiveClassifier Confusion Matrix (Predict: Test)')

    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_train)
    score = accuracy_score(y_train, pred)
    passiveaggressivepred = pred
    cm = confusion_matrix(y_train, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(cm, classes=['Hoax', 'Valid'],
                          title='PassiveAggressiveClassifier Confusion Matrix (Predict: Training)')

    start = timeit.default_timer()
    linear_clf_svm = svm.SVC()
    linear_clf_svm.fit(tfidf_train, y_train)
    pred = linear_clf_svm.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)
    print("#Result:#SVM#", pred)
    svmpred = pred
    cm = confusion_matrix(y_test, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm, classes=['Hoax', 'Valid'],
                          title='SVM Confusion Matrix (Predict: Test)')

    linear_clf_svm = svm.SVC()
    linear_clf_svm.fit(tfidf_train, y_train)
    pred = linear_clf_svm.predict(tfidf_train)
    score = accuracy_score(y_train, pred)
    svmpred = pred
    cm = confusion_matrix(y_train, pred, labels=['Hoax', 'Valid'])
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm, classes=['Hoax', 'Valid'],
                          title='SVM Confusion Matrix (Predict: Training)')

    def most_informative_feature_for_binary_classification(vectorizer, classifier,
                                                           n=100):
        """
        See: https://stackoverflow.com/a/26980472
        Identify most important features if given a vectorizer and binary classifier.
        Set n to the number of weighted features you would like to show.
        (Note: current implementation merely prints and does not return top classes.)
        """
        class_labels = classifier.classes_
        feature_names = vectorizer.get_feature_names()
        topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
        topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]
        for coef, feat in topn_class1:
            print(class_labels[0], coef, feat)
        print()
        for coef, feat in reversed(topn_class2):
            print(class_labels[1], coef, feat)

    print('y_test')
    print(y_test)
    # print('score')
    # print(score)
    # y_pred_prob = clf.predict_proba(tfidf_test)
    # spam_probs = y_pred_prob[:,1]
    # print(spam_probs)
    #
    # # Build confusion metrics
    # fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=spam_probs, pos_label='spam')
    # # Plot
    # plt.plot(fpr,tpr, color='red')
    # plt.title('Receiver Operating Characteristic Curve', size=20)
    # plt.plot([0, 1], [0, 1], color='green', linestyle=':')
    # plt.xlabel('False Positive Rate', size=15)
    # plt.ylabel('True Positive Rate', size=15)
    # plt.show()

    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_test, linear_clf.decision_function(tfidf_test), pos_label='neg')
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
             label='threshold zero(default)', fillstyle='none', c='k', mew=2)
    plt.plot([0, 1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
    plt.legend(loc=4)
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (recall)')
    plt.title('roc_curve')
    plt.show()

    from sklearn.metrics import auc
    print('AUC score is: ', auc(fpr, tpr))

    # plot precision recall curve Multinomial
    # disp = plot_precision_recall_curve(linear_clf, tfidf_test, y_test)
    # y_score = linear_clf.decision_function(X_test)
    # average_precision = average_precision_score(y_test, y_score)
    # disp.ax_.set_title('2-class Precision-Recall curve: '
    #                    'AP={0:0.2f}'.format(average_precision))
    # disp.show()
    #
    # most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

    feature_names = tfidf_vectorizer.get_feature_names()
    sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]
    ### Most fake
    sorted(zip(clf.coef_[0], feature_names))[:20]
    tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
    for i in tokens_with_weights:
        print(i)
        break

    result = dict()
    result['multinomial'] = multinomialpred
    result['passive'] = passiveaggressivepred
    result['svm'] = svmpred
    # print(result)
    return result
""" # coding: utf-8 # In[73]: from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory import pandas as pd # In[74]: stemm = StemmerFactory() stemmer = stemm.create_stemmer() stop = StopWordRemoverFactory() stopwords = stop.get_stop_words() # In[75]: data = pd.read_csv("D:\dataset_textmining\dataset4.csv", encoding="ISO-8859-1") dataset_uji = pd.read_csv("D:\dataset_textmining\datauji4.csv", encoding="ISO-8859-1") # ### Get komentar # In[76]: desc = data.loc[:, 'Komentar'] dataset = data.loc[:, ["Komentar", "Hasil Akhir"]] data_uji = dataset_uji.loc[:, 'Komentar']
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if the word starts with an alphabet or number
        val = re.search(r"^[a-zA-Z0-9][a-zA-Z0-9-]*$", w)
        # add tokens
        if (w in ['b', 'rt', 'at', 'user', 'url'] or val is None or len(w) <= 3):
            continue
        else:
            tokens.append(w)
    return tokens

# List of Stop Words
factory2 = StopWordRemoverFactory()
stopwords = factory2.get_stop_words()

# Load the Blog article
data = pd.read_csv(open(r"D:\Kuliah\Big Data\Merge\CrawlingJaktim.csv"))
file = data['Tweets'].tolist()

# Word Vectorization
print("Memulai proses vektorisasi kata...")
vectorizer_count = CountVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer,
                                   stop_words=stopwords, min_df=2, max_df=0.95)
vectorizer_tfidf = TfidfVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer,
                                   stop_words=stopwords,
def __init__(self):
    # get_stop_words() is an instance method, so instantiate the factory first
    self._stopwords = StopWordRemoverFactory().get_stop_words()
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string

factory_stopwords = StopWordRemoverFactory()
stopwords = factory_stopwords.get_stop_words()
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()

def clean_text(text):
    # removing punctuation
    for c in string.punctuation:
        text = text.replace(c, "")
    # removing excessive whitespace
    text = " ".join(text.split())
    # text to array of words
    words = text.split()
    # removing stopwords
    words = [word for word in words if word not in stopwords]
    # stemming each word in the query
    words = [stemmer.stem(word) for word in words]
    return words
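# A brief usage sketch for clean_text above; the query string is hypothetical.
# Stopwords such as "yang" should be dropped and the remaining words reduced to
# their stems by the Sastrawi stemmer.
print(clean_text("Pemrosesan teks yang menggunakan Sastrawi!"))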