# Training script: fits a Bernoulli naive Bayes classifier on the SMS spam
# corpus using a fixed, previously selected vocabulary, then persists both the
# vectorizer and the trained classifier under trainedmodels/.
import pickle
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB

try:
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib is gone from recent scikit-learn releases;
    # the standalone joblib package is the replacement.
    import joblib

from filter import Filter

# Load features (the vocabulary handed to the vectorizer below).
# NOTE: pickle can execute arbitrary code while loading; only point this at
# vocabulary files produced by this project.
with open('trainedmodels/mi_bow.pkl', 'rb') as file:
    bow = pickle.load(file)

myfilter = Filter()
reObject = re.compile('(.*)\t(.*)')

# The corpus handle is closed as soon as the frame has been built.
# Assumes getFiltered reads the whole file before returning -- TODO confirm.
with open('lib/SMSSpamCollection') as corpus:
    df = myfilter.getFiltered(reObject, corpus)

# Restrict the bag-of-words representation to the pre-selected vocabulary and
# save the vectorizer so the evaluation script can reuse it.
for_burnolli = CountVectorizer(vocabulary=bow)
joblib.dump(for_burnolli, 'trainedmodels/bow_mi.pkl')

vector = for_burnolli.transform(df['data'])
detector = BernoulliNB().fit(vector, df['label'])
joblib.dump(detector, 'trainedmodels/bernoulli.pkl')

# This is accuracy on the training data itself (the same `vector` that was
# used for fitting), not on held-out messages.
predictions = detector.predict(vector)
print("Accuracy :", accuracy_score(df['label'], predictions))
# Evaluation script: loads the persisted vectorizer and Bernoulli classifier
# and reports accuracy on the messages in lib/sms_for_test.txt.
import re

from sklearn.metrics import accuracy_score

try:
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib is gone from recent scikit-learn releases;
    # the standalone joblib package is the replacement.
    import joblib

from filter import Filter

# NOTE: joblib files are pickle-based; only load models produced by this
# project.
bow = joblib.load('trainedmodels/bow_mi.pkl')
bernoulli = joblib.load('trainedmodels/bernoulli.pkl')

reObject = re.compile(r'(.*),(spam|ham)$')
myfil = Filter()

# The test file handle is closed as soon as the frame has been built.
# Assumes getFiltered reads the whole file before returning -- TODO confirm.
with open("lib/sms_for_test.txt") as file:
    msg = myfil.getFiltered(reObject, file)

# NOTE(review): this regex captures the message text first and the spam/ham
# class second. The 'label' column is what gets vectorized here and the 'data'
# column is what gets scored against, so presumably Filter stores capture
# group 1 under 'label' and group 2 under 'data' -- confirm against
# Filter.getFiltered before "fixing" these column names.
vectors = bow.transform(msg['label'])
predictions = bernoulli.predict(vectors)

# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the conventional order keeps this call in line with the other scripts.
print("Accuracy :", accuracy_score(msg['data'], predictions))
# Evaluation script: loads the persisted 20 Newsgroups vectorizer and
# classifier and reports accuracy on the test documents belonging to the two
# categories listed in `allowed`.
import re

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score

try:
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib is gone from recent scikit-learn releases;
    # the standalone joblib package is the replacement.
    import joblib

from filter import Filter

# NOTE: joblib files are pickle-based; only load models produced by this
# project.
bow = joblib.load('trainedmodels/bow_20.pkl')
# Loaded but currently unused: the TF-IDF step further down is disabled.
tfidf = joblib.load('trainedmodels/tfidf_20.pkl')
detector = joblib.load('trainedmodels/newsgroup.pkl')

# Alternative input left over from the SMS experiment:
#   file: "lib/sms_for_test.txt", pattern: r'(.*),(spam|ham)$'
reObject = re.compile(r'(.*)\t(.*)')

# Named msg_filter rather than `filter` so the builtin is not shadowed.
msg_filter = Filter()

# The test file handle is closed as soon as the frame has been built.
# Assumes getFiltered reads the whole file before returning -- TODO confirm.
with open("lib/20ng-test-no-short.txt") as file:
    msg = msg_filter.getFiltered(reObject, file)

# Keep only the documents from the two categories being evaluated.
allowed = ['alt.atheism', 'talk.religion.misc']
msg = msg.loc[msg['label'].isin(allowed)]

msgs_bag_of_word = bow.transform(msg['data'])
# Disabled TF-IDF step: msgs_tfidf = tfidf.transform(msgs_bag_of_word)
predictions = detector.predict(msgs_bag_of_word)
print("Accuracy", accuracy_score(msg['label'], predictions))