import numpy as np
from sklearn.model_selection import StratifiedKFold

import simpleNavie as naiveBayes

k_fold_num = 10
filename = './public/'

# load data: load all the words in all the emails
mailWords, classLables = naiveBayes.loadMailData(filename)

skf = StratifiedKFold(n_splits=k_fold_num)
acc_per_fold = []
f1_per_fold = []
recall_per_fold = []
precision_per_fold = []

for train_index, test_index in skf.split(mailWords, classLables):
    print("train_index->", train_index)
    print("test_index->", test_index)

    # build the vocabulary from the training fold only
    preVocabularyList = naiveBayes.createVocabularyList(
        [mailWords[i] for i in train_index])

    # do wfo filter
    vocabularyList = naiveBayes.wfoFilter(
        preVocabularyList,
        [mailWords[i] for i in train_index],
        [classLables[i] for i in train_index])
    print("vocabularyList finished")

    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, [mailWords[i] for i in train_index])
    print("trainMarkedWords finished")

    testMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, [mailWords[i] for i in test_index])

    # # change it to array
    # trainMarkedWords = np.array(trainMarkedWords)
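# Hedged background sketch (assumption): wfoFilter's implementation is not
# shown in this repository excerpt. One published formulation associated with
# the name WFO (weighted frequency and odds) scores a term t for class c as
#     WFO(t, c) = P(t|c)^lambda * (log(P(t|c) / P(t|not c)))^(1 - lambda)
# when P(t|c) > P(t|not c), and 0 otherwise; the project's wfoFilter may well
# differ. A rough scorer for the spam class could look like this
# (assumed label convention: 1 = spam, 0 = ham).
def wfoScoresSketch(vocabularyList, mailWords, classLables, lam=0.5):
    labels = np.asarray(classLables)
    nSpam = np.sum(labels == 1)
    nHam = np.sum(labels == 0)
    scores = []
    for word in vocabularyList:
        inSpam = sum(1 for i, mail in enumerate(mailWords)
                     if labels[i] == 1 and word in mail)
        inHam = sum(1 for i, mail in enumerate(mailWords)
                    if labels[i] == 0 and word in mail)
        # add-one smoothing keeps the odds ratio finite
        pSpamWord = (inSpam + 1) / (nSpam + 2)
        pHamWord = (inHam + 1) / (nHam + 2)
        if pSpamWord > pHamWord:
            scores.append(pSpamWord ** lam *
                          np.log(pSpamWord / pHamWord) ** (1 - lam))
        else:
            scores.append(0.0)
    return scores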
def trainingAdaboostGetDS(iterateNum=40):
    # fixed set of held-out mail indices used to tune the DS weights
    test_index = [
        2, 6, 7, 8, 13, 16, 19, 29, 35, 37, 40, 42, 43, 45, 46, 49, 51, 52,
        64, 65, 71, 72, 78, 79, 80, 84, 85, 90, 91, 98, 103, 109, 111, 117,
        123, 129, 135, 138, 142, 149, 169, 188, 191, 192, 203, 221, 225, 226,
        229, 232, 236, 243, 250, 254, 257, 258, 259, 264, 268, 281, 298, 300,
        308, 319, 322, 329, 333, 335, 338, 339, 340, 344, 347, 358, 359, 362,
        382, 385, 391, 394, 402, 410, 415, 417, 418, 422, 423, 424, 425, 428,
        437, 441, 456, 461, 462, 470, 472, 477, 480, 481
    ]

    beginTime = datetime.datetime.now()
    filename = './public/'

    # load data: load all the words in all the emails
    mailWords, classLables = naiveBayes.loadMailData(filename)

    preVocabularyList = naiveBayes.createVocabularyList(mailWords)
    # do wfo filter
    vocabularyList = naiveBayes.wfoFilter(preVocabularyList, mailWords,
                                          classLables)
    print("length of vocabularyList", len(vocabularyList))

    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, mailWords)
    print("trainMarkedWords finished")

    # change it to array
    trainMarkedWords = np.array(trainMarkedWords)
    print("data to matrix finished")

    # calculate each probability of spam and ham: P(wi|s), P(wi|h)
    pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    # DS holds one multiplicative weight per vocabulary word; start at 1
    DS = np.ones(len(vocabularyList))
    ds_result = {}
    minErrorRate = np.inf

    for i in range(iterateNum):
        errorCount = 0.0
        for j in test_index:
            testWordsCount = naiveBayes.setOfWordsToVecTor(
                vocabularyList, mailWords[j])
            testWordsMarkedArray = np.array(testWordsCount)

            ps, ph, mailType = naiveBayes.adaboostClassify(
                vocabularyList, pWordsSpamicity, pWordsHealthy, DS, pSpam,
                testWordsMarkedArray)

            if mailType != classLables[j]:
                errorCount += 1
                alpha = ps - ph
                if alpha > 0:
                    # actual: ham; predict: spam -> damp the weights of the
                    # words that appear in this mail
                    DS[testWordsMarkedArray != 0] = np.abs(
                        (DS[testWordsMarkedArray != 0] - np.exp(alpha)) /
                        DS[testWordsMarkedArray != 0])
                else:
                    # actual: spam; predict: ham -> boost the weights of the
                    # words that appear in this mail
                    DS[testWordsMarkedArray != 0] = (
                        DS[testWordsMarkedArray != 0] +
                        np.exp(alpha)) / DS[testWordsMarkedArray != 0]
                print('DS:', DS)

        # error rate over the held-out indices actually evaluated
        errorRate = errorCount / len(test_index)
        if errorRate < minErrorRate:
            minErrorRate = errorRate
            ds_result['minErrorRate'] = minErrorRate
            ds_result['DS'] = DS
        print('# %d, errorcount %d, errorrate %f' % (i, errorCount, errorRate))
        if errorRate == 0.0:
            break

    ds_result['vocabularyList'] = vocabularyList
    ds_result['pWordsSpamicity'] = pWordsSpamicity
    ds_result['pWordsHealthy'] = pWordsHealthy
    ds_result['pSpam'] = pSpam
    return ds_result
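# Hedged usage sketch (not part of the original project): a minimal driver
# that runs the DS tuning loop above, reports the best held-out error rate,
# and persists the learned per-word weights. The file name 'DS.txt' is an
# assumption for illustration only.
if __name__ == '__main__':
    ds_result = trainingAdaboostGetDS(iterateNum=40)
    print('best held-out error rate:', ds_result['minErrorRate'])
    np.savetxt('DS.txt', ds_result['DS'], delimiter='\t')  # save the weights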
import datetime

import numpy as np

import simpleNavie as naiveBayes

beginTime = datetime.datetime.now()
filename = './public/'

# load data: load all the words in all the emails
smsWords, classLables = naiveBayes.loadMailData(filename)

# get the non-repeated features (vocabulary)
preVocabularyList = naiveBayes.createVocabularyList(smsWords)

# do wfo filter
vocabularyList = naiveBayes.wfoFilter(preVocabularyList, smsWords, classLables)
print("length of vocabularyList", len(vocabularyList))

# persist the vocabulary, one word per line
with open('vocabularyList.txt', 'w') as fw:
    for word in vocabularyList:
        fw.write(word + '\n')
print("vocabularyList finished")

# change to vectors: each email becomes one row of trainMarkedWords
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print("trainMarkedWords finished")

# change it to array
trainMarkedWords = np.array(trainMarkedWords)
print("data to matrix finished")

# calculate each probability of spam and ham: P(wi|s), P(wi|h)
pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
    trainMarkedWords, classLables)
print("length of pWordsSpamicity:", len(pWordsSpamicity))
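# Hedged sketch (assumption, not the project's actual implementation):
# trainingNaiveBayes above most likely estimates the spam prior P(spam) and
# smoothed per-word conditionals P(wi|spam) and P(wi|ham) from the 0/1 word
# matrix, roughly along these lines (Laplace smoothing and the 1 = spam /
# 0 = ham label convention are assumptions).
def trainingNaiveBayesSketch(trainMarkedWords, classLables):
    trainMarkedWords = np.asarray(trainMarkedWords, dtype=float)
    labels = np.asarray(classLables)
    pSpam = labels.mean()  # prior P(spam) from the label fractions
    spamRows = trainMarkedWords[labels == 1]
    hamRows = trainMarkedWords[labels == 0]
    # Laplace smoothing: add 1 to every word count, 2 to each denominator
    pWordsSpamicity = (spamRows.sum(axis=0) + 1) / (len(spamRows) + 2)
    pWordsHealthy = (hamRows.sum(axis=0) + 1) / (len(hamRows) + 2)
    return pWordsSpamicity, pWordsHealthy, pSpam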
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score, log_loss)
from sklearn.model_selection import train_test_split

import simpleNavie as naiveBayes

filename = './public/'

# load data: load all the words in all the emails
mailWords, classLables = naiveBayes.loadMailData(filename)

preVocabularyList = naiveBayes.createVocabularyList(mailWords)
# do wfo filter
vocabularyList = naiveBayes.wfoFilter(preVocabularyList, mailWords,
                                      classLables)
print("vocabularyList finished")

trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, mailWords)
print("trainMarkedWords finished")

# train on every mail, evaluate on a small fixed subset of indices
test_index = [2, 6, 7]
X_train = [trainMarkedWords[i] for i in range(len(trainMarkedWords))]
X_test = [trainMarkedWords[i] for i in test_index]
y_train = [classLables[i] for i in range(len(trainMarkedWords))]
y_test = [classLables[i] for i in test_index]

lr = LogisticRegression()
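# Hedged continuation (assumption): the original script stops right after
# constructing the LogisticRegression, so this sketch only illustrates how it
# could be fitted and scored on the small fixed test subset defined above.
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("LogisticRegression precision:", precision_score(y_test, y_pred))
print("LogisticRegression recall:", recall_score(y_test, y_pred))
print("LogisticRegression f1:", f1_score(y_test, y_pred))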
def crossValidateEvaluate():
    beginTime = datetime.datetime.now()
    filename = './public/'

    # load data: load all the words in all the emails
    mailWords, classLables = naiveBayes.loadMailData(filename)

    skf = StratifiedKFold(n_splits=k_fold_num)
    acc_per_fold = []
    f1_per_fold = []
    recall_per_fold = []
    precision_per_fold = []

    for train_index, test_index in skf.split(mailWords, classLables):
        print("train_index->", train_index)
        print("test_index->", test_index)

        preVocabularyList = naiveBayes.createVocabularyList(
            [mailWords[i] for i in train_index])

        # do wfo filter
        vocabularyList = naiveBayes.wfoFilter(
            preVocabularyList,
            [mailWords[i] for i in train_index],
            [classLables[i] for i in train_index])
        # NOTE: the next line discards the wfo-filtered list and evaluates
        # with the unfiltered vocabulary instead
        vocabularyList = preVocabularyList
        print("length of vocabularyList", len(vocabularyList))

        # persist the vocabulary of the current fold, one word per line
        with open('vocabularyList.txt', 'w') as fw:
            for word in vocabularyList:
                fw.write(word + '\n')
        print("vocabularyList finished")

        trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
            vocabularyList, [mailWords[i] for i in train_index])
        print("trainMarkedWords finished")

        # change it to array
        trainMarkedWords = np.array(trainMarkedWords)
        print("data to matrix finished")

        # calculate each probability of spam and ham: P(wi|s), P(wi|h)
        pWordsSpamicity, pWordsHealthy, pSpam = \
            naiveBayes.trainingNaiveBayes(
                trainMarkedWords, [classLables[i] for i in train_index])

        # persist the trained model of the current fold
        with open('pSpam.txt', 'w') as fpSpam:
            fpSpam.write(str(pSpam))
        np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
        np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')

        predict = naiveBayes.predict([mailWords[i] for i in test_index])
        # predict = naiveBayes.adaboostPredict([smsWords[i] for i in test_index])

        acc_per_fold.append(
            accuracy_score([classLables[i] for i in test_index], predict))
        f1_per_fold.append(
            f1_score([classLables[i] for i in test_index], predict))
        recall_per_fold.append(
            recall_score([classLables[i] for i in test_index], predict))
        precision_per_fold.append(
            precision_score([classLables[i] for i in test_index], predict))

        # per-fold progress
        print("acc_per_fold:", acc_per_fold)
        print("f1_per_fold:", f1_per_fold)
        print("recall_per_fold:", recall_per_fold)
        print("precision_per_fold:", precision_per_fold)

    # results over all folds
    print("acc_per_fold:", acc_per_fold)
    print("f1_per_fold:", f1_per_fold)
    print("recall_per_fold:", recall_per_fold)
    print("precision_per_fold:", precision_per_fold)
    print("k-fold:", k_fold_num, " spend:",
          (datetime.datetime.now() - beginTime))
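# Hedged add-on sketch (assumption, not in the original code): the per-fold
# lists above are only printed inside crossValidateEvaluate; if the function
# were changed to return them, a small helper like this could summarize each
# metric across folds.
def summarizeFolds(name, scores):
    scores = np.asarray(scores, dtype=float)
    print("%s: mean=%.4f std=%.4f over %d folds"
          % (name, scores.mean(), scores.std(), len(scores)))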