def RunMain(): print '************Welcome to the World of Bayes!***********\n' time.clock() t0 = float(time.clock()) # # load data, and save as the format under NaiveBayes. DIR_RESULT = "./Result/" DIR = "./clintontrump-data/" FILENAME_BASIC = "clintontrump." [vocList, wordNum] = ld.LoadData_vocabulary(DIR+FILENAME_BASIC+"vocabulary") [trainX, trainDocNum] = ld.LoadData_bagOfWords(DIR+FILENAME_BASIC+"bagofwords.train") [devX, devDocNum] = ld.LoadData_bagOfWords(DIR+FILENAME_BASIC+"bagofwords.dev") [testX, testDocNum] = ld.LoadData_bagOfWords(DIR+FILENAME_BASIC+"bagofwords.test") str0 = "realDonaldTrump" str1 = "HillaryClinton" trainY = ld.LoadData_labels(DIR+FILENAME_BASIC+"labels.train", str0) devY = ld.LoadData_labels(DIR+FILENAME_BASIC+"labels.dev", str0) t1 = float(time.clock()) print 'Loading data File. using time %.4f s, \n' % (t1-t0) # # define NaiveBayes instance, and calc prior P(y) nbModel = nb.NAIVE_BAYES_MODEL(wordNum, trainDocNum, trainX, trainY) nbModel.estimatePy_MLE() # *******part 1: basic implementation ###### Bernoulli model [berAccuracy, berTestHist, berConfuseMat] = LearnAndTest(nbModel, devX, devY, "Bernoulli") od.WritenFile_dev(DIR_RESULT+"Predict.Bernoulli_basic.dev", berTestHist, str0, str1) Pwy_b = copy.deepcopy(nbModel.Pwy_c) #print 'Bernoulli accuracy is %.4f \nconfuseMatrix is:\n' %(float(berAccuracy)/float(testDocNum)), berConfuseMat t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) ###### Multinomial will go through the similar process. [mulAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Multinomial") od.WritenFile_dev(DIR_RESULT+"Predict.Multinomial_basic.dev", mulTestHist, str0, str1) Pwy_m = copy.deepcopy(nbModel.Pwy_c) #print 'Multinomial accuracy is %.4f \nconfuse matrix is:\n' %(float(mulAccuracy)/float(testDocNum)), mulConfuseMat t3 = float(time.clock()) print 'multinomial Model learn & test, using time %.4f s, \n' % (t3-t2) # ******* part 3: bonus**********since training data will be changed in part2, so part 3 first. ###### Bernoulli model t1 = float(time.clock()) [berAccuracy, berTestHist, berConfuseMat] = bo.LearnAndTest(nbModel, devX, devY, "Bernoulli") od.WritenFile_dev(DIR_RESULT+"Predict.Bernoulli_basic_with_tag.dev", berTestHist, str0, str1) #Pwy_b = copy.deepcopy(nbModel.Pwy_c) #print 'Bernoulli accuracy is %.4f \nconfuseMatrix is:\n' %(float(berAccuracy)/float(testDocNum)), berConfuseMat t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) ###### Multinomial will go through the similar process. t1 = float(time.clock()) [mulAccuracy, mulTestHist, mulConfuseMat] = bo.LearnAndTest(nbModel, devX, devY, "Multinomial") od.WritenFile_dev(DIR_RESULT+"Predict.Multinomial_basic_with_tag.dev", mulTestHist, str0, str1) #Pwy_m = copy.deepcopy(nbModel.Pwy_c) #print 'Multinomial accuracy is %.4f \nconfuse matrix is:\n' %(float(mulAccuracy)/float(testDocNum)), mulConfuseMat t2 = float(time.clock()) print 'multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) #Predict the final ###### Bernoulli model t1 = float(time.clock()) berTestHist = bo.LearnAndPredict(nbModel, testX, "Bernoulli") od.WritenFile_dev(DIR_RESULT+"Predict.Bernoulli_basic_with_tag.test", berTestHist, str0, str1) #Pwy_b = copy.deepcopy(nbModel.Pwy_c) #print 'Bernoulli accuracy is %.4f \nconfuseMatrix is:\n' %(float(berAccuracy)/float(testDocNum)), berConfuseMat t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) ###### Multinomial will go through the similar process. t1 = float(time.clock()) mulTestHist = bo.LearnAndPredict(nbModel, testX, "Multinomial") od.WritenFile_dev(DIR_RESULT+"Predict.Multinomial_basic_with_tag.test", mulTestHist, str0, str1) #Pwy_m = copy.deepcopy(nbModel.Pwy_c) #print 'Multinomial accuracy is %.4f \nconfuse matrix is:\n' %(float(mulAccuracy)/float(testDocNum)), mulConfuseMat t2 = float(time.clock()) print 'multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) ##### Ranking Top ten features topWord_list = [10, 100, 1000, 5000] std_threshold = [0.1, 0.3, 0.5, 0.7, 0.9] tfidf_threshold = [1, 2, 3, 4, 5] for word_num in topWord_list: print '\n * Remove top words based on Bernoulli', word_num [removedIdx, labelVec, redFeaNum] = im.find_top_words(nbModel.classNum, wordNum, word_num, Pwy_b) nbModel.setFeatureLabel(labelVec, redFeaNum) t1 = float(time.clock()) [berAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Bernoulli") t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) t1 = float(time.clock()) [mulAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Multinomial") t2 = float(time.clock()) print 'Multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) print '\n * Remove top words based on Mulinomial', word_num [removedIdx, labelVec, redFeaNum] = im.find_top_words(nbModel.classNum, wordNum, word_num, Pwy_m) nbModel.setFeatureLabel(labelVec, redFeaNum) t1 = float(time.clock()) [berAccuracy, berTestHist, berConfuseMat] = LearnAndTest(nbModel, devX, devY, "Bernoulli") t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) t1 = float(time.clock()) [mulAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Multinomial") t2 = float(time.clock()) print 'Multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) for std_thre in std_threshold: # Find stop words based on STD and doc# print '\n * Remove STD words for Bernoulli', std_thre [removedIdx, labelVec, redFeaNum] = im.find_std_zero_words(wordNum, std_thre, Pwy_b) #showWords(vocList, removedIdx) nbModel.setFeatureLabel(labelVec, redFeaNum) t1 = float(time.clock()) [berAccuracy, berTestHist, berConfuseMat] = LearnAndTest(nbModel, devX, devY, "Bernoulli") t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) t1 = float(time.clock()) [mulAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Multinomial") t2 = float(time.clock()) print 'Multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) # Find stop words based on STD and word# print '\n * Remove STD words for Multinomial', std_thre [removedIdx, labelVec, redFeaNum] = im.find_std_zero_words(wordNum, std_thre, Pwy_m) nbModel.setFeatureLabel(labelVec, redFeaNum) t1 = float(time.clock()) [berAccuracy, berTestHist, berConfuseMat] = LearnAndTest(nbModel, devX, devY, "Bernoulli") t2 = float(time.clock()) print 'Bernoulli Model learn & test, using time %.4f s, \n' % (t2-t1) t1 = float(time.clock()) [mulAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Multinomial") t2 = float(time.clock()) print 'Multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) for tfidf_thre in tfidf_threshold: # Find stop words based on TF/IDF print '\n * Remove low TF/IDF words', tfidf_thre [removedIdx, labelVec, redFeaNum] = im.find_low_tfidf_words(wordNum, tfidf_thre, Pwy_b, Pwy_m) nbModel.setFeatureLabel(labelVec, redFeaNum) t1 = float(time.clock()) [berAccuracy, berTestHist, berConfuseMat] = LearnAndTest(nbModel, devX, devY, "Bernoulli") t2 = float(time.clock()) print 'berAccuracy Model learn & test, using time %.4f s, \n' % (t2-t1) t1 = float(time.clock()) [mulAccuracy, mulTestHist, mulConfuseMat] = LearnAndTest(nbModel, devX, devY, "Multinomial") t2 = float(time.clock()) print 'Multinomial Model learn & test, using time %.4f s, \n' % (t2-t1) # ******** part 2: Priors and overfittings ## different Laplace Smoothing Alpha [testAlpha, testAccuracy] = PriorAndFitting_diffLaplace(nbModel, devX, devY, DIR_RESULT, str0, str1) testAccuracy = np.array(testAccuracy)/float(testDocNum) print testAlpha print testAccuracy od.Save2Figure_semilogs(DIR_RESULT+'laplaceAlpha', 1, testAlpha, [testAccuracy],['log(laplace_alpha)','accuracy'], [1e-5, 10000,0, 1], 1)