def tarea1(entrenamiento, prueba):
    """Build a NaiveBayes model and a Bayes model and plot each of them.

    The training data is first split in two via ``Main().split``; both
    models receive the training data, the second and first parts of that
    split (in that order), and the test data.

    Args:
        entrenamiento: training data, forwarded unchanged to both models.
        prueba: test data, forwarded unchanged to both models.

    Returns:
        None.
    """
    parte_0, parte_1 = Main().split(entrenamiento)

    # Each model is constructed and plotted before the next one is created.
    NaiveBayes.NaiveBayes(entrenamiento, parte_1, parte_0, prueba).plot()
    Bayes.Bayes(entrenamiento, parte_1, parte_0, prueba).plot()
def _read_labeled_docs(path):
    """Parse a ``label<TAB>word1,word2,...`` file into labels and documents.

    Blank lines and lines whose document field is empty are skipped.

    Args:
        path: path of the tab-separated file to read.

    Returns:
        A ``(labels, docs)`` tuple of parallel lists; each document is the
        list of comma-separated tokens from the second field.

    Raises:
        AssertionError: if a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    labels = []
    docs = []
    with open(path) as f:
        for line in f:
            line = line.replace('\n', '')
            if line == '':
                continue
            comps = line.split('\t')
            # Check the field count *before* touching comps[1]; otherwise a
            # line without a tab fails with an unrelated IndexError.
            assert len(comps) == 2
            if comps[1] == '':
                continue
            labels.append(comps[0])
            docs.append(comps[1].split(','))
    return labels, docs


def test():
    """Train a Bayes model on ``data/train.txt`` and evaluate it on ``data/test.txt``.

    The model is built from ``data/all_words.txt``, trained with the parsed
    training documents (writing/reading ``data/model.txt``), then used to
    predict the test documents; predictions and expected labels are handed
    to ``post_analysis``.
    """
    # Named data_dir rather than dir so the builtin is not shadowed.
    data_dir = 'data/'
    train_file = 'train.txt'
    test_file = 'test.txt'
    word_set_file = 'all_words.txt'
    model_file = 'model.txt'

    labels, doc_matrix = _read_labeled_docs(data_dir + train_file)
    label_ids = set(labels)

    bayes_model = Bayes.Bayes(data_dir + word_set_file)
    bayes_model.train(doc_matrix, labels, list(label_ids),
                      data_dir + model_file)

    # Parse the test file with exactly the same rules as the training file.
    expect_labels, predict_docs = _read_labeled_docs(data_dir + test_file)

    predict_labels = bayes_model.predict(data_dir + model_file, predict_docs)
    post_analysis(predict_labels, expect_labels)
from Bayes import *
import commands
import re

# --- 3b: train on the arxiv training set and predict the test set ---------
print('3b')
bc = Bayes()
bc.train('../data/arxiv/arxiv.train')
bc.predict('../data/arxiv/arxiv.test', 0, 1, 1, 0)

# --- 3c: same data; only the fourth predict() argument differs (10 vs 1) ---
print('3c')
c = Bayes()
c.train('../data/arxiv/arxiv.train')
c.predict('../data/arxiv/arxiv.test', 0, 1, 10, 0)

# --- 3d: split the normalised training file into nfold folds --------------
print('3d')
nfold = 4
s_test = []
s_train = []
for d in range(nfold):
    s_test = []
    s_train = []
    with open('../data/arxiv/arxiv.norm.train', 'r') as f:
        for i, l in enumerate(f):
            # Line i belongs to the test split of fold d when i % nfold == d;
            # every other line goes to that fold's training split.
            (s_test if i % nfold == d else s_train).append(l)
    # Only the test split is written in this part of the script.
    with open('../data/arxiv/arxiv.norm%d.test' % d, 'w') as test:
        test.writelines(s_test)
print("Dictionary classes created")

# Fill both dictionaries, positive first. The branch is re-evaluated per
# dictionary but SIZED_DCT does not change in between, so both dictionaries
# always take the same path.
print("Creating and completing positive and negative dictionaries...")
for dct, done_message in (
    (dictionary1, "Positive dictionary created"),
    (dictionary0, "Negative dictionary created \n \n"),
):
    if SIZED_DCT is False:
        dct.create_dictionary()
    else:
        dct.create_sized_dictionary(SIZE)
    print(done_message)

# Build the classifier from the two dictionaries and the testing set.
print("Creating BAYES class...")
bayes = Bayes(dictionary1, dictionary0, testing_set)
print("Bayes class created")

# predict_sentiments returns the count reported below as "undetermined".
print("Predicting sentiments for testing set...")
nb_undetermined = bayes.predict_sentiments(
    LAPLACE_SMOOTHING, pos_spl_nb, neg_spl_nb
)
print("Prediction of sentiments for testing set done")
print(f"Number of tweets with undetermined sentiments : {nb_undetermined}")

print("Comparing sentiments from the dataset with predicted sentiments...")
metrics, conf_matrix = bayes.compare_sentiments()
import SVM
import lr
import Bayes
import LDA


def _run_all_classifiers():
    """Invoke each module's entry point in order: LDA, Bayes, SVM, then lr."""
    LDA.LDA()
    Bayes.Bayes()
    SVM.svmwch()
    lr.lr()


# Runs at module level, exactly as the bare calls did.
_run_all_classifiers()
def performBayes(inputDataClass, drawPrecisionRecall=False, drawConfusion=False):
    """Train the project's Bayes classifier and report train/test accuracy.

    The classifier is created with ``isNaive=False`` and a ``distribution``
    list holding one ``0`` per feature column (every column of
    ``inputDataClass.Train`` except the last, which is used as the label).
    A commented-out scikit-learn ``GaussianNB`` comparison run used to sit
    at the top of this function.

    Args:
        inputDataClass: object exposing ``Train`` and ``Test`` 2-D arrays
            whose last column is the label.
        drawPrecisionRecall: when true, normalise the classifier's
            per-sample probabilities, generate the ROC output and draw a
            precision-recall curve.
        drawConfusion: when true, visualise the test-set confusion matrix.

    Returns:
        ``(Ytrue, Ypred)`` for the test set.
    """
    print("\nMy Naive Bayes")
    train_data = inputDataClass.Train
    feature_count = train_data.shape[1] - 1

    bayesClassifier = Bayes.Bayes(isNaive=False,
                                  distribution=[0] * feature_count)
    bayesClassifier.train(train_data)
    print("Training of model done.")

    # In this classifier, fit() is the call that produces predictions.
    Ypred = bayesClassifier.fit(train_data)
    Ytrue = train_data[:, -1]
    train_accuracy = performanceAnalyser.calcAccuracyTotal(Ypred, Ytrue)
    print("Training Accuracy = " + str(train_accuracy))

    test_data = inputDataClass.Test
    Ypred = bayesClassifier.fit(test_data)
    Ytrue = test_data[:, -1]
    test_accuracy = performanceAnalyser.calcAccuracyTotal(Ypred, Ytrue)
    print("Testing Accuracy = " + str(test_accuracy))
    print("Prediction done.")

    if drawConfusion:
        Visualization.visualizeConfusion(
            performanceAnalyser.getConfusionMatrix(Ytrue, Ypred))

    if drawPrecisionRecall:
        threshold = np.arange(0.9, 0.1, -0.1)
        probas = bayesClassifier.get_probas()

        # Rescale every per-sample dict in place so its values sum to one.
        for dic in probas:
            sums = sum(dic.values(), 0.0)
            for item in dic:
                dic[item] = dic[item] / sums

        roc = ROC.Roc(Ytrue, probas, threshold, '')
        roc.Roc_gen()

        precision, recall, _ = precision_recall_curve(Ytrue, probas)
        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Precision Recall Curve')

    return Ytrue, Ypred
def main():
    """Fit a two-class Fisher linear discriminant and evaluate/plot it.

    Steps, in order:
      1. Load the "boy" (label 1) and "girl" (label 0) training samples and
         compute the projection vector W and the decision threshold.
      2. Classify the separate test files and print the error rate.
      3. Repeat the fit with leave-one-out over the test samples and print
         that error rate.
      4. Plot the ROC curve (figure 3) and the samples with the decision
         line and their projections (figure 5).
    """
    path_boy = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boynew.txt"
    path_girl = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girlnew.txt"
    # height = []
    # weight = []
    # feetsize = []
    x_boy = []
    x_girl = []
    label_boy = []  # 1 denotes male, 0 denotes female
    label_girl = []
    # presumably readdata1 appends the file's samples to the list and the
    # given label to the label list — verify against its definition
    readdata1(path_boy, x_boy, label_boy, 1)
    readdata1(path_girl, x_girl, label_girl, 0)
    x_boy = np.mat(x_boy)
    x_girl = np.mat(x_girl)
    # Per-class column means (1 x n_features matrices).
    m1 = x_boy.mean(0)
    m0 = x_girl.mean(0)
    # Per-class scatter matrices, summed into the within-class scatter Sw.
    S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
    S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
    Sw = S1 + S0
    S_inverse = Sw.I
    # Fisher projection direction: Sw^-1 * (m1 - m0)^T.
    W = S_inverse * (m1 - m0).T
    # Projected class means; the threshold is their midpoint.
    M1 = float(W.T * m1.T)
    M0 = float(W.T * m0.T)
    w_decision0 = (M0 + M1) / 2
    path_boy_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boy.txt"
    path_girl_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girl.txt"
    x = []
    label = []
    readdata1(path_boy_test, x, label, 1)
    readdata1(path_girl_test, x, label, 0)
    label_test = []
    # x is still a plain list here; the product relies on NumPy coercing it
    # — assumes readdata1 stored numeric rows, TODO confirm
    y = x * W
    errorcount = 0
    for i in range(len(label)):
        # A projection above the threshold is classified as label 1.
        if float(y[i] > w_decision0):
            label_test.append(1)
            if label[i] != 1:
                errorcount = errorcount + 1
        else:
            label_test.append(0)
            if label[i] != 0:
                errorcount = errorcount + 1
    e_percentage = errorcount / len(label_test)
    print('fisher测试集的错误率为%f' % e_percentage)
    # Leave-one-out
    loo = LeavePOut(p=1)
    error = 0
    for train, test in loo.split(x, label):
        x_boy = []
        x_girl = []
        label_boy = []  # 1 denotes male, 0 denotes female
        label_girl = []
        # Rebuild the per-class training sets from this fold's indices.
        for i in train:
            if label[i] == 1:
                x_boy.append(x[i])
                label_boy.append(1)
            else:
                x_girl.append(x[i])
                label_girl.append(0)
        x_boy = np.mat(x_boy)
        x_girl = np.mat(x_girl)
        m1 = x_boy.mean(0)
        m0 = x_girl.mean(0)
        S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
        S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
        Sw = S1 + S0
        S_inverse = Sw.I
        W = S_inverse * (m1 - m0).T
        M1 = float(W.T * m1.T)
        M0 = float(W.T * m0.T)
        w_decision0 = (M0 + M1) / 2
        for j in test:
            if float(x[j] * W > w_decision0):
                if label[j] != 1:
                    error = error + 1
            else:
                # NOTE(review): label_test is only appended to in this
                # branch and is not read again afterwards.
                label_test.append(0)
                if label[j] != 0:
                    error = error + 1
    print('fisher留一法的错误率为%f' % (error / len(label)))
    # From here on W and w_decision0 hold the values from the LAST
    # leave-one-out fold, not the fit on the original training files.
    figure(3)
    FPR, TPR = get_roc_fisher(W, w_decision0, x, label)
    plot(FPR, TPR, label='fisher')
    figure(5)
    x1 = np.arange(130, 190, 0.01)
    # Decision line W[0]*x + W[1]*y = w_decision0, solved for y.
    y1 = (w_decision0 - W[0] * x1) / W[1]
    plot(x1, array(y1)[0])
    # Line through the origin along the direction of W.
    plot(x1, x1 * float(W[1]) / float(W[0]))
    for i in range(len(label)):
        # Label-1 samples are drawn in red, label-0 samples in green.
        if label[i] == 1:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='r')
        else:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='g')
        # (a, b) is the foot of the perpendicular from the sample onto the
        # W-direction line; the dashed grey segment joins the two.
        a=(float(x[i][1])+float(x[i][0])*float(W[0])/float(W[1]))/\
          (float(W[1])/float(W[0])+float(W[0])/float(W[1]))
        b = a * float(W[1]) / float(W[0])
        plot([float(x[i][0]), a], [float(x[i][1]), b], '--', color='0.75')
    axis([140, 190, 35, 85])
    # NOTE(review): the original indentation was lost; this call is assumed
    # to belong to main() — confirm against the original file.
    Bayes()
class Test(object): bayes = Bayes.Bayes() def testingNB(self): listOPosts, listClasses = self.bayes.loadDataSet() myVocabList = self.bayes.createVocabList(listOPosts) trainMat = [] for postinDoc in listOPosts: trainMat.append(self.bayes.setOfWords2Vec(myVocabList, postinDoc)) p0V, p1V, pAb = self.bayes.trainNB0(array(trainMat), array(listClasses)) testEntry = ['love', 'my', 'dalmation'] thisDoc = array(self.bayes.setOfWords2Vec(myVocabList, testEntry)) testEntry = ['stupid', 'garbage'] thisDoc = array(self.bayes.setOfWords2Vec(myVocabList, testEntry)) print testEntry, 'classified as: ', self.bayes.classifyNB( thisDoc, p0V, p1V, pAb) # 测试函数 def spamTest(self): docList = [] classList = [] fullText = [] for i in range(1, 26): # 导入文件 并解析成词列表 wordList = self.bayes.textParse( open(Config.DATAS + 'NaiveBayes/email/spam/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(1) wordList = self.bayes.textParse( open(Config.DATAS + 'NaiveBayes/email/ham/%d.txt' % i).read()) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = self.bayes.createVocabList(docList) # create vocabulary trainingSet = range(50) testSet = [] # 随机构建测试函数 for i in range(10): randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] for docIndex in trainingSet: # 测试分类器 trainMat.append( self.bayes.bagOfWords2VecMN(vocabList, docList[docIndex])) trainClasses.append(classList[docIndex]) p0V, p1V, pSpam = self.bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 for docIndex in testSet: # classify the remaining items wordVector = self.bayes.bagOfWords2VecMN(vocabList, docList[docIndex]) if self.bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print "classification error", docList[docIndex] e = float(errorCount) / len(testSet) return e print 'the error rate is: ', e