def get_lambda(parts, alpha):
    """Print how many legit messages are flagged as spam for each lambda.

    For every exponent ``i`` in ``0, 5, ..., 100`` the classifier is
    cross-validated over ``parts`` with ``lambda_legit = 10 ** i`` and the
    number of samples whose true class is ``LEGIT`` but whose predicted class
    is ``SPAM`` is summed over all folds and printed.

    Args:
        parts: Sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.
        alpha: Forwarded unchanged as the last argument of ``classify``.

    Returns:
        None. The counts are only printed.
    """
    for exponent in range(0, 101, 5):
        lambda_legit = 10 ** exponent
        cnt = 0
        for test_index, test_part in enumerate(parts):
            # Leave the held-out fold out by position.  Comparing folds by
            # value would also discard any other fold with equal content.
            d_train = [
                sample
                for index, part in enumerate(parts)
                if index != test_index
                for sample in part
            ]
            model = train(d_train)
            for answer_class, message in test_part:
                predict_class, _ = classify(
                    model, message, 1, lambda_legit, alpha)
                if predict_class == SPAM and answer_class == LEGIT:
                    cnt += 1
        print("cnt:", cnt, "lambda:", lambda_legit)
def test(): """ the probability reach 1 represent badness """ listpost, listclass = bayes.loaddataset() myvocablist = bayes.createlist(listpost) tmatrix = list() for doc in listpost: vec = bayes.word2vec(myvocablist, doc) tmatrix.append(vec) p0, p1, pa = bayes.train(tmatrix, listclass) testdoc1 = ['love', 'my', 'dalmation'] testvec1 = bayes.word2vec(myvocablist, testdoc1) print testdoc1, 'classify as :', bayes.classify(testvec1, p0, p1, pa) testdoc2 = ['stupid', 'love'] testvec2 = bayes.word2vec(myvocablist, testdoc2) print testdoc2, 'classify as :', bayes.classify(testvec2, p0, p1, pa)
def final_result(text):
    """Translate the classifier score for ``text`` into a sentiment label.

    Returns "Positive" for scores below 0.35, "Negative" for scores above
    0.65 and "Neutral" for everything in between.
    """
    score = classify(text)
    if score < 0.35:
        return "Positive"
    if score > 0.65:
        return "Negative"
    return "Neutral"
def train(pkgDict, settings=None):
    """Fit per-class statistics and return accuracy on the held-out samples.

    For every class key in ``pkgDict`` the covariance matrix and the mean of
    its ``TRAIN`` samples are written back into ``pkgDict[key]`` under
    ``COVARIANCE`` and ``MEAN``.  Each ``TEST`` sample is then classified with
    ``b.classify`` and compared with the key it is stored under.

    Args:
        pkgDict: Mapping of class key to a per-class dict; mutated in place.
        settings: Mapping indexed with ``TEST_STRATEGY``.  Despite the
            ``None`` default it is required whenever ``pkgDict`` is not empty.

    Returns:
        The fraction of test samples classified correctly, as a float.
        ``0.0`` when there are no test samples at all.
    """
    for key in pkgDict:
        class_data = pkgDict[key]
        matrix = [np.asarray(item) for item in class_data[TRAIN]]
        class_data[COVARIANCE] = b.getCovarianceMatrix(
            matrix, settings[TEST_STRATEGY])
        class_data[MEAN] = np.mean(class_data[TRAIN], axis=0)

    classCorrect, count = 0, 0
    for key in pkgDict:
        for sample in pkgDict[key][TEST]:
            classifiedClass = b.classify(
                pkgDict, sample, settings[TEST_STRATEGY])
            if key == classifiedClass:
                classCorrect += 1
            count += 1

    if count == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    # float() keeps the ratio fractional under integer-division semantics.
    return float(classCorrect) / count
def draw_roc(parts, alpha):
    """Plot a step curve of correct vs. incorrect predictions by score.

    The classifier is trained on every sample in ``parts`` and then applied
    to those same samples.  Samples are ordered by the second value returned
    by ``classify``; walking that order, the curve moves up one step for each
    correctly classified sample and right one step for each misclassified
    one.  Steps are scaled so each axis ends at 1.

    NOTE(review): the curve is driven by prediction correctness rather than
    by the true class, which differs from a conventional ROC -- confirm this
    is intended.

    Args:
        parts: Iterable of folds; each fold holds
            ``(answer_class, message)`` pairs.
        alpha: Forwarded unchanged as the last argument of ``classify``.

    Returns:
        None. The plot is shown with ``plt.show()``.
    """
    samples = [sample for part in parts for sample in part]
    model = train(samples)

    outcomes = []
    for answer_class, message in samples:
        predict_class, score = classify(model, message, 1, 1, alpha)
        outcomes.append((score, bool(predict_class == answer_class)))
    # Stable sort on the score only, so ties keep their input order.
    outcomes.sort(key=lambda outcome: outcome[0])

    hits = sum(1 for _, is_hit in outcomes if is_hit)
    misses = len(outcomes) - hits
    # max(..., 1) avoids dividing by zero when one outcome never occurs;
    # 1.0 keeps the step fractional under integer-division semantics.
    step_y = 1.0 / max(hits, 1)
    step_x = 1.0 / max(misses, 1)

    x = 0
    y = 0
    xs = [x]
    ys = [y]
    for _, is_hit in outcomes:
        if is_hit:
            y += step_y
        else:
            x += step_x
        xs.append(x)
        ys.append(y)

    plt.plot(xs, ys)
    plt.show()
def spam_test():
    """Run one hold-out evaluation of the naive Bayes spam classifier.

    Reads ``email/spam/<i>.txt`` and ``email/ham/<i>.txt`` for ``i`` in
    1..25, skipping files that cannot be read or decoded, holds out 10
    randomly chosen documents for testing and trains on the rest.

    Returns:
        The fraction of held-out documents that were misclassified.
    """
    # (path template, class label): 1 marks spam, 0 marks ham.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))

    doc_list = []    # one parsed word list per successfully loaded file
    class_list = []  # class label of the document at the same index
    for i in range(1, 26):
        for path_template, label in sources:
            try:
                with open(path_template % i) as handle:
                    big_string = handle.read()
            except (OSError, UnicodeError):
                # Missing or undecodable file: leave it out of the sample.
                continue
            doc_list.append(text_parse(big_string))
            class_list.append(label)

    text_num = len(doc_list)
    print(text_num)

    vocab_list = bayes.create_vocab_list(doc_list)

    # Move 10 random document indices from the training set to the test set.
    training_set = list(range(text_num))
    test_set = []
    for _ in range(10):
        # randrange never returns the upper bound, so the index is valid.
        rand_index = random.randrange(len(training_set))
        test_set.append(training_set.pop(rand_index))

    # 0/1 presence vectors over the vocabulary for every training document.
    train_mat = [
        bayes.set_of_words2vec(vocab_list, doc_list[doc_index])
        for doc_index in training_set
    ]
    train_class = [class_list[doc_index] for doc_index in training_set]
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat), array(train_class))

    error_count = 0
    for doc_index in test_set:
        word_vec = bayes.set_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v, p_spam)
        if result != class_list[doc_index]:
            error_count += 1

    error_rate = float(error_count / len(test_set))
    print('The error rate is :', error_rate)
    return error_rate
def create_answer(data, token):
    """Reply to an incoming message according to its classified category.

    The lower-cased message body is passed through ``get_answer`` and then
    classified.  A 'news' category sends every item from
    ``yandex.Parse_scv()``; 'weather' sends one message per weather field;
    category ``0`` sends a fallback reply.  Any other category sends nothing.

    Args:
        data: Mapping with at least the keys 'user_id' and 'body'.
        token: Forwarded to ``vkapi.send_message``.
    """
    user_id = data['user_id']
    message = get_answer(data['body'].lower())
    category = bayes.classify(sentence=message)[0]

    if category == 'news':
        for news in yandex.Parse_scv():
            vkapi.send_message(user_id, token, news)
        return

    if category == 'weather':
        res = weather.TakeWeather()
        text_fields = (
            ('Погода за окном ', 'status'),
            ('Облачность ', 'description'),
            ('Восход ', 'sunrise'),
            ('Закат ', 'sunset'),
        )
        # Each value is looked up right before sending, one message at a time.
        for prefix, field in text_fields:
            vkapi.send_message(user_id, token, prefix + res[field])
        vkapi.send_message(
            user_id, token, 'Скорость ветра = ' + str(res['windspeed']))
        return

    if category == 0:
        vkapi.send_message(user_id, token, 'Простите, я вас не понимаю')
def emailTest(): # 储存所有文档,以一个列表的形式保存 allDoclist = [] # 储存所有的邮件,每个邮件属于一个列表 allEmail = [] # 储存每封邮件的类别 classList = [] for i in range(1, 26): with open('../email/spam/%d.txt' % i) as text: wordList = textParse(text.read()) allDoclist.append(wordList) allEmail.extend(wordList) classList.append(1) with open('../email/ham/%d.txt' % i) as text: wordList = textParse(text.read()) allDoclist.append(wordList) allEmail.extend(wordList) classList.append(0) # 将所有文档中的单词去除重复 newWordsList = bayes.createNewList(allDoclist) # 邮件总数为50,从中划分测试集和训练集 trainingSet = range(50) testSet = [] for i in range(10): # 随机数最开始的浮点数没有取到首尾,取Int后有首尾0,最后 randIndex = int(random.uniform(0, len(trainingSet))) # 添加测试集 testSet.append(trainingSet[randIndex]) # 删除训练集中对应部分 del trainingSet[randIndex] trainingVec = [] trainClass = [] for trainingIndex in trainingSet: trainingVec.append( bayes.words2Vec(newWordsList, allDoclist[trainingIndex])) trainClass.append(classList[trainingIndex]) vecP1, vecP0, p1 = bayes.trainNB(trainingVec, trainClass) errorCount = 0 for testIndex in testSet: testVec = bayes.words2Vec(newWordsList, allDoclist[testIndex]) if bayes.classify(testVec, vecP1, vecP0, p1) != classList[testIndex]: errorCount += 1 print 'classification error: ', allDoclist[testIndex] print 'The error rate is: ', float(errorCount) / len(testSet)
def local_words(feed1, feed0):
    """Train and evaluate a classifier that tells two RSS feeds apart.

    The same number of entries is taken from each feed, the words returned
    by ``cal_most_freq`` are removed from the vocabulary, 20 random
    documents are held out for testing and the rest are used for training.

    NOTE(review): parsing uses ``bayes_2`` while everything else uses
    ``bayes``; kept as found -- confirm both modules are intended.

    Args:
        feed1: Parsed feed whose entries are labelled 1.
        feed0: Parsed feed whose entries are labelled 0.
            Both must expose ``feed['entries'][i]['summary']``.

    Returns:
        Tuple ``(vocab_list, p_0v, p_1v)`` with the pruned vocabulary and the
        first two values returned by ``bayes.train_nb0``.
    """
    doc_list = []
    class_list = []
    full_test = []  # every word from every entry, duplicates included
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    print(min_len)
    for i in range(min_len):
        # Alternate feeds so documents 2*i and 2*i + 1 come from entry i.
        for feed, label in ((feed1, 1), (feed0, 0)):
            word_list = bayes_2.text_parse(feed['entries'][i]['summary'])
            doc_list.append(word_list)
            full_test.extend(word_list)
            class_list.append(label)

    vocab_list = bayes.create_vocab_list(doc_list)
    # Presumably the 30 most frequent words; drop them from the vocabulary.
    top30_words = cal_most_freq(vocab_list, full_test)
    for pair in top30_words:
        if pair[0] in vocab_list:
            vocab_list.remove(pair[0])

    # Move 20 random document indices from the training set to the test set.
    training_set = list(range(2 * min_len))
    test_set = []
    for _ in range(20):
        rand_index = random.randrange(len(training_set))
        test_set.append(training_set.pop(rand_index))

    train_mat = []
    train_class = []
    # Iterate the training indices.  The previous code looped over the
    # documents themselves and then used each document as a list index.
    for doc_index in training_set:
        train_mat.append(
            bayes.bag_of_words2vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat), array(train_class))

    error_count = 0
    for doc_index in test_set:
        word_vec = bayes.bag_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v, p_spam)
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is:', float(error_count / len(test_set)))
    return vocab_list, p_0v, p_1v
def get_best_alpha(parts):
    """Return the ``alpha`` with the best cross-validated accuracy.

    Candidates are ``1, 0.1, ..., 1e-9``.  For each candidate every fold in
    ``parts`` is held out once, the classifier is trained on the remaining
    folds, and the per-fold accuracies are summed and divided by
    ``COUNT_PARTS``.  Progress and the final choice are printed.

    Args:
        parts: Sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.

    Returns:
        The candidate with the highest mean accuracy; on ties the earlier
        (larger) candidate wins.
    """
    best_accuracy = 0
    best_alpha = 0
    for alpha_degree in range(0, 10):
        # 1.0 keeps the quotient fractional under integer-division semantics.
        alpha = 1.0 / (10 ** alpha_degree)
        sum_accuracy_score = 0
        for test_index, d_test in enumerate(parts):
            # Leave the held-out fold out by position.  Comparing folds by
            # value would also discard any other fold with equal content.
            d_train = [
                sample
                for index, part in enumerate(parts)
                if index != test_index
                for sample in part
            ]
            model = train(d_train)
            predict = []
            answer = []
            for answer_class, message in d_test:
                predict_class, _ = classify(model, message, 1, 1, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))
            sum_accuracy_score += accuracy_score(predict, answer)
        accuracy = sum_accuracy_score / COUNT_PARTS
        if best_accuracy < accuracy:
            best_accuracy = accuracy
            best_alpha = alpha
        print('alpha:', alpha, 'accuracy:', accuracy)
    print("__________________________________________")
    print("Best alpha:", best_alpha)
    print("Best accuracy:", best_accuracy)
    print("__________________________________________")
    return best_alpha
def main(keyword):
    """Count positive, negative and neutral tweets for ``keyword``.

    Each fetched tweet is scored with ``classify``: a score below 0.35 counts
    as positive, above 0.65 as negative, anything else as neutral.

    Args:
        keyword: Search term passed to ``twitter_fetch``.

    Returns:
        Tuple ``(pos, neg, net)``.  All three are 0 when fetching fails.
    """
    pos, neg, net = 0, 0, 0
    try:
        tweets = twitter_fetch(keyword)
    except Exception:
        # Without an early return the loop below would hit an undefined
        # ``tweets`` and mask the real failure with a NameError.
        print("Connection Error")
        return pos, neg, net

    for each_tweet in tweets:
        result = classify(each_tweet)
        if result < 0.35:
            pos += 1
        elif result > 0.65:
            neg += 1
        else:
            net += 1
    return pos, neg, net
def draw_accuracy_from_lambda(parts, alpha):
    """Plot cross-validated accuracy against the legit-lambda exponent.

    For every exponent ``i`` in ``0, 5, ..., 100`` the classifier is
    cross-validated over ``parts`` with ``lambda_legit = 10 ** i``.  The
    summed fold accuracies divided by ``COUNT_PARTS`` are printed and then
    plotted against ``i``.

    Args:
        parts: Sequence of folds; each fold is a list of
            ``(answer_class, message)`` pairs.
        alpha: Forwarded unchanged as the last argument of ``classify``.

    Returns:
        None. The plot is shown with ``plt.show()``.
    """
    all_lambda_legit = []
    all_accuracy = []
    for exponent in range(0, 101, 5):
        lambda_legit = 10 ** exponent
        sum_accuracy_score = 0
        for test_index, test_part in enumerate(parts):
            # Leave the held-out fold out by position.  Comparing folds by
            # value would also discard any other fold with equal content.
            d_train = [
                sample
                for index, part in enumerate(parts)
                if index != test_index
                for sample in part
            ]
            model = train(d_train)
            predict = []
            answer = []
            for answer_class, message in test_part:
                predict_class, _ = classify(
                    model, message, 1, lambda_legit, alpha)
                predict.append(int(predict_class == SPAM))
                answer.append(int(answer_class == SPAM))
            sum_accuracy_score += accuracy_score(predict, answer)
        accuracy = sum_accuracy_score / COUNT_PARTS
        # The x axis carries the exponent, not the lambda value itself.
        all_lambda_legit.append(exponent)
        all_accuracy.append(accuracy)
        print("lambda:", lambda_legit, "accuracy:", accuracy)
    plt.plot(all_lambda_legit, all_accuracy)
    plt.xlabel('10^x lambda legit')
    plt.ylabel('Accuracy')
    plt.show()
import bayes

# Load the sample corpus and build its vocabulary.
dataset, labels = bayes.load_dataset()
print(dataset)
print(labels)
vocab_list = bayes.create_vocab_list(dataset)
print(vocab_list)

# One vocabulary-sized vector per document.
matrix = [bayes.words_set_to_vec(vocab_list, document) for document in dataset]
print(matrix)

# Train and show the three values returned by the trainer.
p_0_v, p_1_v, p_ab = bayes.train(matrix, labels)
for estimate in (p_0_v, p_1_v, p_ab):
    print(estimate)

# Classify two hand-written documents and print each intermediate value.
for test in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
    print('<--->')
    vec = bayes.words_set_to_vec(vocab_list, test)
    classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
    print(test)
    print(vec)
    print(classify)
def validate_classification(roc_step=toolkit.NUM('0.001')):
    """Compute the ROC.

    Classifies every ham and spam test file at each threshold from
    ``toolkit.ZERO`` to ``toolkit.ONE`` in increments of ``roc_step``, tallies
    correct and incorrect predictions per predicted class and threshold, and
    writes the resulting point lists to ``ham_roc.dat`` and ``spam_roc.dat``.

    Args:
        roc_step: Threshold increment (a ``toolkit.NUM`` value).

    Returns:
        Tuple ``(true, false, ham_roc, spam_roc)``: the two tally dicts,
        keyed by predicted class and then by threshold, and the two lists of
        ``(false ratio, true ratio)`` pairs.
    """
    # Tallies are indexed as tally[predicted class][threshold].
    true = dict()
    false = dict()
    true[bayes.HAM] = defaultdict(lambda: toolkit.NUM(0))
    true[bayes.SPAM] = defaultdict(lambda: toolkit.NUM(0))
    false[bayes.HAM] = defaultdict(lambda: toolkit.NUM(0))
    false[bayes.SPAM] = defaultdict(lambda: toolkit.NUM(0))
    ham_roc = list()
    spam_roc = list()
    # Compute best features.
    print "Computing the 300 most characteristic features"
    print "Therefore we compute the mutual information of each word with respect to the classification"
    print ""
    # Keep the last 300 entries, then only the first element of each entry.
    best_features = features.best_features()[-300:]
    best_features = map(operator.itemgetter(0), best_features)
    ham_files = toolkit.get_files(bayes.HAM + bayes.TEST)
    spam_files = toolkit.get_files(bayes.SPAM + bayes.TEST)
    ham_count = len(ham_files)
    spam_count = len(spam_files)
    # Pair every test file with its expected class.
    test_samples = zip(ham_files + spam_files,
                       ham_count * [bayes.HAM] + spam_count * [bayes.SPAM])
    ham_count = toolkit.NUM(ham_count)
    spam_count = toolkit.NUM(spam_count)
    from math import log, ceil
    total_count = ham_count + spam_count
    # Width used to zero-pad the running file counter in the progress line.
    total_digits = ceil(log(total_count, 10))
    print_msg = "[%%s] Processing file %%0%dd of %d: %%s " % (total_digits, total_count)
    count = 0
    for filename, clss in test_samples:
        count += 1
        print print_msg % (strftime("%H:%M:%S", gmtime()), count, filename)
        # Sweep the threshold for this file.
        threshold = toolkit.ZERO
        while threshold <= toolkit.ONE:
            classification = bayes.classify(filename, best_features, threshold)
            if (classification == clss):
                true[classification][threshold] += toolkit.ONE
            else:
                false[classification][threshold] += toolkit.ONE
            threshold += roc_step
    # Second sweep: turn the tallies into one ratio pair per threshold.
    # NOTE(review): the keys are thresholds built by repeated addition, so
    # this relies on toolkit.NUM adding exactly -- confirm.
    threshold = toolkit.ZERO
    while threshold <= toolkit.ONE:
        total_false = toolkit.NUM(false[bayes.HAM][threshold] + false[bayes.SPAM][threshold])
        total_true = toolkit.NUM(true[bayes.HAM][threshold] + true[bayes.SPAM][threshold])
        # NOTE(review): total_false or total_true may be zero at some
        # threshold; the effect depends on toolkit.NUM division -- confirm.
        ham_roc.append((false[bayes.HAM][threshold] / total_false,
                        true[bayes.HAM][threshold] / total_true))
        spam_roc.append((false[bayes.SPAM][threshold] / total_false, 
                         true[bayes.SPAM][threshold] / total_true))
        threshold += roc_step
    #roc.reverse()
    print len(ham_roc)
    # One "false true" pair per line in each output file.
    hamfile = open('ham_roc.dat', 'w')
    spamfile = open('spam_roc.dat', 'w')
    for h_e, s_e in zip(ham_roc, spam_roc):
        hamfile.write(str(h_e[0]) + ' ' + str(h_e[1]) + '\n')
        hamfile.flush()
        spamfile.write(str(s_e[0]) + ' ' + str(s_e[1]) + '\n')
        spamfile.flush()
    hamfile.close()
    spamfile.close()
    return true, false, ham_roc, spam_roc
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
@Author Amit Joshi
'''
import sys

from bayes import classify
from t2l import txt_to_list

# Python 2 only: make utf8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

text = raw_input("Enter Nepali Sentence: ")
result = classify(text)

# Below 0.35 is positive, above 0.65 is negative, anything else is neutral.
if result < 0.35:
    verdict = "\nPositive\n"
elif result > 0.65:
    verdict = "\nNegative\n"
else:
    verdict = "\nNeutral\n"
print(verdict)
ham = getArticles('data/email/ham.pickle')
spam = getArticles('data/email/spam.pickle')

# Shuffle each class, then hold out the first third of each for testing.
random.shuffle(ham)
random.shuffle(spam)
hamTestLen = len(ham) // 3
spamTestLen = len(spam) // 3

testData = ham[:hamTestLen] + spam[:spamTestLen]
testLabels = ['ham'] * hamTestLen + ['spam'] * spamTestLen
trainData = ham[hamTestLen:] + spam[spamTestLen:]
trainLabels = (['ham'] * (len(ham) - hamTestLen)
               + ['spam'] * (len(spam) - spamTestLen))

# Train the model on the word bag built from the training articles.
wordBag = articles.createWordBag(trainData)
trainData = articles.createDataSet(trainData, wordBag)
model = bayes.train(trainData, trainLabels)

# Count the test articles whose predicted label matches the expected one.
testData = articles.createDataSet(testData, wordBag)
correct = sum(
    1
    for position, sample in enumerate(testData)
    if bayes.classify(sample, model) == testLabels[position]
)
print('Correctness: %d/%d' % (correct, len(testData)))
def main():
    """Run five-fold cross-validation on the SMS corpus and pickle the counts.

    Reads ``data/SMSSpamCollection`` (one message per line), shuffles it and
    splits it into five folds.  For each held-out fold and each cutoff in
    ``stopWords`` a classifier is trained on the other four folds, and the
    confusion counts are pickled to
    ``output/expcutoff<cutoff>xSlice<fold>``.

    Returns:
        None. Results are written to disk only.
    """
    with open('data/SMSSpamCollection') as input_file:
        text = input_file.read().strip().split('\n')

    # stop word cutoffs as per assignment
    stopWords = [0]

    # Shuffle, then deal the messages round-robin into five folds.  Every
    # message lands in exactly one fold and fold sizes differ by at most
    # one; the previous stride arithmetic skipped messages between folds.
    randomData = random.sample(text, len(text))
    randomSlices = [randomData[offset::5] for offset in range(5)]

    # iterate through all the folds and perform training/classification
    for xSlice in range(5):
        testSet = randomSlices[xSlice]
        trainSet = []
        for i in range(5):
            if i != xSlice:
                trainSet.extend(randomSlices[i])

        baseDict = textFeatures.getFeatures(trainSet)

        for cutoff in stopWords:
            # Keep all but the last ``cutoff`` entries of the feature list
            # (presumably the most frequent words -- confirm the ordering).
            wordDict = set(
                baseDict[i][0] for i in range(len(baseDict) - cutoff))

            # build feature vectors (not really, they're hash tables)
            trainSpam, trainHam = textFeatures.vectorize(trainSet, wordDict)
            testSpam, testHam = textFeatures.vectorize(testSet, wordDict)
            probTable, pSpam, pHam = bayes.trainClassifier(
                trainSpam, trainHam, wordDict)

            tp = fp = tn = fn = 0
            for item in testSpam:
                if bayes.classify(probTable, pSpam, pHam, item) == 'spam':
                    tp += 1
                else:
                    fn += 1
            for item in testHam:
                if bayes.classify(probTable, pSpam, pHam, item) == 'ham':
                    tn += 1
                else:
                    fp += 1

            # 'fn' must be the key; it used to be stored as {fn: 'fn'}.
            result = {'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn}

            # write results to temporary file; pickle needs binary mode.
            fName = 'output/expcutoff%dxSlice%d' % (cutoff, xSlice)
            with open(fName, 'wb') as out_file:
                pickle.dump(result, out_file)
def main():
    """Interactively train and test the SMS spam classifier.

    Reads ``data/SMSSpamCollection`` (one message per line), asks how many
    entries to drop from the end of the feature list and whether to run full
    five-fold cross-validation or a single train/test split, then passes the
    per-fold confusion counts to ``printResults``.

    Returns:
        None.
    """
    with open('data/SMSSpamCollection') as input_file:
        text = input_file.read().strip().split('\n')

    cutoff = raw_input("How many words to truncate from dictionary: ")
    try:
        cutoff = int(cutoff)
    except ValueError:
        print("Invalid input, defaulting to 10")
        cutoff = 10

    # Shuffle, then deal the messages round-robin into five folds.  Every
    # message lands in exactly one fold and fold sizes differ by at most
    # one; the previous stride arithmetic skipped messages between folds.
    randomData = random.sample(text, len(text))
    randomSlices = [randomData[offset::5] for offset in range(5)]

    print("Entering 'n' will use 1/5th of data for testing, the rest for training")
    print("Entering 'y' will use full cross validation and may take a while")
    crossValidate = raw_input("Perform cross validation? (y/n): ").lower()
    if crossValidate not in ('y', 'n'):
        print("Invalid input, not using cross validation")
    # Only an explicit 'y' runs all five folds.
    limit = 5 if crossValidate == 'y' else 1

    resultList = list()
    for xSlice in range(limit):
        testSet = randomSlices[xSlice]
        trainSet = []
        for i in range(5):
            if i != xSlice:
                trainSet.extend(randomSlices[i])

        print("Building dictionary...")
        baseDict = textFeatures.getFeatures(trainSet)
        # Keep all but the last ``cutoff`` entries of the feature list.
        wordDict = set(
            baseDict[i][0] for i in range(len(baseDict) - cutoff))

        print("Vectorizing documents...")
        trainSpam, trainHam = textFeatures.vectorize(trainSet, wordDict)
        testSpam, testHam = textFeatures.vectorize(testSet, wordDict)

        print("Training classifier...")
        probTable, pSpam, pHam = bayes.trainClassifier(
            trainSpam, trainHam, wordDict)

        tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
        print("Beginning testing...")
        total = len(testSpam) + len(testHam)
        count = 0
        for item in testSpam:
            prediction = bayes.classify(probTable, pSpam, pHam, item)
            if prediction == 'spam':
                tp = tp + 1.0
            else:
                # Spam predicted as ham is a false negative; it used to be
                # added to ``fp``, leaving ``fn`` permanently at zero.
                fn = fn + 1.0
            count = count + 1
            if count % 50 == 0:
                print("%d/%d complete" % (count, total))
        for item in testHam:
            prediction = bayes.classify(probTable, pSpam, pHam, item)
            if prediction == 'ham':
                tn = tn + 1.0
            else:
                fp = fp + 1.0
            count = count + 1
            if count % 50 == 0:
                print("%d/%d complete" % (count, total))
        print("Finished testing..")

        result = {'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn}
        resultList.append(result)

    printResults(resultList)
# Written by Ningyuan Jiang
import sys
import c45
import bayes

if __name__ == '__main__':
    # Every input line becomes a list of whitespace-separated string tokens.
    with open("mushroom.training") as f:
        dataset = [list(map(str, line.split())) for line in f]
    with open("mushroom.test") as f:
        testData = [list(map(str, line.split())) for line in f]

    # Estimate the probabilities from the training rows, then classify and
    # print the test rows one per line.
    dicts, classProb = bayes.computeProbs(dataset)
    testData = bayes.classify(testData, dicts, classProb)
    for row in testData:
        print(row)
bayes.train(t, False) else: bayes.train(t, True) total = total + 1 print total if (total % 100) == 0: query = "SELECT a.article_id, a.prediction, a.like_flag, b.article_text from test_user_activity a, \ test_article b where a.article_id = b.article_id and a.article_id between %s and %s and user_name ='brad'" % (row["article_id"] + 1, (row["article_id"] + 101)) print query cur.execute(query) rows = cur.fetchall() for row in rows: t = row["article_text"] t = unquote_plus(t) prediction = bayes.classify(t) rating = bayes.article_rating(t) print "Pred: %s, Prob: %.4f" % (prediction, rating) print "Prediction for: %d is %s, naive predicted %s and I chose %s. CRM probability was %.4f" % (row["article_id"], prediction, row["prediction"], row["like_flag"], rating) #query = """INSERT into test_predictions (article_id, user_name, like_flag, naive_bayes, crm, crm_prob) VALUES (%s, '%s', %s, '%s', '%s', %s)""" % (row["article_id"], "brad", row["like_flag"], row["prediction"], prediction, probability) query = """UPDATE test_predictions set pg_bayes = '%s', pg_dislike_score = %.4f where user_name = 'brad' and article_id = '%s'""" % (prediction, rating, row["article_id"]) print query cur.execute(query) #spam_message = "Viagra, cialis for $2.59!!! Call 555-54-53" #bayes.train(spam_message, True) # #ham_message = "Paul Graham doesn't need Viagra. He is NP-hard." #bayes.train(ham_message, False) # #m1 = "Cheap viagra for 2.59"