def classifyNavieBayesianTest():
    """
    Smoke-test the naive Bayes classifier on two hard-coded word lists.

    Loads the sample documents and their labels via ``bayes.loadDataSet()``,
    classifies each sample word list with ``bayes.classifyNavieBayesian`` and
    prints the word list followed by the label the classifier returned.

    :return: None; the results are printed.
    """
    sampleDocs, sampleLabels = bayes.loadDataSet()

    samples = (
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['love', 'stupid'],
    )
    for sampleWords in samples:
        label = bayes.classifyNavieBayesian(sampleDocs, sampleLabels, sampleWords)
        # Single-argument print emits the same text under Python 2 and 3.
        print(' '.join([str(sampleWords), ':', str(label)]))
def filterSpamEmail(testCount=10):
    """
    Filter spam email: measure the classifier's error rate on held-out mail.

    The documents and labels returned by ``loadEmailText()`` are split at
    random: up to ``testCount`` documents are moved into a test set and
    removed from the original lists. What remains is passed, together with
    each test document, to ``bayes.classifyNavieBayesian``. Picking a random
    part of the data for testing and keeping the rest for training is called
    hold-out cross validation.

    Every misclassified test document is printed with its actual and
    predicted class, followed by the overall error rate.

    :param testCount: number of documents to hold out for testing
        (default 10); capped at the number of documents available.
    :return: None; the results are printed.
    """
    initialDocList, classTypes = loadEmailText()

    testDocList = []        # documents held out for testing
    testDocClassList = []   # actual class of each held-out document

    # Never try to hold out more documents than exist.
    holdOutCount = min(testCount, len(initialDocList))
    for _ in range(holdOutCount):
        remaining = len(initialDocList)
        # uniform() may return its upper bound because of floating-point
        # rounding, so clamp the index to the last valid position.
        randomIndex = min(int(random.uniform(0, remaining)), remaining - 1)
        # Move the chosen document and its label from the training data
        # into the test set, keeping both lists aligned.
        testDocClassList.append(classTypes.pop(randomIndex))
        testDocList.append(initialDocList.pop(randomIndex))

    errorCount = 0
    for testDoc, actualType in zip(testDocList, testDocClassList):
        # Classify the held-out mail using the remaining documents.
        classType = bayes.classifyNavieBayesian(
            initialDocList, classTypes, testDoc)
        if classType != actualType:
            # Prediction differs from the actual class: report it.
            # Single-argument print emits the same text under Python 2 and 3.
            print(' '.join(str(part) for part in (
                '分类错误的邮件:', testDoc, '\n属于', actualType,
                '错误分类成了:', classType)))
            errorCount += 1

    # Compute the classification error; an empty test set has no errors.
    errorRate = float(errorCount) / len(testDocList) if testDocList else 0.0
    print(' '.join(['the error rate is :', str(errorRate)]))