def localWordsTest(city0Rss, city1Rss): """ 测试根据输入的text分类城市的准确率 :param city0Rss: :param city1Rss: 过滤垃圾邮件 :return: """ initialDocList, fullText, cityTypes = loadRSSText(city0Rss, city1Rss) voclist = bayes.createVocabularyList(initialDocList) print "未删除高频词汇的词汇表长度:", len(voclist) # 出现频率最高的词汇,例如:I and 等辅助词 deletedVoc = calcFrequentWords(voclist, fullText) # 去除词汇列表的高频词汇 for word in deletedVoc: if word[0] in voclist: voclist.remove(word[0]) print "删除后的词汇表长度:", len(voclist) # 从initialDocList中随机创建10个待测试的文档 testDocList = [] # 待测试邮件的类型 testDocClassList = [] """ 注意此处随机选择10个数据,添加到测试集合,同时将原有的数据集删除, 这种随机选择数据的一部分作为训练集合,而剩余部分作为测试集合的过程称为 留存交叉验证:hold-out cross validation """ for i in range(10): randomIndex = int(random.uniform(0, len(initialDocList))) testDocClassList.append(cityTypes[randomIndex]) testDocList.append(initialDocList[randomIndex]) del (initialDocList[randomIndex]) del (cityTypes[randomIndex]) errorCount = 0 for j in range(len(testDocList)): classType = bayes.classifyNavieBayesian2(voclist, initialDocList, cityTypes, testDocList[j]) if classType != testDocClassList[j]: # 预测的结果和实际的结果进行比较 print "分类错误的信息:", testDocList[j], "\n属于", testDocClassList[j], "错误分类成了:", classType errorCount += 1 # 计算分类的误差 errorRate = float(errorCount) / len(testDocList) print "the error rate is :", errorRate return errorRate