def _rankWordsAboveThreshold(vocaList, wordScores, threshold=-6.0):
    """Return the words scoring above *threshold*, highest score first."""
    candidates = [
        (word, score)
        for word, score in zip(vocaList, wordScores)
        if score > threshold
    ]
    # sorted() is stable, so words with equal scores keep vocabulary order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in candidates]


def getCityTopWords(trainCity0Rss, trainCity1Rss):
    """
    Print the most used words of each of the two cities' RSS feeds.

    The documents are loaded with loadRSSText(), converted to feature
    vectors over the vocabulary built by bayes.createVocabularyList(), and
    fed to bayes.trainNavieBayesian(). For each city, every vocabulary word
    whose per-class score is greater than -6.0 is kept, the kept words are
    ranked by that score (highest first) and the first 20 are printed.

    NOTE(review): the scores are presumably log-probabilities (the -6.0
    threshold suggests so) -- confirm against bayes.trainNavieBayesian.

    :param trainCity0Rss: RSS data of city 0, passed through to loadRSSText
    :param trainCity1Rss: RSS data of city 1, passed through to loadRSSText
    :return: None; the words are only printed.
    """
    initialDocList, fullText, cityTypes = loadRSSText(trainCity0Rss, trainCity1Rss)
    vocaList = bayes.createVocabularyList(initialDocList)
    # Mark every training document against the vocabulary.
    trainVocabularyMattrix = [
        bayes.checkSignedFeatureList(vocaList, words)
        for words in initialDocList
    ]
    p_WiBasedOnClass0, p_WiBasedOnClass1, pAbusive = bayes.trainNavieBayesian(
        trainVocabularyMattrix, cityTypes)

    # Rank before truncating; otherwise the "top 20" would merely be the
    # first 20 qualifying words in vocabulary order.
    topCity0Words = _rankWordsAboveThreshold(vocaList, p_WiBasedOnClass0)
    topCity1Words = _rankWordsAboveThreshold(vocaList, p_WiBasedOnClass1)

    print("*******City0最常用20的词汇*********")
    for word in topCity0Words[:20]:
        print(word)
    print("*******City1最常用的词汇*********")
    for word in topCity1Words[:20]:
        print(word)
def classifyNavieBayesianTest(): wordsList, classTypes = bayes.loadDataSet() inputTestWords = ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'] result = bayes.classifyNavieBayesian(wordsList, classTypes, inputTestWords) print inputTestWords, ':', result inputTestWords2 = ['love', 'stupid'] result2 = bayes.classifyNavieBayesian(wordsList, classTypes, inputTestWords2) print inputTestWords2, ':', result2
def trainNavieBayesianTest(): wordsList, classTypes = bayes.loadDataSet() vocaList = bayes.createVocabularyList(wordsList) # 将feature对应的标记为0,1 trainVocabularyMattrix = [] for words in wordsList: trainVocabularyMattrix.append(bayes.checkSignedFeatureList(vocaList, words)) # print np.array(trainVocabularyMattrix) p_WiBasedOnClass0, p_WiBasedOnClass1, pAbusive = bayes.trainNavieBayesian(trainVocabularyMattrix, classTypes) print p_WiBasedOnClass0, '\n' print p_WiBasedOnClass1 print pAbusive
def filterSpamEmail(): """ 过滤垃圾邮件 :return: """ initialDocList, classTypes = loadEmailText() # 从initialDocList中随机创建10个待测试的文档 testDocList = [] # 待测试邮件的类型 testDocClassList = [] """ 注意此处随机选择10封email,添加到测试集合,同时将原有的数据集删除, 这种随机选择数据的一部分作为训练集合,而剩余部分作为测试集合的过程称为 留存交叉验证:hold-out cross validation """ for i in range(10): randomIndex = int(random.uniform(0, len(initialDocList))) testDocClassList.append(classTypes[randomIndex]) testDocList.append(initialDocList[randomIndex]) del (initialDocList[randomIndex]) del (classTypes[randomIndex]) errorCount = 0 for i in range(len(testDocList)): # 对给定的待测试的邮件进行分类 classType = bayes.classifyNavieBayesian( initialDocList, classTypes, testDocList[i]) if classType != testDocClassList[i]: # 预测的结果和实际的结果进行比较 print '分类错误的邮件:', testDocList[i], '\n属于', testDocClassList[i], \ '错误分类成了:', classType errorCount += 1 # 计算分类的误差 print 'the error rate is :', float(errorCount) / len(testDocList)
def localWordsTest(city0Rss, city1Rss): """ 测试根据输入的text分类城市的准确率 :param city0Rss: :param city1Rss: 过滤垃圾邮件 :return: """ initialDocList, fullText, cityTypes = loadRSSText(city0Rss, city1Rss) voclist = bayes.createVocabularyList(initialDocList) print "未删除高频词汇的词汇表长度:", len(voclist) # 出现频率最高的词汇,例如:I and 等辅助词 deletedVoc = calcFrequentWords(voclist, fullText) # 去除词汇列表的高频词汇 for word in deletedVoc: if word[0] in voclist: voclist.remove(word[0]) print "删除后的词汇表长度:", len(voclist) # 从initialDocList中随机创建10个待测试的文档 testDocList = [] # 待测试邮件的类型 testDocClassList = [] """ 注意此处随机选择10个数据,添加到测试集合,同时将原有的数据集删除, 这种随机选择数据的一部分作为训练集合,而剩余部分作为测试集合的过程称为 留存交叉验证:hold-out cross validation """ for i in range(10): randomIndex = int(random.uniform(0, len(initialDocList))) testDocClassList.append(cityTypes[randomIndex]) testDocList.append(initialDocList[randomIndex]) del (initialDocList[randomIndex]) del (cityTypes[randomIndex]) errorCount = 0 for j in range(len(testDocList)): classType = bayes.classifyNavieBayesian2(voclist, initialDocList, cityTypes, testDocList[j]) if classType != testDocClassList[j]: # 预测的结果和实际的结果进行比较 print "分类错误的信息:", testDocList[j], "\n属于", testDocClassList[j], "错误分类成了:", classType errorCount += 1 # 计算分类的误差 errorRate = float(errorCount) / len(testDocList) print "the error rate is :", errorRate return errorRate
def createWordSetTest():
    """
    Build the vocabulary of the sample data set, printing input and output.

    :return: the vocabulary produced by bayes.createVocabularyList()
    """
    corpus, _labels = bayes.loadDataSet()
    print(corpus)
    vocabulary = bayes.createVocabularyList(corpus)
    print(vocabulary)
    return vocabulary