def getCityTopWords(trainCity0Rss, trainCity1Rss):
    """Print the highest-scoring words for each of two cities.

    Loads the documents of both feeds through ``loadRSSText``, trains the
    classifier from the ``bayes`` module on them, and prints, per city, at
    most 20 vocabulary words whose score is above -6.0, best score first.

    The scores are the per-class values returned by
    ``bayes.trainNavieBayesian``; the negative threshold suggests they are
    log-probabilities (NOTE(review): confirm against ``bayes``).

    :param trainCity0Rss: feed for city 0, handed unchanged to ``loadRSSText``.
    :param trainCity1Rss: feed for city 1, handed unchanged to ``loadRSSText``.
    """
    # loadRSSText also returns the full text, which is not needed here.
    initialDocList, _, cityTypes = loadRSSText(trainCity0Rss, trainCity1Rss)
    vocaList = bayes.createVocabularyList(initialDocList)

    # Mark every training document against vocaList to build the training matrix.
    trainVocabularyMattrix = [
        bayes.checkSignedFeatureList(vocaList, words) for words in initialDocList
    ]

    # The third returned value (the class prior) plays no part in word ranking.
    p_WiBasedOnClass0, p_WiBasedOnClass1, _ = bayes.trainNavieBayesian(
        trainVocabularyMattrix, cityTypes)

    print("*******City0最常用20的词汇*********")
    for word in _topWordsAboveThreshold(vocaList, p_WiBasedOnClass0):
        print(word)

    print("*******City1最常用的词汇*********")
    for word in _topWordsAboveThreshold(vocaList, p_WiBasedOnClass1):
        print(word)


def _topWordsAboveThreshold(vocaList, wordScores, threshold=-6.0, limit=20):
    """Return up to ``limit`` words scoring above ``threshold``, best first."""
    candidates = [
        (score, word)
        for word, score in zip(vocaList, wordScores)
        if score > threshold
    ]
    # Rank before truncating; otherwise the result depends on vocabulary
    # order rather than on the score. list.sort is stable, so equally scored
    # words keep their vocabulary order.
    candidates.sort(key=lambda pair: pair[0], reverse=True)
    return [word for _, word in candidates[:limit]]
def trainNavieBayesianTest():
    """Train on the sample data set from ``bayes`` and print the results.

    Loads documents and class labels via ``bayes.loadDataSet``, builds the
    vocabulary, encodes every document against it, and prints the three
    values returned by ``bayes.trainNavieBayesian`` in the order returned.
    """
    documents, labels = bayes.loadDataSet()
    vocabulary = bayes.createVocabularyList(documents)

    # Mark each document's features against the vocabulary as 0/1.
    featureRows = [
        bayes.checkSignedFeatureList(vocabulary, doc) for doc in documents
    ]

    trained = bayes.trainNavieBayesian(featureRows, labels)
    class0Scores, class1Scores, abusivePrior = trained

    # Same output as printing the value followed by '\n': value, a space,
    # the newline, then print's own trailing newline.
    print("%s %s" % (class0Scores, '\n'))
    print(class1Scores)
    print(abusivePrior)