def classifyNavieBayesianTest():
    wordsList, classTypes = bayes.loadDataSet()
    inputTestWords = ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
    result = bayes.classifyNavieBayesian(wordsList, classTypes, inputTestWords)
    print inputTestWords, ':', result
    inputTestWords2 = ['love', 'stupid']
    result2 = bayes.classifyNavieBayesian(wordsList, classTypes, inputTestWords2)
    print inputTestWords2, ':', result2
def filterSpamEmail():
    """
    过滤垃圾邮件
    :return:
    """
    initialDocList, classTypes = loadEmailText()
    # 从initialDocList中随机创建10个待测试的文档
    testDocList = []
    # 待测试邮件的类型
    testDocClassList = []
    """
    注意此处随机选择10封email,添加到测试集合,同时将原有的数据集删除,
    这种随机选择数据的一部分作为训练集合,而剩余部分作为测试集合的过程称为
    留存交叉验证:hold-out cross validation
    """
    for i in range(10):
        randomIndex = int(random.uniform(0, len(initialDocList)))
        testDocClassList.append(classTypes[randomIndex])
        testDocList.append(initialDocList[randomIndex])
        del (initialDocList[randomIndex])
        del (classTypes[randomIndex])

    errorCount = 0
    for i in range(len(testDocList)):
        # 对给定的待测试的邮件进行分类
        classType = bayes.classifyNavieBayesian(
                initialDocList, classTypes, testDocList[i])
        if classType != testDocClassList[i]:  # 预测的结果和实际的结果进行比较
            print '分类错误的邮件:', testDocList[i], '\n属于', testDocClassList[i], \
                '错误分类成了:', classType
            errorCount += 1

    # 计算分类的误差
    print 'the error rate is :', float(errorCount) / len(testDocList)