from numpy import *

import boost


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m,1))/m)
    aggClassEst = mat(zeros((m,1)))
    for i in range(numIt):
        bestStump, error, classEst = boost.buildStump(dataArr, classLabels, D)
        print "D:", D.T
        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print "classEst:", classEst.T
        #compute D for next teration
        #D is the weight of all samples
        expon = multiply(-1*alpha*mat(classLabels).T,classEst)
        D = multiply(D,exp(expon))
        D = D/D.sum()

        #calculate error
        aggClassEst += alpha*classEst
        print "aggClassEst: ", aggClassEst.T
        aggErros = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m,1)))
        errorRate = aggErros.sum()/m
        print "total error: ", errorRate, "\n"
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
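
Every variant on this page calls buildStump (directly or as boost.buildStump), but none of the snippets defines it. A minimal sketch, consistent with the Machine Learning in Action version these examples are based on; the dict keys dim/thresh/ineq match the ones the training loops store and adaClassify reads later:

from numpy import *


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    # classify every sample against one threshold on one feature
    retArray = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D):
    # find the decision stump with the lowest weighted error under D
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf
    for i in range(n):  # every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # every candidate threshold
            for inequal in ['lt', 'gt']:  # both inequality directions
                threshVal = rangeMin + float(j) * stepSize
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr  # error weighted by D
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst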
Example #2
import numpy as np

import boost


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    '''
    AdaBoost training based on single-layer decision trees (stumps)
    input: dataset, class labels, iteration count numIt
    '''
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m, 1)) / m)  # initial data-point weights
    aggClassEst = np.mat(np.zeros(
        (m, 1)))  # cumulative class estimate per point: the linear combination f(x) of base classifiers; np.sign(aggClassEst) is G(x)
    for i in range(numIt):
        bestStump, error, classEst = boost.buildStump(dataArr, classLabels, D)
        print('D: ', D.T)
        alpha = float(0.5 * np.log(
            (1.0 - error) /
            max(error, 1e-16)))  # max(error, 1e-16) guards against division by zero when there are no errors
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print('classEst: ', classEst.T)
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D,
                        np.exp(expon))  # D now matches w_mi * exp(-alpha_m * y_i * Gm(x_i)) in the formula
        D = D / D.sum()  # w_(m+1); D.sum() is the normalizing factor Z_m
        aggClassEst += alpha * classEst
        print('aggClassEst: ', aggClassEst.T)
        aggErrors = np.multiply(
            np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print('total error: ', errorRate, '\n')
        if errorRate == 0.0: break
    return weakClassArr
Example #3

import math

import numpy as np

import boost


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    '''
    Parameters:
    dataArr: input dataset
    classLabels: class labels
    numIt: number of iterations (user-specified)
    Returns:
    weakClassArr: the array of decision stumps
    aggClassEst: the cumulative class estimates
    '''
    # list that collects the decision stumps
    weakClassArr = []
    # number of examples
    m = np.shape(dataArr)[0]
    '''
    D holds the weight of every data point; initially all weights are equal.
    Later iterations increase the weights of misclassified points and
    decrease the weights of correctly classified ones.
    The elements of D always sum to 1.0.
    '''
    D = np.mat(np.ones((m, 1)) / m)
    # cumulative class estimate for every data point
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(numIt):
        '''
        Returns:
        bestStump: the decision stump with the lowest error rate under D
        error: the minimum error rate
        classEst: the estimated class vector
        '''
        bestStump, error, classEst = boost.buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        # alpha tells the overall classifier how much weight this stump's output gets
        alpha = float(0.5 * math.log((1.0 - error) / max(error, 1e-16)))
        # store alpha in the stump's dictionary
        bestStump['alpha'] = alpha
        # append the stump to the list
        weakClassArr.append(bestStump)
        print("classEst: ", classEst.T)
        # 1. compute D for the next iteration
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        # 2. accumulate the error of the aggregate estimate
        aggClassEst += alpha * classEst
        print("aggClassEst: ", aggClassEst.T)
        aggErrors = np.multiply(np.sign(aggClassEst) != \
                                np.mat(classLabels).T, np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate, "\n")
        # stop iterating once the error rate reaches 0
        if errorRate == 0.0: break
    return weakClassArr, aggClassEst
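
Every variant above implements the same re-weighting rule, D_(m+1,i) proportional to D_(m,i) * exp(-alpha_m * y_i * Gm(x_i)). A small standalone check (not from any of the examples) showing the rule in action:

import numpy as np

# true labels and stump predictions for five samples; one mistake at index 1
y = np.array([1.0, 1.0, -1.0, -1.0, 1.0])
h = np.array([1.0, -1.0, -1.0, -1.0, 1.0])
D = np.ones(5) / 5
error = D[h != y].sum()                                   # weighted error = 0.2
alpha = 0.5 * np.log((1.0 - error) / max(error, 1e-16))   # ~0.693
D = D * np.exp(-alpha * y * h)                            # misclassified point is scaled up
D = D / D.sum()
print(alpha, D)  # D[1] == 0.5: the one mistake now carries half the total weight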
Example #4

from numpy import *

import boost


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    # stores the best weak classifier from each iteration
    weakClassArr = []
    m = dataArr.shape[0]
    # every sample starts with weight 1/m
    D = mat(ones((m, 1)) / m)
    aggClassEst = mat(zeros((m, 1)))

    print("##########Training begin .......##############")
    print()
    for i in range(numIt):
        print("===========Epoch %d=========" % i)
        bestStump, error, classEst = boost.buildStump(dataArr, classLabels, D)
        print("D: ", D.T)

        # compute alpha; max(error, 1e-16) substitutes a tiny float so error == 0 cannot cause a division by zero
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha

        # append this iteration's best decision stump to the stump list
        weakClassArr.append(bestStump)
        print("classEst: ", classEst.T)

        # compute the weight vector D for the next iteration:
        # a correctly classified sample gets factor e^(-alpha), a misclassified one e^(alpha).
        # stumpClassify outputs -1/+1 and the true labels are also -1/+1, so the elementwise
        # product of label and prediction is +1 when correct and -1 when wrong; negating
        # alpha in the exponent therefore yields exactly the factors above
        expon = multiply(-alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()  # renormalize so the weights sum to 1

        # update the cumulative class estimate (the ensemble's prediction after this iteration)
        aggClassEst += alpha * classEst
        print("aggClassEst: ", aggClassEst.T)

        # compute the error rate of the cumulative class estimate:
        # count the misclassified samples, then divide by m
        aggErrors = multiply(
            sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        aggErrorRate = aggErrors.sum() / m
        print("total error: ", aggErrorRate)
        print("===========Epoch %d End=========" % i)
        print()
        # exit the loop if the error rate is 0
        if (aggErrorRate == 0):
            break
    # return the collection of weak learners (each with its weight alpha) as the ensemble model
    return weakClassArr
Example #5

import numpy as np

from boost import buildStump  # assumed module, matching the other examples

def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    '''
    Train a series of AdaBoost weak classifiers on a dataset.

    Input:
        dataArr: data array
        classLabels: data labels
        numIt: iteration number / classifier number
    Output:
        weakClassArr: information (dim, thresh, ineq, alpha) about a
                      series of trained weak classifiers
    '''
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m, 1)) / m)  # sample weights, initially uniform
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels,
                                                D)  # stump info, minimum error rate, predictions
        #print('D:', D.T)
        alpha = float(0.5 * np.log(
            (1.0 - error) / max(error, 1e-16)))  # weight of this classifier, computed from its error rate
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)  # record the current decision stump
        #print('classEst:', classEst.T)
        expon = np.multiply(-1 * alpha * np.mat(classLabels).T,
                            classEst)  # elementwise product of each prediction with its true label
        D = np.multiply(D, np.exp(expon))  # next round's sample weights, derived from this stump's results
        D = D / D.sum()  # normalize the sample weights
        aggClassEst += alpha * classEst  # add the weighted prediction to the running total (an additive model)
        #print('aggClassEst:', aggClassEst.T)
        aggErrors = np.multiply(
            np.sign(aggClassEst) != np.mat(classLabels).T, np.ones(
                (m, 1)))  # whether the combined model classifies each sample correctly
        errorRate = aggErrors.sum() / m  # error rate of the combined model
        print('total error:', errorRate, '\n')
        if errorRate == 0.0:
            break
    return weakClassArr  # return the list of all stumps
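
The horse colic drivers below call boost.loadDataSet / adaboost.loadDataSet, which none of the examples defines. A minimal sketch consistent with MLiA's tab-separated loader:

def loadDataSet(fileName):
    # each row: tab-separated features, last column is the class label (+1/-1)
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = []
    labelMat = []
    for line in open(fileName).readlines():
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat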
Example #6

from numpy import *

from boost import buildStump  # assumed module, matching the other examples


def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst:", classEst.T)
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        print("aggClassEst:", aggClassEst.T)
        aggErrors = multiply(
            sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("Total error:", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr
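
The driver scripts under Example #7 call boost.adaClassify / adaboost.adaClassify, which is not defined in any snippet above. A minimal sketch consistent with the MLiA version (it reuses stumpClassify from the buildStump sketch near the top of this page):

def adaClassify(datToClass, classifierArr):
    # apply every trained stump, weighted by its alpha, and take the sign
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
    return sign(aggClassEst)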
Example #7
from numpy import *
import boost

def loadSimpleData():
    dataMat = matrix([[1., 2.1], [2., 1.1], [1.3, 1.], [1., 1.], [2., 1.]])

    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]

    return dataMat, classLabels

dataMat, classLabels = loadSimpleData()

D = mat(ones((5, 1)) / 5)

print(boost.buildStump(dataMat, classLabels, D))

classifierArray = boost.adaBoostTrainDS(dataMat, classLabels, 30)
print('classifierArray = ', classifierArray)

# boost.adaClassify([0, 0], classifierArray)
# boost.adaClassify([[5, 5], [0, 0]], classifierArray)

# real-world sample: the horse colic problem
dataArray, labelArray = boost.loadDataSet('horseColicTraining2.txt')
classifierArray, aggClassEst = boost.adaBoostTrainDS(dataArray, labelArray, 10)

testArray, testLabelArray = boost.loadDataSet('horseColicTest2.txt')
prediction10 = boost.adaClassify(testArray, classifierArray)

errorArray = mat(ones((len(testLabelArray), 1)))
print('test errors:', errorArray[prediction10 != mat(testLabelArray).T].sum())
from numpy import *

import adaboost
import boost

datMat, classLabels = adaboost.loadSimpData()

D = mat(ones((5, 1)) / 5)
print(boost.buildStump(datMat, classLabels, D))
import adaboost
import boost
from numpy import *
dataMat, dataLabels = adaboost.loadSimpData()
print(dataMat)
print(dataLabels)

D = mat(ones((5, 1)) / 5)
print(boost.buildStump(dataMat, dataLabels, D))
def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plt
    cur = (1.0, 1.0)  # cursor starts at the top-right corner
    ySum = 0.0  # running sum of y-coordinates, used for the AUC
    numPosClas = sum(array(classLabels) == 1.0)
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(classLabels) - numPosClas)
    sortedIndicies = predStrengths.argsort()  # sort by predicted strength
    fig = plt.figure(); fig.clf(); ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:  # true positive: step down in y
            delX = 0
            delY = yStep
        else:  # false positive: step back in x
            delX = xStep
            delY = 0
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for AdaBoost Horse Colic Detection System')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print("the Area Under the Curve is:", ySum * xStep)


'''
import numpy as np
import adaboost
import boost
datMat, classLabels = adaboost.loadSimpData()
D = np.mat(np.ones((5, 1))/5)  # uniform initial weights
boost.buildStump(datMat, classLabels, D)
classifierArr = adaboost.adaBoostTrainDS(datMat, classLabels, 30)
adaboost.adaClassify([0, 0], classifierArr)
adaboost.adaClassify([[5, 5], [0, 0]], classifierArr)

7_4
datArr, labelArr = adaboost.loadDataSet('horseColicTraining2.txt')
classifierArray = adaboost.adaBoostTrainDS(datArr, labelArr, 10)
testArr, testLabelArr = adaboost.loadDataSet('horseColicTest2.txt')
prediction10 = adaboost.adaClassify(testArr, classifierArray)
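# measuring the test error of prediction10 (a sketch, not in the original transcript):
errArr = np.mat(np.ones((len(testLabelArr), 1)))
print(errArr[prediction10 != np.mat(testLabelArr).T].sum() / len(testLabelArr))
'''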