Example 1
def best_k(xArr, yArr):
    """Search for the LWLR kernel width k that minimizes RSS on (xArr, yArr)
    by repeatedly shrinking the interval [k_small, k_big]."""
    k_small = 0.01
    k_big = 2.0
    k = (k_big + k_small) / 2.0
    while True:
        yHat = regression.lwlrTest(xArr, xArr, yArr, k)
        yHat_small = regression.lwlrTest(xArr, xArr, yArr, k_small)
        yHat_big = regression.lwlrTest(xArr, xArr, yArr, k_big)
        re = regression.rssError(yArr, yHat)
        re_small = regression.rssError(yArr, yHat_small)
        re_big = regression.rssError(yArr, yHat_big)
        if re_small > re and re_big > re:
            # the midpoint is best: shrink the interval from both ends
            k_big = k + (k_big - k) / 2.0
            k_small = k_small + (k - k_small) / 2.0
        elif re_small > re and re_big < re:
            # the upper end is best: move the lower bound up
            k_small = k
            k = (k_big + k_small) / 2.0
        elif re_small < re and re_big > re:
            # the lower end is best: move the upper bound down
            k_big = k
            k = (k_big + k_small) / 2.0
        else:
            # ambiguous: shrink the interval from both ends
            k_big = k + (k_big - k) / 2.0
            k_small = k_small + (k - k_small) / 2.0
        if k_big - k_small < 0.01:
            k = k_small
            break
    return k
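# A minimal usage sketch (hypothetical: assumes the Machine Learning in
# Action regression module and the abalone.txt data file are available):
import regression

xArr_demo, yArr_demo = regression.loadDataSet('abalone.txt')
print('best k on the first 99 rows:', best_k(xArr_demo[0:99], yArr_demo[0:99]))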
def crossValidation(xArr, yArr, numVal=10):
    """
    交叉验证测试岭回归
    :param xArr:    数据的特征集
    :param yArr:    类别标签
    :param numVal:  算计中交叉验证的次数。如果没有指定,默认是10.
    :return:
    """
    # get the number of data points
    m = len(yArr)
    indexList = arange(m)
    errorMat = zeros((numVal, 30))  # error matrix: numVal rows x 30 columns (one per ridge lambda)
    # main loop over the cross-validation runs
    for i in range(numVal):
        # containers for the training and test sets
        trainX = []
        trainY = []
        testX = []
        testY = []
        # shuffle indexList with numpy's shuffle so training and test
        # points are selected at random
        random.shuffle(indexList)
        # split the data into training and test sets
        for j in range(m):
            # put 90% of the data points into the training set
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        # run ridge regression on the training set, obtaining a matrix of 30 sets of regression coefficients
        wMat = ridge_regression.ridgeTest(trainX, trainY)
        # loop over the 30 sets of coefficients
        for k in range(30):
            # convert the training and test sets to matrices
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            # standardize the test data using the training statistics
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain
            # evaluate the regression and store the error
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            # yEst = matTestX * mat(wMat[k, :]).T
            errorMat[i, k] = regression.rssError(yEst.T.A, array(testY))
    # compute the mean of the error estimates across runs
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]  # note: wMat holds the weights from the last run only
    # To compare the resulting coefficients with standRegres(), average the error estimates.
    # Note that ridge regression standardizes the data while standRegres() does not,
    # so the coefficients must be un-standardized before the comparison.
    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge 7.Regression is:\n", unReg)
    print("with constant term: ",
          -1 * sum(multiply(meanX, unReg)) + mean(yMat))
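# A minimal usage sketch (hypothetical; assumes the regression and
# ridge_regression modules used by the functions above):
if __name__ == '__main__':
    xArr, yArr = regression.loadDataSet('abalone.txt')
    crossValidation(xArr, yArr, numVal=10)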
Example 3
def test1():
    abX, abY = regression.loadDataSet('abalone.txt')
    yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print(regression.rssError(abY[0:99], yHat01.T))
    print(regression.rssError(abY[0:99], yHat1.T))
    print(regression.rssError(abY[0:99], yHat10.T))
    print('-------------------------------------------')
    yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print(regression.rssError(abY[100:199], yHat01.T))
    print(regression.rssError(abY[100:199], yHat1.T))
    print(regression.rssError(abY[100:199], yHat10.T))
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """
    前向逐步回归算法
    :param xArr: 样本的数据特征
    :param yArr: 类别标签
    :param eps: 每次迭代需要调整的步长
    :param numIt: 迭代次数
    :return:
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    xMat = regularize(xMat)  # assumes regularize() (feature standardization) is in scope, e.g. from the regression module
    m, n = shape(xMat)
    # create a numIt x n matrix of zeros to record the weights at every iteration
    returnMat = zeros((numIt, n))
    # create an n x 1 vector to hold the current weights w
    ws = zeros((n, 1))
    wsMax = ws.copy()
    # start iterating
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        # loop over every feature
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                # perturb one coefficient to get a new w
                wsTest[j] += eps * sign
                # compute the error under the new w
                yTest = xMat * wsTest
                rssE = regression.rssError(yMat.A, yTest.A)
                # if this error is below the current lowest error, keep wsTest as the best candidate; otherwise leave the weights unchanged
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
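# A minimal usage sketch (hypothetical; assumes abalone.txt and the same
# regression module as above):
xArr_sw, yArr_sw = regression.loadDataSet('abalone.txt')
weightsHistory = stageWise(xArr_sw, yArr_sw, eps=0.01, numIt=200)
print(weightsHistory[-1])  # weights after the final iteration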
def abaloneTest():
    """ 预测鲍鱼的年龄

    描述:机器学习实战示例8.3 预测鲍鱼的年龄
    INPUT:
        无
    OUPUT: 
        无 
    """
    # load the data
    abX, abY = regression.loadDataSet("./data/abalone.txt")
    # predict with different kernel widths
    oldyHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    oldyHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    oldyHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    # print the error between each kernel's predictions and the true values on the training set
    print("old yHat01 error Size is :",
          regression.rssError(abY[0:99], oldyHat01.T))
    print("old yHat1 error Size is :",
          regression.rssError(abY[0:99], oldyHat1.T))
    print("old yHat10 error Size is :",
          regression.rssError(abY[0:99], oldyHat10.T))
    # print the error between each kernel's predictions and the true values on the new (test) data
    newyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    print("new yHat01 error Size is :",
          regression.rssError(abY[100:199], newyHat01.T))
    newyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    print("new yHat1 error Size is :",
          regression.rssError(abY[100:199], newyHat1.T))
    newyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print("new yHat10 error Size is :",
          regression.rssError(abY[100:199], newyHat10.T))
    # predict with simple linear regression and compare with the results above
    standWs = regression.standRegres(abX[0:99], abY[0:99])
    standyHat = mat(abX[100:199]) * standWs
    print("standRegress error Size is:",
          regression.rssError(abY[100:199], standyHat.T.A))
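# A hypothetical entry point to run the two tests above:
if __name__ == '__main__':
    test1()
    abaloneTest()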
Example 6
import regression

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print('Error on training data when k=0.1:',
      regression.rssError(abY[0:99], yHat01.T))
print('Error on training data when k=1.0:',
      regression.rssError(abY[0:99], yHat1.T))
print('Error on training data when k=10:',
      regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)

print('Error on test data when k=0.1:',
      regression.rssError(abY[100:199], yHat01.T))

yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
print('Error on test data when k=1.0:',
      regression.rssError(abY[100:199], yHat1.T))

yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print('Error on test data when k=10:',
      regression.rssError(abY[100:199], yHat10.T))
Example 7
"""
@file:abalone.py
@author:姚水林
@time:2018-12-16 16:02:01
@function:
"""
import regression
import matplotlib.pyplot as plt

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
rssError01 = regression.rssError(abY[0:99], yHat01.T)
rssError1 = regression.rssError(abY[0:99], yHat1.T)
rssError10 = regression.rssError(abY[0:99], yHat10.T)
print("rssError01 =", rssError01, "rssError1 =", rssError1, "rssError10 =",
      rssError10)

ridgeWeights = regression.ridgeTest(abX, abY)
print("ridgeWeights=", ridgeWeights)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
# print "srtInd:",srtInd
# print "xSort:",xSort
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:, 1], yHat[srtInd])
ax.scatter(xMat[:, 1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
plt.show()

# 8.3 Example: predicting the age of an abalone
abX, abY = regression.loadDataSet(homedir + 'abalone.txt')  # assumes homedir is set elsewhere
print("abX:", abX)
print("abY:", abY)
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print("regression.rssError(abY[0:99], yHat01.T):",
      regression.rssError(abY[0:99], yHat01.T))
print("regression.rssError(abY[0:99], yHat1.T):",
      regression.rssError(abY[0:99], yHat1.T))
print("regression.rssError(abY[0:99], yHat10.T):",
      regression.rssError(abY[0:99], yHat10.T))
yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
print("regression.rssError(abY[100:199], yHat01.T):",
      regression.rssError(abY[100:199], yHat01.T))
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
print("regression.rssError(abY[100:199], yHat1.T):",
      regression.rssError(abY[100:199], yHat1.T))
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print("regression.rssError(abY[100:199], yHat10.T):",
      regression.rssError(abY[100:199], yHat10.T))
ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
Example 9
        # Fragment from inside a loop over yIdx; assumes xMat, xTMat, yMat,
        # yTMat, bestKList, invalidKNum and invalidKMin are defined in the
        # enclosing scope.
        invalidKMax = 0.009

        yMatTmp = yMat[:, yIdx]
        yTMatTmp = yTMat[:, yIdx]

        for k in arange(5, 0.09, -0.1):  # search for the best k
            yAssume = regression.lwlrTest(xTMat, xMat, yMatTmp.T, k)
            print(k)
            if not yAssume.any():  # lwlr failed: all predictions are zero
                # print("%s  %d: regression.lwlr failed by k = %f." % (myDebug.file(), myDebug.line(), k))
                invalidKNum += 1
                if k > invalidKMax:
                    invalidKMax = k
                if k < invalidKMin:
                    invalidKMin = k
                continue
            # convert the matrix to a flat list
            yTList = yTMatTmp.reshape(-1).tolist()
            yTList = [j for i in yTList for j in i]
            rssE = regression.rssError(yTList, yAssume)
            if len(bestKList) == 0:
                bestKList.insert(0, [rssE, k])
            else:
                for idx in range(0, len(bestKList)):
                    if rssE < bestKList[idx][0]:
                        bestKList.insert(idx, [rssE, k])
                        if len(bestKList) > 50:  # keep only the 50 best k values
                            bestKList.pop()
                        break
        print(bestKList)
# coding=utf-8
from numpy import *
import regression
"""
案例一:我们将回归用于真实数据
"""
if __name__ == '__main__':
    """#####################################################################################################################"""
    xArr, yArr = regression.loadDataSet(
        r'C:\Users\v_wangdehong\PycharmProjects\MachineLearning_V\Regression\data\abalone.txt'
    )
    # test the algorithm on the first 99 rows
    yHat01 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 0.1)
    yHat1 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 1)
    yHat10 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 10)
    print(regression.rssError(yArr[0:99], yHat01))  #56.7842091184
    print(regression.rssError(yArr[0:99], yHat1))  #429.89056187
    print(regression.rssError(yArr[0:99], yHat10))  #549.118170883
    """
    从上面可以看到,使用较小的核将得到较低的误差,那么为什么不在所有数据集上都使用最小的核呢?
    因为使用最小的核将造成过拟合,对新数据不一定能达到最好的效果,下面就看看它在新数据上的表现
    """
    yHat01 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 0.1)
    yHat1 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 1)
    yHat10 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 10)
    print(regression.rssError(yArr[100:199], yHat01))  # 25119.4591112
    print(regression.rssError(yArr[100:199], yHat1))  # 573.52614419
    print(regression.rssError(yArr[100:199], yHat10))  # 517.571190538
    """
    从上面结果可以看到,核大小等于10时测试误差最小,但是它在训练集上的误差却是最大的。
    接下来再和简单的线性回归做个比较。
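    # A minimal sketch of that comparison (hypothetical; standRegres is used
    # the same way in the other examples in this collection):
    ws = regression.standRegres(xArr[0:99], yArr[0:99])
    yHat = mat(xArr[100:199]) * ws
    print(regression.rssError(yArr[100:199], yHat.T.A))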
Example 11
# -*- coding=utf-8 -*-
import regression
from numpy import *

abX, abY = regression.loadDataSet("abalone.txt")
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1.0)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)

error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)

# Conclusion: a smaller kernel gives a lower training error,
# but a smaller kernel overfits and will not necessarily predict new data best.
print("error01 is %s" % error01)   # error01 is 56.7862596807
print("error1 is %s" % error1)     # error1 is 429.89056187
print("error10 is %s" % error10)   # error10 is 549.118170883

yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print ("eerror01 is %s"  % eerror01)     #eerror01 is 33652.8973161
print ("eerror1 is %s"  % eerror1)         #eerror1 is 573.52614419
print ("eerror10 is %s"  % eerror10)     #eerror10 is 517.571190538       #对新数据,k=10得到较好的效果


# Compare with linear regression:
# coding:utf-8
import regression
from numpy import *

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))

ridgeWeights = regression.ridgeTest(abX, abY)
# print(ridgeWeights)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
Example 13
# -*- coding=utf-8 -*-
import regression
from numpy import *

abX, abY = regression.loadDataSet("abalone.txt")
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1.0)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)

error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)

# Conclusion: a smaller kernel gives a lower training error,
# but a smaller kernel overfits and will not necessarily predict new data best.
print("error01 is %s" % error01)   # error01 is 56.7862596807
print("error1 is %s" % error1)     # error1 is 429.89056187
print("error10 is %s" % error10)   # error10 is 549.118170883

yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print("eerror01 is %s" % eerror01)  #eerror01 is 33652.8973161
print("eerror1 is %s" % eerror1)  #eerror1 is 573.52614419
print("eerror10 is %s" %
      eerror10)  #eerror10 is 517.571190538       #对新数据,k=10得到较好的效果

# Compare with simple linear regression, as sketched below:
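# A minimal sketch of that comparison (hypothetical; mirrors the standRegres
# usage in the other examples in this collection):
ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print("standRegres error is %s" % regression.rssError(abY[100:199], yHat.T.A))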
Example 14
# Fragment: assumes xMat, yMat, ws, xArr, yArr, regression and
# matplotlib.pyplot (as plt) are already set up.
yHat = xMat * ws
'''
#Retry by lwlr to get best k

corrcoefMin = 100
bestK = 1
keysets = [0.1, 1, 10, 0.02, 0.3]

for step in keysets:
    print(step)
    yHat = regression.lwlrTest(xArr[4000:], xArr[0:4000], yArr[0:4000], step)
    if sum(yHat) != 0:
        # det of a 2x2 correlation matrix is 1 - r**2, so a smaller det
        # means a stronger correlation between predictions and targets
        if corrcoefMin >= linalg.det(corrcoef(yHat.T, yArr[4000:])):
            corrcoefMin = linalg.det(corrcoef(yHat.T, yArr[4000:]))
            bestK = step
        print(regression.rssError(yArr[4000:], yHat.T))
print("=======================")
print(bestK)
print(corrcoefMin)

'''
fig = plt.figure()
ax = fig.add_subplot(111)

ax.scatter(xMat[:, 1], yMat.T[:, 0])

xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()
Example 15
# Fragment: assumes xArr, yArr loaded via regression.loadDataSet and
# `from numpy import *` for mat() used below.
# print(yArr)
# ws = regression.standRegres(xArr, yArr)
# print(ws)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.3)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.001)
# print(regression.rssError(yArr[:], yHat.T))
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.003)
# print(regression.rssError(yArr[:], yHat.T))
yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
# print(regression.rssError(yArr[:], yHat.T))
# print(yHat)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.1)
# print(regression.rssError(yArr[:], yHat.T))
# yHat = regression.lwlrTest(xArr, xArr, yArr, 1)
print(regression.rssError(yArr[:], yHat.T))
#exit(0)

# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.003)


xMat = mat(xArr)
srtInd = xMat[:,1].argsort(0)
xSort = xMat[srtInd][:,0,:]
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:, 1], yHat[srtInd])
plt.show()
Example 16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'8.3 machine learning in action'

__author__ = 'lxp'

import regression
import numpy as np

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = np.mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))
Example 17
# Fragment from an interactive session: assumes xMat, yHat, yArr, mat and
# matplotlib.pyplot (as plt) from an earlier lwlr run, plus the Python 2
# builtin reload (Python 3: from importlib import reload).
srtInd = xMat[:, 1].argsort(0)
xSort = xMat[srtInd][:, 0, :]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:, 1], yHat[srtInd])
ax.scatter(xMat[:, 1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
plt.show()

# on real data
reload(regression)
abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
# see which k works best; this is a rough form of cross-validation
regression.rssError(abY[0:99], yHat01.T)  # lowest training error
regression.rssError(abY[0:99], yHat1.T)
regression.rssError(abY[0:99], yHat10.T)

# check whether the k that looked best also performs well on the test set
yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
regression.rssError(abY[100:199], yHat01.T)  # clearly overfitting
regression.rssError(abY[100:199], yHat1.T)
regression.rssError(abY[100:199], yHat10.T)  # k=10 actually performs best on new data

# ridge regression test
reload(regression)
abX, abY = regression.loadDataSet('abalone.txt')
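# A minimal sketch of that test (hypothetical; ridgeTest is used the same
# way in Examples 7 and 11 above):
ridgeWeights = regression.ridgeTest(abX, abY)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)  # one trace per coefficient across the 30 lambdas
plt.show()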
Example 18
import regression
from numpy import *

def rssError(yArr, yHatArr):
    # sum of squared errors; same as regression.rssError used below
    return ((yArr - yHatArr) ** 2).sum()

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)

print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))