def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute the variance of `a` along `axis`.

    Internal helper backing ``ndarray.var``: computes the mean with
    ``umr_sum`` / item count, then averages the squared deviations,
    dividing by ``rcount - ddof`` (Bessel's correction when ddof=1).

    NOTE(review): assumes `_count_reduce_items`, `umr_sum`, `mu`
    (multiarray), `um` (umath) and `nt` (numerictypes) are the usual
    module-level aliases from this file's import section — confirm.
    """
    arr = asanyarray(a)

    rcount = _count_reduce_items(arr, axis)
    # Make this warning show up on top.
    if ddof >= rcount:
        warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning,
                      stacklevel=2)

    # Cast bool, unsigned int, and int to float64 by default
    if dtype is None and issubclass(arr.dtype.type, (nt.integer, nt.bool_)):
        dtype = mu.dtype('f8')

    # Compute the mean.
    # Note that if dtype is not of inexact type then arraymean will
    # not be either.
    arrmean = umr_sum(arr, axis, dtype, keepdims=True)
    if isinstance(arrmean, mu.ndarray):
        # In-place divide; 'unsafe' casting because rcount is integral.
        arrmean = um.true_divide(
            arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
    else:
        # 0-d result: keep the accumulated scalar's dtype.
        arrmean = arrmean.dtype.type(arrmean / rcount)

    # Compute sum of squared deviations from mean
    # Note that x may not be inexact and that we need it to be an array,
    # not a scalar.
    x = asanyarray(arr - arrmean)
    if issubclass(arr.dtype.type, (nt.floating, nt.integer)):
        x = um.multiply(x, x, out=x)
    else:
        # Complex (or object) input: |x|^2 via x * conj(x); keep real part.
        x = um.multiply(x, um.conjugate(x), out=x).real

    ret = umr_sum(x, axis, dtype, out, keepdims)

    # Compute degrees of freedom and make sure it is not negative.
    rcount = max([rcount - ddof, 0])

    # divide by degrees of freedom
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(
            ret, rcount, out=ret, casting='unsafe', subok=False)
    elif hasattr(ret, 'dtype'):
        ret = ret.dtype.type(ret / rcount)
    else:
        ret = ret / rcount

    return ret
def _nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute the variance of `a` along `axis`, ignoring NaNs.

    NaN positions are masked out of both the sum and the item count, so
    the mean and the squared-deviation sum only reflect valid elements.
    The effective divisor is ``(number of non-NaN items) - ddof``.
    """
    # Using array() instead of asanyarray() because the former always
    # makes a copy, which is important due to the copyto() action later
    arr = array(a, subok=True)
    mask = isnan(arr)

    # First compute the mean, saving 'rcount' for reuse later
    if dtype is None and (issubdtype(arr.dtype, nt.integer) or
                          issubdtype(arr.dtype, nt.bool_)):
        # Integer/bool input cannot hold NaN, so no masking is needed;
        # accumulate in float64 by default.
        arrmean = um.add.reduce(arr, axis=axis, dtype='f8', keepdims=True)
    else:
        # Zero out the NaN slots (safe: arr is a private copy) so the
        # plain sum equals the sum over valid elements.
        mu.copyto(arr, 0.0, where=mask)
        arrmean = um.add.reduce(arr, axis=axis, dtype=dtype, keepdims=True)
    # Count of valid (non-NaN) items per reduced slot.
    rcount = (~mask).sum(axis=axis, keepdims=True)

    if isinstance(arrmean, mu.ndarray):
        arrmean = um.true_divide(arrmean, rcount,
                                 out=arrmean, casting='unsafe', subok=False)
    else:
        arrmean = arrmean / float(rcount)

    # arr - arrmean
    x = arr - arrmean
    # Re-zero the masked slots so they contribute nothing to the sum below.
    mu.copyto(x, 0.0, where=mask)

    # (arr - arrmean) ** 2
    if issubdtype(arr.dtype, nt.complex_):
        # Complex: |x|^2 = x * conj(x); keep the real part.
        x = um.multiply(x, um.conjugate(x), out=x).real
    else:
        x = um.multiply(x, x, out=x)

    # add.reduce((arr - arrmean) ** 2, axis)
    ret = um.add.reduce(x, axis=axis, dtype=dtype, out=out, keepdims=keepdims)

    # add.reduce((arr - arrmean) ** 2, axis) / (n - ddof)
    if not keepdims and isinstance(rcount, mu.ndarray):
        # Match ret's shape when keepdims was not requested.
        rcount = rcount.squeeze(axis=axis)
    rcount -= ddof
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(ret, rcount,
                             out=ret, casting='unsafe', subok=False)
    else:
        ret = ret / float(rcount)

    return ret
def _var(a, axis=None, dtype=None, out=None, ddof=0, skipna=False, keepdims=False): arr = asanyarray(a) # First compute the mean, saving 'rcount' for reuse later if dtype is None and arr.dtype.kind in ['b','u','i']: arrmean = um.add.reduce(arr, axis=axis, dtype='f8', skipna=skipna, keepdims=True) else: arrmean = um.add.reduce(arr, axis=axis, dtype=dtype, skipna=skipna, keepdims=True) rcount = mu.count_reduce_items(arr, axis=axis, skipna=skipna, keepdims=True) if isinstance(arrmean, mu.ndarray): arrmean = um.true_divide(arrmean, rcount, out=arrmean, casting='unsafe', subok=False) else: arrmean = arrmean / float(rcount) # arr - arrmean x = arr - arrmean # (arr - arrmean) ** 2 if arr.dtype.kind == 'c': x = um.multiply(x, um.conjugate(x), out=x).real else: x = um.multiply(x, x, out=x) # add.reduce((arr - arrmean) ** 2, axis) ret = um.add.reduce(x, axis=axis, dtype=dtype, out=out, skipna=skipna, keepdims=keepdims) # add.reduce((arr - arrmean) ** 2, axis) / (n - ddof) if not keepdims and isinstance(rcount, mu.ndarray): rcount = rcount.squeeze(axis=axis) rcount -= ddof if isinstance(ret, mu.ndarray): ret = um.true_divide(ret, rcount, out=ret, casting='unsafe', subok=False) else: ret = ret / float(rcount) return ret
def calcEk(oS, k):
    """Return the prediction error E_k for training sample k.

    E_k = f(x_k) - y_k, where f(x_k) is the current SVM decision value
    computed from column k of the precomputed kernel matrix oS.K.

    :param oS: optimization state (alphas, labelMat, K, b)
    :param k: sample index
    :return: float error for sample k
    """
    decision = float(multiply(oS.alphas, oS.labelMat).T * oS.K[:, k] + oS.b)
    return decision - float(oS.labelMat[k])
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of single-level decision stumps.

    Fix: converted Python 2 ``print`` statements to the ``print()``
    function (the file already uses Python-3-only syntax elsewhere).

    :param dataArr: training data set
    :param classLabels: class labels (+1 / -1)
    :param numIt: maximum number of boosting iterations
    :return: (weakClassArr, aggClassEst) — the weak-classifier list and the
        accumulated class estimate for every training sample
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # D is the per-sample weight distribution; its elements sum to 1.
    D = mat(ones((m, 1)) / m)
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        # Build the best stump for the current sample weights.
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        # alpha is this classifier's vote weight:
        # alpha = (1/2) * ln[(1 - e) / e]; max() guards against e == 0.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)  # store the best stump
        print("classEst: ", classEst.T)
        # Update the weight vector D:
        #   correctly classified:  D[t+1] = D[t] * exp(-alpha) / sum(D)
        #   misclassified:         D[t+1] = D[t] * exp(+alpha) / sum(D)
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))  # Calc New D for next iteration
        D = D / D.sum()
        # Accumulate the running class estimate.
        aggClassEst += alpha * classEst
        print("aggClassEst: ", aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,
                             ones((m, 1)))
        errorRate = aggErrors.sum() / m  # training error of the ensemble
        print("total error: ", errorRate)
        if errorRate == 0.0:
            break  # perfect fit — stop early
    return weakClassArr, aggClassEst
def testDigits(kTup=('rbf', 10)):
    """Test the SVM-based handwritten-digit recognition system.

    Fix: converted Python 2 ``print`` statements to the ``print()``
    function (the file already uses Python-3-only syntax elsewhere).

    :param kTup: kernel description tuple, e.g. ('rbf', sigma)
    :return: None (prints training and test error rates)
    """
    # Train on the trainingDigits image directory.
    dataArr, labelArr = loadImages(os.path.dirname(os.getcwd()) + '\\datas\\digits\\trainingDigits')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    # Keep only the support vectors (alpha > 0).
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    print("there are %d Support Vectors" % shape(sVs)[0])
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        # Decision value from kernel evaluations against the SVs only.
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print("the training error rate is: %f" % (float(errorCount) / m))
    # Evaluate generalization on the held-out testDigits directory.
    dataArr, labelArr = loadImages(os.path.dirname(os.getcwd()) + '\\datas\\digits\\testDigits')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], kTup)
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print("the test error rate is: %f" % (float(errorCount) / m))
def testRbf(k1=1.3):
    """Radial-basis-kernel classification test (circular data set).

    Fix: converted Python 2 ``print`` statements to the ``print()``
    function (the file already uses Python-3-only syntax elsewhere).

    :param k1: user-defined sigma of the Gaussian RBF kernel
    :return: None (prints training and test error rates)
    """
    dataArr, labelArr = loadDataSet(os.path.dirname(os.getcwd()) + '\\datas\\testSetRBF.txt')
    b, alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1))
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    # Keep only the support vectors (alpha > 0).
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]
    labelSV = labelMat[svInd]
    print("there are %d Support Vectors" % shape(sVs)[0])
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        # Decision value from kernel evaluations against the SVs only.
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print("the training error rate is: %f" % (float(errorCount) / m))
    # Evaluate generalization on a second, held-out RBF data set.
    dataArr, labelArr = loadDataSet(os.path.dirname(os.getcwd()) + '\\datas\\testSetRBF2.txt')
    errorCount = 0
    datMat = mat(dataArr)
    labelMat = mat(labelArr).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(labelArr[i]):
            errorCount += 1
    print("the test error rate is: %f" % (float(errorCount) / m))
def test_wrap_with_iterable(self): # test fix for bug #1026: class with_wrap(np.ndarray): __array_priority__ = 10 def __new__(cls): return np.asarray(1).view(cls).copy() def __array_wrap__(self, arr, context): return arr.view(type(self)) a = with_wrap() x = ncu.multiply(a, (1, 2, 3)) self.assertTrue(isinstance(x, with_wrap)) assert_array_equal(x, np.array((1, 2, 3)))
def crossValidation(xArr, yArr, numVal=10):
    """Cross-validation test of ridge regression.

    Fixes: ``indexList`` is now a real list (``random.shuffle`` cannot
    reorder a ``range`` object on Python 3) and Python 2 ``print``
    statements were converted to the ``print()`` function.

    :param xArr: feature data
    :param yArr: target values
    :param numVal: number of random 90/10 train/test splits
    :return: None (prints the best un-normalized model)
    """
    m = len(yArr)
    # Must be a list so random.shuffle can reorder it in place.
    indexList = list(range(m))
    errorMat = zeros((numVal, 30))
    for i in range(numVal):
        trainX = []
        trainY = []
        testX = []
        testY = []
        random.shuffle(indexList)
        # Random 90% training / 10% test split.
        for j in range(m):
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)  # 30 ridge weight vectors
        for k in range(30):
            # Normalize the test data with the *training* statistics.
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            errorMat[i, k] = rssError(yEst.T.A, array(testY))
            print(errorMat[i, k])
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    # NOTE(review): wMat holds the weights of the LAST fold only, while the
    # best lambda index is chosen by mean error across folds — confirm this
    # is the intended (book) behavior.
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # Un-normalize the weights so they apply to raw (unscaled) data.
    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n", unReg)
    print("with constant term: ", -1 * sum(multiply(meanX, unReg)) + mean(yMat))
def calcWs(alphas, dataArr, classLabels):
    """Recover the weight vector w of a (linear) SVM from its alphas.

    Computes w = sum_i alpha_i * y_i * x_i over all training samples;
    only support vectors (alpha_i > 0) actually contribute.

    :param alphas: optimized Lagrange multipliers
    :param dataArr: training data set
    :param classLabels: class labels (+1 / -1)
    :return: (n, 1) weight vector
    """
    samples = mat(dataArr)
    labels = mat(classLabels).transpose()
    numSamples, numFeatures = shape(samples)
    w = zeros((numFeatures, 1))
    for idx in range(numSamples):
        # Accumulate alpha_i * y_i * x_i (as a column vector).
        w += multiply(alphas[idx] * labels[idx], samples[idx, :].T)
    return w
def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *,
         where=True):
    """Compute the variance of `a` along `axis` with `where`-mask support.

    Internal helper backing ``ndarray.var``: the item count, the mean and
    the sum of squared deviations are all restricted to the elements
    selected by `where`; the divisor is ``max(rcount - ddof, 0)``.
    """
    arr = asanyarray(a)

    rcount = _count_reduce_items(arr, axis, keepdims=keepdims, where=where)
    # Make this warning show up on top.
    if ddof >= rcount if where is True else umr_any(ddof >= rcount):
        warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning,
                      stacklevel=2)

    # Cast bool, unsigned int, and int to float64 by default
    if dtype is None and issubclass(arr.dtype.type, (nt.integer, nt.bool_)):
        dtype = mu.dtype('f8')

    # Compute the mean.
    # Note that if dtype is not of inexact type then arraymean will
    # not be either.
    arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
    # The shape of rcount has to match arrmean to not change the shape of out
    # in broadcasting. Otherwise, it cannot be stored back to arrmean.
    if rcount.ndim == 0:
        # fast-path for default case when where is True
        div = rcount
    else:
        # matching rcount to arrmean when where is specified as array
        div = rcount.reshape(arrmean.shape)
    if isinstance(arrmean, mu.ndarray):
        arrmean = um.true_divide(arrmean, div, out=arrmean,
                                 casting='unsafe', subok=False)
    else:
        arrmean = arrmean.dtype.type(arrmean / rcount)

    # Compute sum of squared deviations from mean
    # Note that x may not be inexact and that we need it to be an array,
    # not a scalar.
    x = asanyarray(arr - arrmean)

    if issubclass(arr.dtype.type, (nt.floating, nt.integer)):
        x = um.multiply(x, x, out=x)
    # Fast-paths for built-in complex types
    elif x.dtype in _complex_to_float:
        # View the complex buffer as (real, imag) float pairs, square both
        # components in place, then sum them into the real part: |x|^2.
        xv = x.view(dtype=(_complex_to_float[x.dtype], (2,)))
        um.multiply(xv, xv, out=xv)
        x = um.add(xv[..., 0], xv[..., 1], out=x.real).real
    # Most general case; includes handling object arrays containing imaginary
    # numbers and complex types with non-native byteorder
    else:
        x = um.multiply(x, um.conjugate(x), out=x).real

    ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)

    # Compute degrees of freedom and make sure it is not negative.
    rcount = um.maximum(rcount - ddof, 0)

    # divide by degrees of freedom
    if isinstance(ret, mu.ndarray):
        ret = um.true_divide(ret, rcount, out=ret,
                             casting='unsafe', subok=False)
    elif hasattr(ret, 'dtype'):
        ret = ret.dtype.type(ret / rcount)
    else:
        ret = ret / rcount

    return ret
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """Simplified SMO (Sequential Minimal Optimization) algorithm.

    Fixes: repaired the format string that was broken across two source
    lines ("iter: %d i:%d, pairs changed %d"), converted Python 2 ``print``
    statements to ``print()``, and renamed the local ``iter`` counter so it
    no longer shadows the builtin.

    :param dataMatIn: data set
    :param classLabels: class labels (+1 / -1)
    :param C: slack constant C
    :param toler: tolerance for KKT violation
    :param maxIter: max number of full passes without any alpha change
    :return: (b, alphas)
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    # Initialization.
    b = 0
    m, n = shape(dataMatrix)
    alphas = mat(zeros((m, 1)))
    passCount = 0
    while passCount < maxIter:
        # Count alpha pairs updated during this full pass.
        alphaPairsChanged = 0
        for i in range(m):
            fXi = float(multiply(alphas, labelMat).T *
                        (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])
            # Only optimize alphas that violate the KKT conditions.
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
                    ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                j = selectJrand(i, m)  # pick the second alpha at random
                fXj = float(multiply(alphas, labelMat).T *
                            (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy()
                alphaJold = alphas[j].copy()
                # Compute the clipping box [L, H] keeping alpha in [0, C].
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H:
                    print("L==H")
                    continue
                # eta is the (negated) second derivative along the constraint.
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - \
                    dataMatrix[i, :] * dataMatrix[i, :].T - \
                    dataMatrix[j, :] * dataMatrix[j, :].T
                if eta >= 0:
                    print("eta>=0")
                    continue
                alphas[j] -= labelMat[j] * (Ei - Ej) / eta
                alphas[j] = clipAlpha(alphas[j], H, L)
                if abs(alphas[j] - alphaJold) < 0.00001:
                    print("j not moving enough")
                    continue
                # Update alpha_i in the opposite direction by the same amount.
                alphas[i] += labelMat[j] * labelMat[i] * (alphaJold - alphas[j])
                b1 = b - Ei - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[i, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * \
                    dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alphaIold) * \
                    dataMatrix[i, :] * dataMatrix[j, :].T - \
                    labelMat[j] * (alphas[j] - alphaJold) * \
                    dataMatrix[j, :] * dataMatrix[j, :].T
                # Pick b from whichever alpha is strictly inside (0, C).
                if (0 < alphas[i]) and (C > alphas[i]):
                    b = b1
                elif (0 < alphas[j]) and (C > alphas[j]):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alphaPairsChanged += 1
                print("iter: %d i:%d, pairs changed %d" %
                      (passCount, i, alphaPairsChanged))
        # Only count a pass toward maxIter if nothing changed.
        if alphaPairsChanged == 0:
            passCount += 1
        else:
            passCount = 0
        print("iteration number: %d" % passCount)
    return b, alphas
def average(a, axis=None, weights=None, returned=False):
    """Average the array over the given axis.

    If `axis` is None, average over all dimensions of the array
    (equivalent to ``a.mean(axis)`` / ``a.sum(axis) / size(a, axis)``).
    With `weights`, the result is ``sum(a * weights, axis) /
    sum(weights, axis)``; weights must have a's shape or be 1-D with the
    length of a along `axis`. Integer weights are converted to float.
    If `returned` is True, a tuple ``(result, sum_of_weights)`` is
    returned, both with the same shape.

    Fixes relative to the original:
    - Python 3 ``raise`` syntax.
    - Replaced the ``eval``-built fancy index with direct tuple indexing.
    - The 1-D-weights broadcast index now has one entry per array
      dimension (``len(ash)``), not ``ash[axis]`` entries — the old code
      produced wrong shapes whenever those differed.

    Raises ZeroDivisionError when the (scalar) weight sum is zero.
    """
    if axis is None:
        # Flatten and average everything.
        a = array(a).ravel()
        if weights is None:
            n = add.reduce(a)
            d = len(a) * 1.0
        else:
            w = array(weights).ravel() * 1.0
            n = add.reduce(multiply(a, w))
            d = add.reduce(w)
    else:
        a = array(a)
        ash = a.shape
        if ash == ():
            a.shape = (1,)
        if weights is None:
            n = add.reduce(a, axis)
            d = ash[axis] * 1.0
            if returned:
                # Give the count the same shape as the numerator.
                d = ones(n.shape) * d
        else:
            w = array(weights, copy=False) * 1.0
            wsh = w.shape
            if wsh == ():
                wsh = (1,)
            if wsh == ash:
                n = add.reduce(a * w, axis)
                d = add.reduce(w, axis)
            elif wsh == (ash[axis],):
                # Broadcast the 1-D weights along `axis`: one index entry
                # per dimension, a full slice at `axis`, newaxis elsewhere.
                r = [newaxis] * len(ash)
                r[axis] = slice(None, None, 1)
                w1 = w[tuple(r)] * ones(ash, float)
                n = add.reduce(a * w1, axis)
                d = add.reduce(w1, axis)
            else:
                raise ValueError('averaging weights have wrong shape')

    if not isinstance(d, ndarray):
        if d == 0.0:
            raise ZeroDivisionError('zero denominator in average()')
    if returned:
        return n / d, d
    else:
        return n / d