def choseBestFeature(dataSet, op=(1, 1)):
    """Pick the (feature index, split value) that minimises total label variance.

    Every distinct value of every feature column is tried as a split point via
    ``dataSplit``; the candidate whose two halves have the smallest summed
    ``GetAllVar`` wins. The stop conditions act as pre-pruning rules.

    :param dataSet: samples with the label stored in the last column
    :param op: indexable options; ``op[1]`` is the minimum number of samples
        each side of a split must contain. ``op[0]`` is not read here
        (presumably the error-reduction threshold of the disabled stop
        condition 2 -- TODO confirm).
    :return: ``(feature_index, split_value)`` for an internal node, or
        ``(None, leaf_value)`` where ``leaf_value`` is the mean label when the
        node should become a leaf
    """
    labels = get_label(dataSet)

    # Stop condition 1: every sample carries the same label -> leaf node.
    if len(set(labels)) == 1:
        return None, mean(labels)

    min_samples = op[1]
    best_feature = -1
    best_value = 0
    lowest_error = float('inf')
    feature_count = shape(dataSet)[1] - 1  # last column is the label

    for feature in range(feature_count):
        for value in set(choice_column(dataSet, feature)):
            left, right = dataSplit(dataSet, feature, value)
            # Skip splits that leave too few samples on either side.
            if shape(right)[0] < min_samples or shape(left)[0] < min_samples:
                continue
            split_error = GetAllVar(left) + GetAllVar(right)
            if split_error < lowest_error:
                lowest_error = split_error
                best_feature = feature
                best_value = value

    # Stop condition 2 (stop when the split barely reduces the error) is
    # currently disabled.

    # Stop condition 3: no candidate satisfied the minimum-size rule. The
    # previous code fell through and split on column -1 (the label column),
    # which could return a bogus (-1, 0) split; return a leaf instead. Any
    # accepted split already passed the size check inside the loop, so it
    # does not need to be re-split and re-checked here.
    if best_feature == -1:
        return None, mean(labels)
    return best_feature, best_value
def kernal_ridge_train(self, X, y, alpha):
    """Solve ridge regression weights on kernel-mapped features.

    A column built by ``mat.create_by_mn(m, 1, 1.0)`` is appended to ``X``
    (presumably the bias term), the extended matrix is passed through
    ``self.Kernal``, and the weights are computed as
    ``w = (K.T*K + alpha*I).I * K.T * y`` with ``K`` the mapped matrix.

    :param X: input feature matrix
    :param y: input labels
    :param alpha: factor applied to the identity matrix before it is added
        to ``K.T*K``
    :return: ``(w, X)`` where ``X`` is the extended (not kernel-mapped)
        input; ``(0, 0)`` with ``self.isError`` set to True when
        ``mat.inverse`` yields a value equal to 0
    """
    sample_count, _ = mat.shape(X)
    ones_column = mat.create_by_mn(sample_count, 1, 1.0)
    X = mat.extend(X, ones_column)

    # Kernel mapping: lifts the low-dimensional input to a higher dimension.
    mapped = self.Kernal(X)
    mapped_t = mat.transpose(mapped)
    gram = mat.multiply(mapped_t, mapped)

    size, _ = mat.shape(gram)
    penalty = mat.n_mat(alpha, mat.eye(size))
    regularized = mat.add(gram, penalty)
    _inverse = mat.inverse(regularized)

    if _inverse == 0:
        # The matrix could not be inverted: flag the failure for the caller.
        self.isError = True
        print("逆矩阵不可求")
        return 0, 0

    projection = mat.multiply(_inverse, mapped_t)
    weights = mat.multiply(projection, y)
    return weights, X
def stageWise(self, xArr, yArr, eps=0.01, numIt=5000):
    """Fit weights with a greedy search that adjusts one weight at a time.

    The targets are centred with ``mat.lasso_sub(yArr, mat.mean(yArr, 0))``
    and the features are passed through ``mat.regularize``. For each weight
    in turn, one trial step of ``+eps`` decides the search direction, then
    the weight keeps moving by ``eps`` in that direction until the residual
    error (``mat.rssError``) rises or ``numIt`` steps have been taken.

    :param xArr: feature matrix
    :param yArr: target values
    :param eps: step size applied to a weight on every move
    :param numIt: maximum number of steps taken per weight
    :return: the weight matrix created by ``mat.create_one_mn(n, 1)``, with
        its entries updated in place
    """
    xMat = xArr
    yMat = yArr
    yMean = mat.mean(yMat, 0)
    yMat = mat.lasso_sub(yMat, yMean)
    xMat = mat.regularize(xMat)
    _, n = mat.shape(xMat)
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print("%s %s" % (xMat, yMat))

    ws = mat.create_one_mn(n, 1)
    for a_weight in ws:
        # Each row of ws is mutated in place, so mat.multiply(xMat, ws)
        # always sees the current weights.
        forward = 1
        old_rssE = mat.rssError(yMat, mat.multiply(xMat, ws))

        # Trial step upwards; reverse direction if it made the error worse.
        a_weight[0] = a_weight[0] + eps * forward
        rssE = mat.rssError(yMat, mat.multiply(xMat, ws))
        if rssE > old_rssE:
            forward = -1

        for _ in range(numIt):
            a_weight[0] = a_weight[0] + eps * forward
            new_error = mat.rssError(yMat, mat.multiply(xMat, ws))
            if new_error > old_rssE:
                # NOTE(review): the weight keeps the step that raised the
                # error; confirm whether that last step should be undone.
                break
            old_rssE = new_error

    print("ws: \n%s" % (ws,))
    # The code that used to follow this return was unreachable and
    # referenced an undefined name (tempws); it has been removed.
    return ws
def ridge_train(self, X, y, alpha):
    """Solve the ridge regression weights in closed form.

    :param X: input feature matrix
    :param y: input labels
    :param alpha: a complexity parameter that controls the amount of shrinkage
    :return: model weight coefficients ``w = (X.T*X + alpha*I).I * X.T * y``,
        or 0 (with ``self.isError`` set to True) when ``mat.inverse`` yields
        a value equal to 0
    """
    sample_count, _ = mat.shape(X)
    # Append a column of 1.0 so the bias coefficient is learned with the rest.
    ones_column = mat.create_by_mn(sample_count, 1, 1.0)
    X = mat.extend(X, ones_column)

    x_transposed = mat.transpose(X)
    gram = mat.multiply(x_transposed, X)

    size, _ = mat.shape(gram)
    penalty = mat.n_mat(alpha, mat.eye(size))
    regularized = mat.add(gram, penalty)
    _inverse = mat.inverse(regularized)

    if _inverse == 0:
        # The matrix could not be inverted: flag the failure for the caller.
        self.isError = True
        print("逆矩阵不可求")
        return 0

    projection = mat.multiply(_inverse, x_transposed)
    weights = mat.multiply(projection, y)
    return weights
def get_label(_dataSet):
    """Return the label column of ``_dataSet``, i.e. its last column."""
    column_count = shape(_dataSet)[1]
    label_index = column_count - 1
    return choice_column(_dataSet, label_index)
def GetAllVar(dataSet):
    """Return the label variance of ``dataSet`` multiplied by its row count."""
    label_variance = var(get_label(dataSet))
    sample_count = shape(dataSet)[0]
    return label_variance * sample_count