def train(self, num_trees=200, num_features=2, th=0.0):
    """Fit the forest one tree at a time using the CART algorithm.

    :param num_trees: number of trees to grow
    :param num_features: number of candidate features drawn at each split
    :param th: threshold forwarded to dts.train_cart
    :return: None; fitted trees are appended to self.trees
    """
    for _ in range(num_trees):
        # Bootstrap: draw self.m row indices with replacement.
        chosen = np.random.choice(range(self.m), size=self.m, replace=True)
        boot_rows = [self.rows[idx] for idx in chosen]
        # Per the contract of ml_decision_trees.train, when m equals
        # num_features and sample=True this degenerates to plain bagging.
        fitted = dts.train_cart(
            boot_rows, th=th, target=self.target, m=num_features, sample=True
        )
        self.trees.append(fitted)
    print("training complete on {0} trees".format(num_trees))
def train(self, d=1, k=20):
    """Boosting-tree algorithm for regression.

    Repeatedly fits a shallow CART tree to the current residuals,
    stacks it onto self.trees, and subtracts its predictions from
    the residuals.

    :param d: depth of each individual tree
    :param k: number of trees (boosting rounds)
    :return: None; weak learners are appended to self.trees
    """
    def replace_targets(rows, targets):
        """Return a new dataset: rows with the y column swapped for targets.

        :param rows: dataset, list of row lists (last element is y)
        :param targets: new y values, list-like, one per row
        :return: fresh list of rows; the input is left untouched
        """
        last = len(rows[0]) - 1  # index of the target (last) column
        return [rows[j][:last] + [targets[j]] for j in range(len(rows))]

    for step in range(k):
        # Overwrite the training targets with the current residuals.
        self.rows_updated = replace_targets(self.rows_updated, self.resid)
        # Fit the next weak learner on the residual-targeted data and stack it.
        learner = dts.train_cart(self.rows_updated, target="regression", d=d)
        self.trees.append(learner)
        # Shrink the residuals by this learner's fitted values.
        fitted_vals = dts.predict(learner, self.rows_updated)
        self.resid -= np.array(fitted_vals)
        # Report the mean squared error after this boosting round.
        mse = np.dot(self.resid, self.resid) / len(self.resid)
        print("training step {0} finished, current mse: {1}".format(str(step), str(mse)))