def fit(self, dataset, train_data):
    if self.loss_type == "multi-classification":
        label_valueset = dataset.get_label_valueset()
        self.loss = MultinomialDeviance(dataset.get_label_size(), label_valueset)
        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)
        for iter in range(1, self.max_iter + 1):
            subset = train_data
            if 0 < self.sample_rate < 1:
                subset = sample(subset, int(len(subset) * self.sample_rate))
            self.trees[iter] = dict()
            # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
            residual = self.loss.compute_residual(dataset, subset, f)
            for label in label_valueset:
                # samples hanging under each leaf node; only used once the recursion reaches max_depth
                # holds the leaf nodes; note that each leaf node stores the set of samples satisfying its conditions
                leaf_nodes = []
                targets = {}
                for id in subset:
                    targets[id] = residual[id][label]
                # for one specific label of the K classes, select max_depth features to build a decision tree
                tree = construct_decision_tree(
                    dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points
                )
                self.trees[iter][label] = tree
                self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate, label)
            train_loss = self.compute_loss(dataset, train_data, f)
            print("iter%d : average train_loss=%f" % (iter, train_loss))
    else:
        if self.loss_type == "binary-classification":
            self.loss = BinomialDeviance(n_classes=dataset.get_label_size())
        elif self.loss_type == "regression":
            self.loss = LeastSquaresError(n_classes=1)
        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)
        for iter in range(1, self.max_iter + 1):
            subset = train_data
            if 0 < self.sample_rate < 1:
                subset = sample(subset, int(len(subset) * self.sample_rate))
            # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
            residual = self.loss.compute_residual(dataset, subset, f)
            leaf_nodes = []
            targets = residual
            tree = construct_decision_tree(
                dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points
            )
            self.trees[iter] = tree
            self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate)
            if isinstance(self.loss, RegressionLossFunction):
                # todo: evaluate the regression fit
                pass
            else:
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : train loss=%f" % (iter, train_loss))
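# The multi-class branch above relies on MultinomialDeviance.compute_residual, whose
# source is not shown here. As a hedged, minimal sketch (not the repository's actual
# implementation): the residual is the negative gradient of the multinomial deviance,
# i.e. the one-hot indicator of the true label minus the softmax probability derived
# from the current scores F_{m-1}. The accessor dataset.get_instance(id)["label"] is
# an assumption for illustration; the nested dict layout mirrors how the loop above
# indexes f[id][label] and residual[id][label].
import math

def compute_multinomial_residual_sketch(dataset, subset, f, label_valueset):
    residual = {}
    for id in subset:
        denom = sum(math.exp(f[id][k]) for k in label_valueset)  # softmax denominator
        y = dataset.get_instance(id)["label"]                    # assumed accessor
        residual[id] = {}
        for label in label_valueset:
            p = math.exp(f[id][label]) / denom
            residual[id][label] = (1.0 if y == label else 0.0) - p
    return residual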
def fit(self, dataset, train_data):
    if self.loss_type == 'multi-classification':
        label_valueset = dataset.get_label_valueset()
        self.loss = MultinomialDeviance(dataset.get_label_size(), label_valueset)
        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)
        for iter in range(1, self.max_iter + 1):
            subset = train_data
            if 0 < self.sample_rate < 1:
                subset = sample(subset, int(len(subset) * self.sample_rate))
            self.trees[iter] = dict()
            # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
            residual = self.loss.compute_residual(dataset, subset, f)
            for label in label_valueset:
                # samples hanging under each leaf node; only used once the recursion reaches max_depth
                # holds the leaf nodes; note that each leaf node stores the set of samples satisfying its conditions
                leaf_nodes = []
                targets = {}
                for id in subset:
                    targets[id] = residual[id][label]
                # for one specific label of the K classes, select max_depth features to build a decision tree
                tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes,
                                               self.max_depth, self.loss, self.split_points)
                self.trees[iter][label] = tree
                self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate, label)
            train_loss = self.compute_loss(dataset, train_data, f)
            print("iter%d : average train_loss=%f" % (iter, train_loss))
    else:
        if self.loss_type == 'binary-classification':
            self.loss = BinomialDeviance(n_classes=dataset.get_label_size())
        elif self.loss_type == 'regression':
            self.loss = LeastSquaresError(n_classes=1)
        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)
        for iter in range(1, self.max_iter + 1):  # Chao: m, the number of boosting steps fitted with decision trees
            subset = train_data
            if 0 < self.sample_rate < 1:
                # Chao: here only 80% of the ids are selected to construct the decision tree.
                # Chao: later, the remaining 20% of the ids simply obtain their values from the leaf nodes of the tree built here
                subset = sample(subset, int(len(subset) * self.sample_rate))
            # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
            residual = self.loss.compute_residual(dataset, subset, f)
            leaf_nodes = []
            targets = residual
            tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes,
                                           self.max_depth, self.loss, self.split_points)
            self.trees[iter] = tree
            self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate)  # Chao: update the value of every id (sample point)
            if isinstance(self.loss, RegressionLossFunction):
                # todo: evaluate the regression fit
                pass
            else:
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : train loss=%f" % (iter, train_loss))
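# The Chao notes above say that ids left out of the subsample still receive an updated
# F value through the fitted tree. A minimal sketch of what update_f_value is assumed
# to do in the single-output (regression / binary) case: ids that fell into a leaf get
# learn_rate times that leaf's fitted value added to F_{m-1}, while out-of-sample ids
# are routed down the tree instead. The names node.get_idset(), node.predict_value,
# tree.get_predict_value and dataset.get_instances_idset() are assumptions, not the
# repository's verified API.
def update_f_value_sketch(f, tree, leaf_nodes, subset, dataset, learn_rate):
    subset = set(subset)
    for node in leaf_nodes:
        for id in node.get_idset():          # samples that landed in this leaf
            f[id] += learn_rate * node.predict_value
    for id in set(dataset.get_instances_idset()) - subset:
        # ids outside the subsample: walk the tree to obtain their leaf value
        f[id] += learn_rate * tree.get_predict_value(dataset.get_instance(id))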
def fit(self, dataset, train_data):  # only looking at the regression case here
    self.loss = LeastSquaresError(n_classes=1)
    f = dict()  # stores the value of F_{m-1}
    self.loss.initialize(f, dataset)  # TODO f = {"id1": 0.0, "id2": 0.0}
    for iter in range(1, self.max_iter + 1):
        subset = train_data
        if 0 < self.sample_rate < 1:
            subset = sample(subset, int(len(subset) * self.sample_rate))
        # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
        residual = self.loss.compute_residual(dataset, subset, f)
        leaf_nodes = []
        targets = residual  # the fitting target becomes the residual
        tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes,
                                       self.max_depth, self.loss, self.split_points)
        self.trees[iter] = tree
        self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate)
        if isinstance(self.loss, RegressionLossFunction):
            # todo: evaluate the regression fit
            pass
        else:
            train_loss = self.compute_loss(dataset, train_data, f)
            print("iter%d : train loss=%f" % (iter, train_loss))
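# The regression-only walk-through above leaves LeastSquaresError implicit. A minimal
# sketch of what it is assumed to provide, consistent with the TODO comment
# f = {"id1": 0.0, "id2": 0.0}: initialize() starts F_0 at 0.0 for every id, and
# compute_residual() returns the negative gradient of the squared error,
# residual[id] = y_id - F_{m-1}(x_id). Accessor names are assumptions for illustration.
class LeastSquaresErrorSketch:
    def __init__(self, n_classes=1):
        self.K = n_classes

    def initialize(self, f, dataset):
        for id in dataset.get_instances_idset():   # assumed accessor over all sample ids
            f[id] = 0.0

    def compute_residual(self, dataset, subset, f):
        # negative gradient of 1/2 * (y - F)^2 with respect to F is simply y - F
        return {id: dataset.get_instance(id)["label"] - f[id] for id in subset}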
def fit(self, dataset, train_data):
    if self.loss_type == 'multi-classification':
        label_valueset = dataset.get_label_valueset()
        self.loss = MultinomialDeviance(dataset.get_label_size(), label_valueset)
        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)
        for iter in range(1, self.max_iter + 1):
            subset = train_data
            if 0 < self.sample_rate < 1:
                subset = sample(subset, int(len(subset) * self.sample_rate))
            self.trees[iter] = dict()
            # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
            residual = self.loss.compute_residual(dataset, subset, f)
            for label in label_valueset:
                # samples hanging under each leaf node; only used once the recursion reaches max_depth
                # holds the leaf nodes; note that each leaf node stores the set of samples satisfying its conditions
                leaf_nodes = []
                targets = {}
                for id in subset:
                    targets[id] = residual[id][label]
                # for one specific label of the K classes, select max_depth features to build a decision tree
                tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes,
                                               self.max_depth, self.loss, self.split_points)
                self.trees[iter][label] = tree
                self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate, label)
            train_loss = self.compute_loss(dataset, train_data, f)
            print("iter%d : average train_loss=%f" % (iter, train_loss))
    else:
        if self.loss_type == 'binary-classification':
            self.loss = BinomialDeviance(n_classes=dataset.get_label_size())  # binary classification
        elif self.loss_type == 'regression':
            self.loss = LeastSquaresError(n_classes=1)
        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)  # initialize F_{m-1}
        for iter in range(1, self.max_iter + 1):
            subset = train_data
            if 0 < self.sample_rate < 1:
                subset = sample(subset, int(len(subset) * self.sample_rate))  # Python's sampling function
            # use the negative gradient of the loss function as the residual approximation for the regression boosting tree
            # the residual is computed for every sample; residual is a dict keyed by sample id
            residual = self.loss.compute_residual(dataset, subset, f)
            leaf_nodes = []
            targets = residual  # the negative gradient is the fitting target for the tree
            # fit one tree
            tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes,
                                           self.max_depth, self.loss, self.split_points)
            # add the i-th tree
            self.trees[iter] = tree
            # update the F value of every sample
            self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate)
            # compute the training loss
            if isinstance(self.loss, RegressionLossFunction):
                # todo: evaluate the regression fit
                pass
            else:
                # classification
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : train loss=%f" % (iter, train_loss))
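# Hypothetical usage of fit(). The GBDT class name, its constructor arguments and the
# DataSet loader are assumptions chosen only to match the attributes referenced inside
# fit() (max_iter, sample_rate, learn_rate, max_depth, loss_type, split_points, trees);
# adapt them to the actual repository before running.
if __name__ == "__main__":
    dataset = DataSet("credit.data.csv")                    # assumed data loader
    model = GBDT(max_iter=20, sample_rate=0.8, learn_rate=0.5,
                 max_depth=7, loss_type="regression", split_points=0)
    model.fit(dataset, dataset.get_instances_idset())       # train_data: list of sample ids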