Code example #1
File: model.py Project: huangjinb/GBDT
    def fit(self, dataset, train_data):
        if self.loss_type == "multi-classification":
            label_valueset = dataset.get_label_valueset()
            self.loss = MultinomialDeviance(dataset.get_label_size(), label_valueset)
            f = dict()  # stores the value of F_{m-1}
            self.loss.initialize(f, dataset)
            for iter in range(1, self.max_iter + 1):
                subset = train_data
                if 0 < self.sample_rate < 1:
                    subset = sample(subset, int(len(subset) * self.sample_rate))
                self.trees[iter] = dict()
                # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
                residual = self.loss.compute_residual(dataset, subset, f)
                for label in label_valueset:
                    # the samples hanging under the leaf nodes are only used once the iteration reaches max-depth
                    # leaf_nodes collects the individual leaf nodes; note that each leaf stores the set of sample points matching its conditions
                    leaf_nodes = []
                    targets = {}
                    for id in subset:
                        targets[id] = residual[id][label]
                    # for one specific label of the K-class problem, select max-depth features to build the decision tree
                    tree = construct_decision_tree(
                        dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points
                    )
                    self.trees[iter][label] = tree
                    self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate, label)
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : average train_loss=%f" % (iter, train_loss))

        else:
            if self.loss_type == "binary-classification":
                self.loss = BinomialDeviance(n_classes=dataset.get_label_size())
            elif self.loss_type == "regression":
                self.loss = LeastSquaresError(n_classes=1)

            f = dict()  # stores the value of F_{m-1}
            self.loss.initialize(f, dataset)
            for iter in range(1, self.max_iter + 1):
                subset = train_data
                if 0 < self.sample_rate < 1:
                    subset = sample(subset, int(len(subset) * self.sample_rate))
                # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
                residual = self.loss.compute_residual(dataset, subset, f)
                leaf_nodes = []
                targets = residual
                tree = construct_decision_tree(
                    dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points
                )
                self.trees[iter] = tree
                self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate)
                if isinstance(self.loss, RegressionLossFunction):
                    # TODO: evaluate the regression performance
                    pass
                else:
                    train_loss = self.compute_loss(dataset, train_data, f)
                    print("iter%d : train loss=%f" % (iter, train_loss))
Code example #2
File: model.py Project: wcfrank/GBDT
    def fit(self, dataset, train_data):
        if self.loss_type == 'multi-classification':
            label_valueset = dataset.get_label_valueset()
            self.loss = MultinomialDeviance(dataset.get_label_size(), label_valueset)
            f = dict()  # stores the value of F_{m-1}
            self.loss.initialize(f, dataset)
            for iter in range(1, self.max_iter+1):
                subset = train_data
                if 0 < self.sample_rate < 1:
                    subset = sample(subset, int(len(subset)*self.sample_rate))
                self.trees[iter] = dict()
                # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
                residual = self.loss.compute_residual(dataset, subset, f)
                for label in label_valueset:
                    # the samples hanging under the leaf nodes are only used once the iteration reaches max-depth
                    # leaf_nodes collects the individual leaf nodes; note that each leaf stores the set of sample points matching its conditions
                    leaf_nodes = []
                    targets = {}
                    for id in subset:
                        targets[id] = residual[id][label]
                    # for one specific label of the K-class problem, select max-depth features to build the decision tree
                    tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points)
                    self.trees[iter][label] = tree
                    self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate, label)
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : average train_loss=%f" % (iter, train_loss))

        else:
            if self.loss_type == 'binary-classification':
                self.loss = BinomialDeviance(n_classes=dataset.get_label_size())
            elif self.loss_type == 'regression':
                self.loss = LeastSquaresError(n_classes=1)

            f = dict()  # stores the value of F_{m-1}
            self.loss.initialize(f, dataset)
            for iter in range(1, self.max_iter+1):  # Chao: m, the number of decision-tree fitting steps
                subset = train_data
                if 0 < self.sample_rate < 1:
                    # Chao: only 80% of the ids are selected here to build the decision tree.
                    # Chao: the remaining 20% of the ids will only receive their values through the leaf nodes of this finished tree
                    subset = sample(subset, int(len(subset)*self.sample_rate))
                # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
                residual = self.loss.compute_residual(dataset, subset, f)
                leaf_nodes = []
                targets = residual
                tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points)
                self.trees[iter] = tree
                self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate)  # Chao: update the value of each Id (sample point)
                if isinstance(self.loss, RegressionLossFunction):
                    # TODO: evaluate the regression performance
                    pass
                else:
                    train_loss = self.compute_loss(dataset, train_data, f)
                    print("iter%d : train loss=%f" % (iter,train_loss))
Code example #3
    def fit(self, dataset, train_data):
        # I only look at the regression case
        self.loss = LeastSquaresError(n_classes=1)

        f = dict()  # stores the value of F_{m-1}
        self.loss.initialize(f, dataset)  # TODO f = {"id1":0.0, "id2":0.0}
        for iter in range(1, self.max_iter + 1):
            subset = train_data
            if 0 < self.sample_rate < 1:
                subset = sample(subset, int(len(subset) * self.sample_rate))
            # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
            residual = self.loss.compute_residual(dataset, subset, f)
            leaf_nodes = []
            targets = residual  # the target becomes the residual
            tree = construct_decision_tree(dataset, subset, targets, 0,
                                           leaf_nodes, self.max_depth,
                                           self.loss, self.split_points)
            self.trees[iter] = tree
            self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset,
                                     self.learn_rate)
            if isinstance(self.loss, RegressionLossFunction):
                # TODO: evaluate the regression performance
                pass
            else:
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : train loss=%f" % (iter, train_loss))
Code example #4
File: model.py Project: hikekang/hkx_tf_practice
    def fit(self, dataset, train_data):
        if self.loss_type == 'multi-classification':
            label_valueset = dataset.get_label_valueset()
            self.loss = MultinomialDeviance(dataset.get_label_size(),
                                            label_valueset)
            f = dict()  # stores the value of F_{m-1}
            self.loss.initialize(f, dataset)
            for iter in range(1, self.max_iter + 1):
                subset = train_data
                if 0 < self.sample_rate < 1:
                    subset = sample(subset,
                                    int(len(subset) * self.sample_rate))
                self.trees[iter] = dict()
                # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
                residual = self.loss.compute_residual(dataset, subset, f)
                for label in label_valueset:
                    # the samples hanging under the leaf nodes are only used once the iteration reaches max-depth
                    # leaf_nodes collects the individual leaf nodes; note that each leaf stores the set of sample points matching its conditions
                    leaf_nodes = []
                    targets = {}
                    for id in subset:
                        targets[id] = residual[id][label]
                    # for one specific label of the K-class problem, select max-depth features to build the decision tree
                    tree = construct_decision_tree(dataset, subset, targets, 0,
                                                   leaf_nodes, self.max_depth,
                                                   self.loss,
                                                   self.split_points)
                    self.trees[iter][label] = tree
                    self.loss.update_f_value(f, tree, leaf_nodes, subset,
                                             dataset, self.learn_rate, label)
                train_loss = self.compute_loss(dataset, train_data, f)
                print("iter%d : average train_loss=%f" % (iter, train_loss))

        else:
            if self.loss_type == 'binary-classification':
                self.loss = BinomialDeviance(
                    n_classes=dataset.get_label_size())  # binary classification
            elif self.loss_type == 'regression':
                self.loss = LeastSquaresError(n_classes=1)

            f = dict()  # stores the value of F_{m-1}
            self.loss.initialize(f, dataset)  # initialize F_{m-1}
            for iter in range(1, self.max_iter + 1):
                subset = train_data
                if 0 < self.sample_rate < 1:
                    subset = sample(subset, int(
                        len(subset) * self.sample_rate))  # Python's sampling function
                # use the negative gradient of the loss function as the approximate residual for the regression boosting tree
                residual = self.loss.compute_residual(
                    dataset, subset, f)  # computed for every sample; residual is a dict keyed by sample id
                leaf_nodes = []
                targets = residual  # the negative gradient is the tree's fitting target
                # fit one tree
                tree = construct_decision_tree(dataset, subset, targets, 0,
                                               leaf_nodes, self.max_depth,
                                               self.loss, self.split_points)
                # add the i-th tree
                self.trees[iter] = tree
                # update the F value of each sample
                self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset,
                                         self.learn_rate)
                # compute the training loss
                if isinstance(self.loss, RegressionLossFunction):
                    # TODO: evaluate the regression performance
                    pass
                else:  # classification
                    train_loss = self.compute_loss(dataset, train_data, f)
                    print("iter%d : train loss=%f" % (iter, train_loss))