def fit(self, x, y, sample_weight, eps=1e-8):
    self._x, self._y = np.atleast_2d(x), np.array(y)
    self.sample_weight = sample_weight
    # If the first stopping criterion is met, exit the function body
    if self.stop1(eps):
        return
    # Instantiate a Cluster with this Node's data to compute the information measures
    _cluster = Cluster(self._x, self._y, sample_weight, self.base)
    # For the root node, the uncertainty of the data must also be computed
    if self.is_root:
        if self.criterion == "gini":
            self.chaos = _cluster.gini()
        else:
            self.chaos = _cluster.ent()
    _max_gain, _chaos_lst = 0, []
    _max_feature = _max_tar = None
    # Iterate over the features that are still available for splitting
    for feat in self.feats:
        # For continuous features or the CART algorithm, the set of candidate
        # binary-split standards must be computed as well
        if self.wc[feat]:
            _samples = np.sort(self._x.T[feat])
            _set = (_samples[:-1] + _samples[1:]) * 0.5
        elif self.is_cart:
            _set = self.tree.feature_sets[feat]
        # Iterate over the binary standards and call the binary-split
        # information-measure method
        if self.is_cart or self.wc[feat]:
            for tar in _set:
                _tmp_gain, _tmp_chaos_lst = _cluster.bin_info_gain(
                    feat, tar, criterion=self.criterion,
                    get_chaos_lst=True, continuous=self.wc[feat])
                if _tmp_gain > _max_gain:
                    (_max_gain, _chaos_lst), _max_feature, _max_tar = (
                        _tmp_gain, _tmp_chaos_lst), feat, tar
        else:
            _tmp_gain, _tmp_chaos_lst = _cluster.info_gain(
                feat, self.criterion, True, self.tree.feature_sets[feat])
            if _tmp_gain > _max_gain:
                (_max_gain, _chaos_lst), _max_feature = (
                    _tmp_gain, _tmp_chaos_lst), feat
    if self.stop2(_max_gain, eps):
        return
    self.feature_dim = _max_feature
    if self.is_cart or self.wc[_max_feature]:
        self.tar = _max_tar
        self._gen_children(_chaos_lst)
        if (self.left_child.category is not None and
                self.left_child.category == self.right_child.category):
            self.prune()
            self.tree.reduce_nodes()
    else:
        self._gen_children(_chaos_lst)
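The Cluster helper that fit instantiates is not shown in this section. A minimal sketch of the two impurity measures it must expose, ent and gini, could look like the following; the constructor signature is inferred from the calls Cluster(x, y, sample_weight, base) above, and the bin_info_gain/info_gain methods are omitted. This is an assumption, not the book's actual implementation:

import math
import numpy as np

class Cluster:
    # Minimal sketch: holds a chunk of labels and computes impurity measures.
    # Signature inferred from the calls Cluster(x, y, sample_weight, base).
    def __init__(self, x, y, sample_weight=None, base=2):
        self._x, self._y = x, np.asarray(y)
        classes = np.unique(self._y)
        if sample_weight is None:
            # Unweighted: every sample counts once
            counts = np.array([np.sum(self._y == c) for c in classes])
        else:
            # Weighted: accumulate sample weights per class
            w = np.asarray(sample_weight)
            counts = np.array([w[self._y == c].sum() for c in classes])
        self._p = counts / counts.sum()
        self.base = base

    def ent(self):
        # Shannon entropy in the configured logarithm base
        return -sum(p * math.log(p, self.base) for p in self._p if p > 0)

    def gini(self):
        # Gini impurity: 1 minus the sum of squared class probabilities
        return 1 - float(np.sum(self._p ** 2))

For instance, Cluster(None, [0, 0, 1], None, 2).ent() evaluates to about 0.918 under this sketch.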
def get_threshold(self):
    if self.category is None:
        rs = 0
        for leaf in self.leafs.values():
            _cluster = Cluster(None, leaf["y"], None, self.base)
            rs += len(leaf["y"]) * _cluster.ent()
        return Cluster(self._x, self._y, None, self.base).ent() - rs / (len(self.leafs) - 1)
    return 0
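With made-up entropies, the arithmetic get_threshold performs looks like this (a toy illustration only; how the surrounding pruning code interprets the resulting value is not shown in this section):

# Toy evaluation of the get_threshold expression (illustrative numbers).
node_ent = 0.971  # entropy of the node's own data
# (labels, entropy) pairs standing in for self.leafs.values()
leaves = [([0, 0, 1], 0.918), ([1, 1], 0.0)]
rs = sum(len(y) * ent for y, ent in leaves)    # 3*0.918 + 2*0.0 = 2.754
threshold = node_ent - rs / (len(leaves) - 1)  # 0.971 - 2.754/1 = -1.783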
def fit(self, x, y, sample_weight, feature_bound=None, eps=1e-8):
    self._x, self._y = np.atleast_2d(x), np.asarray(y)
    self.sample_weight = sample_weight
    if self.stop1(eps):
        return
    cluster = Cluster(self._x, self._y, sample_weight, self.base)
    if self.is_root:
        if self.criterion == "gini":
            self.chaos = cluster.gini()
        else:
            self.chaos = cluster.ent()
    max_gain, chaos_lst = 0, []
    max_feature = max_tar = None
    feat_len = len(self.feats)
    if feature_bound is None:
        indices = range(0, feat_len)
    elif feature_bound == "log":
        indices = np.random.permutation(feat_len)[:max(1, int(log2(feat_len)))]
    else:
        indices = np.random.permutation(feat_len)[:feature_bound]
    tmp_feats = [self.feats[i] for i in indices]
    xt, feat_sets = self._x.T, self.tree.feature_sets
    bin_ig, ig = cluster.bin_info_gain, cluster.info_gain
    for feat in tmp_feats:
        if self.wc[feat]:
            samples = np.sort(xt[feat])
            feat_set = (samples[:-1] + samples[1:]) * 0.5
        else:
            if self.is_cart:
                feat_set = feat_sets[feat]
            else:
                feat_set = None
        if self.is_cart or self.wc[feat]:
            for tar in feat_set:
                tmp_gain, tmp_chaos_lst = bin_ig(
                    feat, tar, criterion=self.criterion,
                    get_chaos_lst=True, continuous=self.wc[feat])
                if tmp_gain > max_gain:
                    (max_gain, chaos_lst), max_feature, max_tar = (
                        tmp_gain, tmp_chaos_lst), feat, tar
        else:
            tmp_gain, tmp_chaos_lst = ig(
                feat, self.criterion, True, self.tree.feature_sets[feat])
            if tmp_gain > max_gain:
                (max_gain, chaos_lst), max_feature = (tmp_gain, tmp_chaos_lst), feat
    if self.stop2(max_gain, eps):
        return
    self.feature_dim = max_feature
    if self.is_cart or self.wc[max_feature]:
        self.tar = max_tar
        self._gen_children(chaos_lst, feature_bound)
        if (self.left_child.category is not None and
                self.left_child.category == self.right_child.category):
            self.prune()
            self.tree.reduce_nodes()
    else:
        self._gen_children(chaos_lst, feature_bound)
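The feature_bound parameter is what makes this version usable inside a random forest: instead of scanning every remaining feature, each split scans a random subset. A standalone illustration of the three branches (the helper name pick_indices is hypothetical):

import math
import numpy as np

def pick_indices(feat_len, feature_bound=None):
    # None: scan all features; "log": a random log2-sized subset;
    # an int: a random subset of that size (random-forest-style feature bagging)
    if feature_bound is None:
        return range(feat_len)
    if feature_bound == "log":
        return np.random.permutation(feat_len)[:max(1, int(math.log2(feat_len)))]
    return np.random.permutation(feat_len)[:feature_bound]

print(list(pick_indices(8)))         # [0, 1, ..., 7]
print(list(pick_indices(8, "log")))  # e.g. [5, 2, 7]
print(list(pick_indices(8, 4)))      # e.g. [1, 6, 0, 3]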
def fit(self, x, y, sample_weights, eps=1e-8):
    self.feed_data(x, y)
    self.sample_weights = sample_weights
    if self.stop1(eps):
        return
    _cluster = Cluster(self._x, self._y, sample_weights, self.base)
    _max_gain, _chaos_lst = 0, []
    _max_feature = _max_tar = None
    for feat in self.feats:
        if self.wc[feat]:
            _samples = np.sort(self._x.T[feat])
            _set = (_samples[:-1] + _samples[1:]) * 0.5
        else:
            if self.is_cart:
                _set = self.tree.feature_sets[feat]
            else:
                _set = None
        if self.is_cart or self.wc[feat]:
            for tar in _set:
                _tmp_gain, _tmp_chaos_lst = _cluster.bin_info_gain(
                    feat, tar, criterion=self.criterion,
                    get_chaos_lst=True, continuous=self.wc[feat])
                if _tmp_gain > _max_gain:
                    (_max_gain, _chaos_lst), _max_feature, _max_tar = (
                        _tmp_gain, _tmp_chaos_lst), feat, tar
        else:
            _tmp_gain, _tmp_chaos_lst = _cluster.info_gain(
                feat, self.criterion, True, self.tree.feature_sets[feat])
            if _tmp_gain > _max_gain:
                (_max_gain, _chaos_lst), _max_feature = (
                    _tmp_gain, _tmp_chaos_lst), feat
    if self.stop2(_max_gain, eps):
        return
    self.feature_dim = _max_feature
    if self.is_cart or self.wc[_max_feature]:
        self.tar = _max_tar
        self._gen_children(_chaos_lst)
        if (self.left_child.category is not None and
                self.left_child.category == self.right_child.category):
            self.prune()
            self.tree.reduce_nodes()
    else:
        self._gen_children(_chaos_lst)
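For a continuous feature, every version of fit above builds the candidate binary-split set from the midpoints of adjacent sorted sample values; a quick standalone check of that one-liner:

import numpy as np

samples = np.sort(np.array([2.0, 1.0, 4.0, 3.0]))  # -> [1. 2. 3. 4.]
thresholds = (samples[:-1] + samples[1:]) * 0.5    # -> [1.5 2.5 3.5]
print(thresholds)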
def fit(self, x, y, sample_weight, feature_bound=None, eps=1e-8): """ 1.根据划分标准将数据划分为若干份 2.依次用这若干份数据实例化新Node(新Node即是当前Node的子节点),同时将当前Node的相关信息传递新的Node 这里需要注意的是,如果划分标准是离散型的特征的话: .若算法是ID3或C4.5,需将该特征对应的维度从新的Node的self.feats属性中除去 .若算法是CART,需要将二分标准从新的Node的二分标准取值集合中除去。 最后对新的Node调用fit方法,完成递归 """ self._x, self._y = np.atleast_2d(x), np.asarray(y) self.sample_weight = sample_weight if self.stop1(eps): return cluster = Cluster(self._x, self._y, sample_weight, self.base) if self.is_root: if self.criterion =="gini": self.chaos = cluster.gini() else: self.chaos = cluster.ent() max_gain, chaos_lst = 0, [] max_feature = max_tar = None feat_len = len(self.feats) if feature_bound is None: indices = range(0, feat_len) elif feature_bound == "log": indices = np.random.permutation(feat_len)[:max(1, int(log2(feat_len)))] else: indices = np.random.permutation(feat_len)[:feature_bound] tmp_feats = [self.feats[i] for i in indices] xt, feat_sets = self._x.T, self.tree.feature_sets bin_ig, ig = cluster.bin_info_gain, cluster.info_gain for feat in tmp_feats:#[0.1,2,3]遍历每个维度,通过遍历这些维度特征,选取使得不确定性最小的特征 if self.wc[feat]: #是否连续 wc=whether_continuous samples = np.sort(xt[feat]) feat_set = (samples[:-1] + samples[1:]) *5 else:#非连续 if self.is_cart: feat_set = feat_sets[feat] #取第feat维的特征类别 #{'y', 'p'} else: feat_set = None if self.is_cart or self.wc[feat]: for tar in feat_set: tmp_gain, tmp_chaos_lst = bin_ig(feat,tar,criterion = self.criterion, get_chaos_lst=True,continuous=self.wc[feat]) if tmp_gain >max_gain: (max_gain, chaos_lst), max_feature,max_tar = (tmp_gain, tmp_chaos_lst), feat, tar else: tmp_gain, tmp_chaos_lst = ig(feat, self.criterion, True, self.tree.feature_sets[feat]) if tmp_gain > max_gain: (max_gain, chaos_lst), max_feature = (tmp_gain, tmp_chaos_lst), feat if self.stop2(max_gain, eps): return self.feature_dim = max_feature #max_feature记录划分后获取最大信息增益的维度 if self.is_cart or self.wc[max_feature]: self.tar = max_tar #调用根据划分标准进行生成的方法 self._gen_children(chaos_lst, feature_bound) #如果该Node的左子节点和右子节点都是叶节点且所属类别一样,那么就将他们合并,亦进行局部剪枝 if (self.left_child.category is not None and self.left_child.category == self.right_child.category): self.prune() #调用Tree的相关方法,将被剪掉的该Node的左右子节点从tree的记录所有Node的列表nodes中除去 self.tree.reduce_nodes() else: #调用根据划分标准进行生成的方法 self._gen_children(chaos_lst,feature_bound)