Example #1
 def fit(self, x, y, sample_weight, eps=1e-8):
     self._x, self._y = np.atleast_2d(x), np.array(y)
     self.sample_weight = sample_weight
     # Exit early if the first stopping criterion is met
     if self.stop1(eps):
         return
     # Instantiate a Cluster with this Node's data to compute the various information measures
     _cluster = Cluster(self._x, self._y, sample_weight, self.base)
     # For the root node, additionally compute the uncertainty of the data
     if self.is_root:
         if self.criterion == "gini":
             self.chaos = _cluster.gini()
         else:
             self.chaos = _cluster.ent()
     _max_gain, _chaos_lst = 0, []
     _max_feature = _max_tar = None
     # Iterate over the features that are still available for splitting
     for feat in self.feats:
         # For continuous features, or under the CART algorithm, additionally compute the set of candidate binary-split values
         if self.wc[feat]:
             _samples = np.sort(self._x.T[feat])
             _set = (_samples[:-1] + _samples[1:]) * 0.5
         elif self.is_cart:
             _set = self.tree.feature_sets[feat]
         # Iterate over the binary-split candidates and call the two-class information-measure method
         if self.is_cart or self.wc[feat]:
             for tar in _set:
                 _tmp_gain, _tmp_chaos_lst = _cluster.bin_info_gain(
                     feat, tar, criterion=self.criterion, get_chaos_lst=True, continuous=self.wc[feat])
                 if _tmp_gain > _max_gain:
                     (_max_gain, _chaos_lst), _max_feature, _max_tar = (_tmp_gain, _tmp_chaos_lst), feat, tar
         else:
             _tmp_gain, _tmp_chaos_lst = _cluster.info_gain(
                 feat, self.criterion, True, self.tree.feature_sets[feat])
             if _tmp_gain > _max_gain:
                 (_max_gain, _chaos_lst), _max_feature = (_tmp_gain, _tmp_chaos_lst), feat
     # Second stopping criterion: the best gain is too small to split on
     if self.stop2(_max_gain, eps):
         return
     self.feature_dim = _max_feature
     if self.is_cart or self.wc[_max_feature]:
         self.tar = _max_tar
         self._gen_children(_chaos_lst)
         # Merge the children (local pruning) if both are leaves of the same category
         if (self.left_child.category is not None and
                 self.left_child.category == self.right_child.category):
             self.prune()
             self.tree.reduce_nodes()
     else:
         self._gen_children(_chaos_lst)
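
The ent and gini calls on Cluster are the standard impurity measures over the label distribution. A minimal sketch of both, assuming unweighted samples (the real Cluster also handles sample weights and a configurable log base):

 import numpy as np

 def ent(y, base=2):
     # Shannon entropy of the label distribution
     _, counts = np.unique(y, return_counts=True)
     p = counts / counts.sum()
     return float(-(p * np.log(p) / np.log(base)).sum())

 def gini(y):
     # Gini impurity of the label distribution
     _, counts = np.unique(y, return_counts=True)
     p = counts / counts.sum()
     return float(1 - (p ** 2).sum())

 y = [0, 0, 1, 1, 1]
 print(ent(y))   # ~0.971
 print(gini(y))  # 0.48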
Example #2
 def get_threshold(self):
     # Only non-leaf nodes (category is None) carry a pruning threshold
     if self.category is None:
         rs = 0
         # Sum of each leaf's entropy weighted by its sample count
         for leaf in self.leafs.values():
             _cluster = Cluster(None, leaf["y"], None, self.base)
             rs += len(leaf["y"]) * _cluster.ent()
         # Node entropy minus the weighted leaf-entropy sum averaged over (#leaves - 1)
         return Cluster(self._x, self._y, None, self.base).ent() - rs / (len(self.leafs) - 1)
     # Leaves have nothing to prune
     return 0
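
A threshold of this shape typically drives bottom-up pruning: among all non-leaf nodes, the one with the smallest threshold is collapsed first. A minimal standalone sketch of that selection loop, using a hypothetical SimpleNode stand-in (not the Node class above):

 class SimpleNode:
     """Hypothetical stand-in for a fitted non-leaf node (illustration only)."""
     def __init__(self, name, ent, leaf_ents, leaf_sizes):
         self.name, self.ent = name, ent
         self.leaf_ents, self.leaf_sizes = leaf_ents, leaf_sizes

     def get_threshold(self):
         # Same shape as the method above: node entropy minus the
         # size-weighted sum of leaf entropies over (#leaves - 1)
         rs = sum(n * e for n, e in zip(self.leaf_sizes, self.leaf_ents))
         return self.ent - rs / (len(self.leaf_ents) - 1)

 nodes = [
     SimpleNode("a", ent=0.9, leaf_ents=[0.1, 0.2], leaf_sizes=[3, 5]),
     SimpleNode("b", ent=0.5, leaf_ents=[0.4, 0.45], leaf_sizes=[4, 4]),
 ]
 # Prune the node with the smallest threshold first
 victim = min(nodes, key=SimpleNode.get_threshold)
 print(victim.name)  # "b"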
Example #3
 def fit(self, x, y, sample_weight, feature_bound=None, eps=1e-8):
     self._x, self._y = np.atleast_2d(x), np.asarray(y)
     self.sample_weight = sample_weight
     if self.stop1(eps):
         return
     cluster = Cluster(self._x, self._y, sample_weight, self.base)
     if self.is_root:
         if self.criterion == "gini":
             self.chaos = cluster.gini()
         else:
             self.chaos = cluster.ent()
     max_gain, chaos_lst = 0, []
     max_feature = max_tar = None
     feat_len = len(self.feats)
     if feature_bound is None:
         indices = range(0, feat_len)
     elif feature_bound == "log":
         indices = np.random.permutation(feat_len)[:max(1, int(log2(feat_len)))]
     else:
         indices = np.random.permutation(feat_len)[:feature_bound]
     tmp_feats = [self.feats[i] for i in indices]
     xt, feat_sets = self._x.T, self.tree.feature_sets
     bin_ig, ig = cluster.bin_info_gain, cluster.info_gain
     for feat in tmp_feats:
         if self.wc[feat]:
             samples = np.sort(xt[feat])
             feat_set = (samples[:-1] + samples[1:]) * 0.5
         else:
             if self.is_cart:
                 feat_set = feat_sets[feat]
             else:
                 feat_set = None
         if self.is_cart or self.wc[feat]:
             for tar in feat_set:
                 tmp_gain, tmp_chaos_lst = bin_ig(
                     feat, tar, criterion=self.criterion, get_chaos_lst=True, continuous=self.wc[feat])
                 if tmp_gain > max_gain:
                     (max_gain, chaos_lst), max_feature, max_tar = (tmp_gain, tmp_chaos_lst), feat, tar
         else:
             tmp_gain, tmp_chaos_lst = ig(
                 feat, self.criterion, True, self.tree.feature_sets[feat])
             if tmp_gain > max_gain:
                 (max_gain, chaos_lst), max_feature = (tmp_gain, tmp_chaos_lst), feat
     if self.stop2(max_gain, eps):
         return
     self.feature_dim = max_feature
     if self.is_cart or self.wc[max_feature]:
         self.tar = max_tar
         self._gen_children(chaos_lst, feature_bound)
         if (self.left_child.category is not None and
                 self.left_child.category == self.right_child.category):
             self.prune()
             self.tree.reduce_nodes()
     else:
         self._gen_children(chaos_lst, feature_bound)
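
The feature_bound branch is random-forest-style feature subsampling: at each split, only a random subset of the candidate features is considered. A standalone sketch of just that index-selection logic (the function name is illustrative):

 import numpy as np
 from math import log2

 def pick_candidate_features(feats, feature_bound=None):
     feat_len = len(feats)
     if feature_bound is None:
         indices = range(feat_len)  # consider every feature
     elif feature_bound == "log":
         # random subset of size max(1, floor(log2(d))), as in random forests
         indices = np.random.permutation(feat_len)[:max(1, int(log2(feat_len)))]
     else:
         indices = np.random.permutation(feat_len)[:feature_bound]
     return [feats[i] for i in indices]

 print(pick_candidate_features(list("abcdefgh")))         # all 8 features
 print(pick_candidate_features(list("abcdefgh"), "log"))  # 3 random features
 print(pick_candidate_features(list("abcdefgh"), 2))      # 2 random features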
Example #4
 def fit(self, x, y, sample_weights, eps=1e-8):
     self.feed_data(x, y)
     self.sample_weights = sample_weights
     if self.stop1(eps):
         return
     _cluster = Cluster(self._x, self._y, sample_weights, self.base)
     _max_gain, _chaos_lst = 0, []
     _max_feature = _max_tar = None
     for feat in self.feats:
         if self.wc[feat]:
             _samples = np.sort(self._x.T[feat])
             _set = (_samples[:-1] + _samples[1:]) * 0.5
         else:
             if self.is_cart:
                 _set = self.tree.feature_sets[feat]
             else:
                 _set = None
         if self.is_cart or self.wc[feat]:
             for tar in _set:
                 _tmp_gain, _tmp_chaos_lst = _cluster.bin_info_gain(
                     feat, tar, criterion=self.criterion, get_chaos_lst=True, continuous=self.wc[feat])
                 if _tmp_gain > _max_gain:
                     (_max_gain, _chaos_lst), _max_feature, _max_tar = (_tmp_gain, _tmp_chaos_lst), feat, tar
         else:
             _tmp_gain, _tmp_chaos_lst = _cluster.info_gain(
                 feat, self.criterion, True, self.tree.feature_sets[feat])
             if _tmp_gain > _max_gain:
                 (_max_gain, _chaos_lst), _max_feature = (_tmp_gain, _tmp_chaos_lst), feat
     if self.stop2(_max_gain, eps):
         return
     self.feature_dim = _max_feature
     if self.is_cart or self.wc[_max_feature]:
         self.tar = _max_tar
         self._gen_children(_chaos_lst)
         if (self.left_child.category is not None and
                 self.left_child.category == self.right_child.category):
             self.prune()
             self.tree.reduce_nodes()
     else:
         self._gen_children(_chaos_lst)
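
The bin_info_gain calls reduce every split to a two-way partition: x <= tar versus x > tar for a continuous feature, x == tar versus x != tar for a discrete one. A self-contained sketch of what such a binary information gain computes, using plain entropy (the real Cluster method also supports the Gini criterion and sample weights):

 import numpy as np

 def entropy(y):
     _, counts = np.unique(y, return_counts=True)
     p = counts / counts.sum()
     return float(-(p * np.log2(p)).sum())

 def bin_info_gain(x_col, y, tar, continuous):
     # Two-way partition induced by the binary criterion
     mask = x_col <= tar if continuous else x_col == tar
     chaos_lst = [entropy(y[mask]), entropy(y[~mask])]
     weights = np.array([mask.sum(), (~mask).sum()]) / len(y)
     return entropy(y) - weights @ np.array(chaos_lst), chaos_lst

 x_col = np.array([1.0, 2.0, 3.0, 4.0])
 y = np.array([0, 0, 1, 1])
 gain, chaos_lst = bin_info_gain(x_col, y, tar=2.5, continuous=True)
 print(gain)  # 1.0: the threshold 2.5 separates the two classes perfectly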
Example #5
 def fit(self, x, y, sample_weight, feature_bound=None, eps=1e-8):
     """
     1.根据划分标准将数据划分为若干份
     2.依次用这若干份数据实例化新Node(新Node即是当前Node的子节点),同时将当前Node的相关信息传递新的Node
     这里需要注意的是,如果划分标准是离散型的特征的话:
      .若算法是ID3或C4.5,需将该特征对应的维度从新的Node的self.feats属性中除去
      .若算法是CART,需要将二分标准从新的Node的二分标准取值集合中除去。
     最后对新的Node调用fit方法,完成递归
     """
     self._x, self._y = np.atleast_2d(x), np.asarray(y)
     self.sample_weight = sample_weight
     if self.stop1(eps):
         return
     cluster = Cluster(self._x, self._y, sample_weight, self.base)
     if self.is_root:
         if self.criterion == "gini":
             self.chaos = cluster.gini()
         else:
             self.chaos = cluster.ent()
     max_gain, chaos_lst = 0, []
     max_feature = max_tar = None
     feat_len = len(self.feats)
     if feature_bound is None:
         indices = range(0, feat_len)
     elif feature_bound == "log":
         indices = np.random.permutation(feat_len)[:max(1, int(log2(feat_len)))]
     else:
         indices = np.random.permutation(feat_len)[:feature_bound]
     tmp_feats = [self.feats[i] for i in indices]
     xt, feat_sets = self._x.T, self.tree.feature_sets
     bin_ig, ig = cluster.bin_info_gain, cluster.info_gain
     # Iterate over the candidate dimensions (e.g. [0, 1, 2, 3]) and pick the
     # feature whose split minimizes the uncertainty
     for feat in tmp_feats:
         if self.wc[feat]:  # wc = whether_continuous
             samples = np.sort(xt[feat])
             # Candidate thresholds: midpoints of consecutive sorted values
             feat_set = (samples[:-1] + samples[1:]) * 0.5
         else:  # discrete feature
             if self.is_cart:
                 # The set of values taken by feature feat, e.g. {'y', 'p'}
                 feat_set = feat_sets[feat]
             else:
                 feat_set = None
         if self.is_cart or self.wc[feat]:
             for tar in feat_set:
                 tmp_gain, tmp_chaos_lst = bin_ig(
                     feat, tar, criterion=self.criterion, get_chaos_lst=True, continuous=self.wc[feat])
                 if tmp_gain > max_gain:
                     (max_gain, chaos_lst), max_feature, max_tar = (tmp_gain, tmp_chaos_lst), feat, tar
         else:
             tmp_gain, tmp_chaos_lst = ig(feat, self.criterion, True, self.tree.feature_sets[feat])
             if tmp_gain > max_gain:
                 (max_gain, chaos_lst), max_feature = (tmp_gain, tmp_chaos_lst), feat
     if self.stop2(max_gain, eps):
         return
     self.feature_dim = max_feature  # the dimension that achieved the largest information gain
     if self.is_cart or self.wc[max_feature]:
         self.tar = max_tar
         # Generate the children according to the splitting criterion
         self._gen_children(chaos_lst, feature_bound)
         # If the left and right children are both leaves of the same category,
         # merge them (local pruning)
         if (self.left_child.category is not None and
                 self.left_child.category == self.right_child.category):
             self.prune()
             # Ask the Tree to remove the pruned children from its list of all nodes
             self.tree.reduce_nodes()
     else:
         # Generate the children according to the splitting criterion
         self._gen_children(chaos_lst, feature_bound)
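
For continuous features, the candidate binary-split thresholds are the midpoints between consecutive sorted sample values, which is what the (samples[:-1] + samples[1:]) * 0.5 line produces:

 import numpy as np

 values = np.array([3.0, 1.0, 2.0, 4.0])
 samples = np.sort(values)                      # [1. 2. 3. 4.]
 midpoints = (samples[:-1] + samples[1:]) * 0.5
 print(midpoints)                               # [1.5 2.5 3.5]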