Example no. 1
0
 def _build_leaf(self, y):
     """Build a leaf node: one ``(label, count)`` pair per known class.

     Classes that never occur in *y* are included with a count of 0, so
     every leaf covers the full class-label set in the same order.
     """
     freq = FreqDict(y)
     # NOTE: membership must be tested against freq.keys() -- FreqDict
     # itself iterates as (key, count) pairs, not as keys.
     return [(label, freq[label] if label in freq.keys() else 0)
             for label in self._class_label]
Example no. 2
0
 def predict(self, X):
     """Predict one class label per row of X by K-nearest-neighbour vote.

     Two search back-ends are supported, selected by ``self._search_mode``:
     'kd_tree' queries the pre-built kd-tree stored in the parameters,
     'brutal' computes the distance to every stored training point.

     Returns a list with one predicted label per input row.
     Raises ValueError for an unknown search mode.
     """
     assert self._is_trained, 'model must be trained before predict.'
     pred = list()
     if self._search_mode == 'kd_tree':
         kd_tree = self._parameter['kd_tree']
         # Cannot ask for more neighbours than points stored in the tree.
         K = min(self._K, kd_tree.nSize)
         for i in xrange(X.shape[0]):
             neighbor = kd_tree.search(kd_tree.root, X[i, :], K)
             # NOTE(review): FreqDict(..., reverse=True) presumably orders
             # keys by descending frequency, so keys()[0] is the majority
             # label -- confirm against FreqDict's definition.
             fd = FreqDict([v.y for v in neighbor], reverse=True)
             pred.append(fd.keys()[0])
             self._logger.info(
                 'progress : %.2f %%\tsearch ratio : %f' % (float(i) / X.shape[0] * 100, kd_tree.get_search_ratio()))
     elif self._search_mode == 'brutal':
         # Hoist the parameter lookups out of the per-row loop.
         neighbor_X = self._parameter['neighbor_X']
         neighbor_y = self._parameter['neighbor_y']
         K = min(self._K, len(neighbor_y))
         for i in xrange(X.shape[0]):
             # Vectorized Euclidean distance from X[i] to every stored
             # training row (replaces a per-row Python loop; same values).
             dist = np.linalg.norm(X[i, :] - neighbor_X, axis=1)
             # Indices of the K closest training rows.
             indices = np.argsort(dist)[:K]
             fd = FreqDict(list(neighbor_y[indices]), reverse=True)
             pred.append(fd.keys()[0])
             self._logger.info('progress: %.2f %%' % (float(i) / X.shape[0] * 100))
     else:
         raise ValueError('unknown search mode: %r' % self._search_mode)
     return pred
Example no. 3
0
def entropy(x):
    """Return the Shannon entropy (in nats) of the label sequence *x*.

    Parameters
    ----------
    x : sequence of hashable labels (list, 1-D array, ...).

    Returns
    -------
    float : sum over distinct labels of ``-p * log(p)``; 0.0 when *x*
        contains a single distinct label.
    """
    # Stdlib Counter gives the same label -> count mapping as the
    # project's hand-rolled FreqDict; only the counts are needed here.
    from collections import Counter
    nSize = len(x)
    result = 0.
    for v in Counter(list(x)).values():
        prob = float(v) / nSize
        result += -prob * np.log(prob)
    return result
Example no. 4
0
def entropy(x):
    """Shannon entropy (natural log) of the label sequence *x*.

    Returns the float ``sum(-p * log(p))`` over the distinct labels,
    where ``p`` is each label's relative frequency.
    """
    total = len(x)
    freq = FreqDict(list(x))
    return sum((-float(cnt) / total * np.log(float(cnt) / total)
                for cnt in freq.values()), 0.)
Example no. 5
0
def condition_entropy(x, cond):
    """Return the conditional entropy H(x | cond).

    Parameters
    ----------
    x : 1-D numpy array of labels.
    cond : 1-D numpy array, same shape as *x*, of conditioning values.

    For each distinct value ``k`` of *cond*, the entropy of the slice
    ``x[cond == k]`` is weighted by ``P(cond == k)`` and summed.
    """
    assert x.shape == cond.shape, 'input is invalid.'
    # Stdlib Counter replaces the project FreqDict (which iterated as
    # (key, count) pairs) and the throwaway probability dict.
    from collections import Counter
    nSize = len(x)
    result = 0.
    for k, cnt in Counter(list(cond)).items():
        result += (float(cnt) / nSize) * entropy(x[cond == k])
    return result
Example no. 6
0
def condition_entropy(x, cond):
    """Return the conditional entropy H(x | cond).

    For each distinct value ``k`` of *cond*, the entropy of the slice
    ``x[cond == k]`` is weighted by ``P(cond == k)`` and summed.
    Both arguments must be same-shape 1-D numpy arrays.
    """
    assert x.shape == cond.shape, 'input is invalid.'
    nSize = len(x)
    fd = FreqDict(list(cond))
    # NOTE(review): FreqDict appears to iterate as (key, count) pairs
    # (the same pattern is used elsewhere in this file) -- here it is
    # collapsed into a plain {value: probability} dict.
    fd = {k: float(v) / nSize for k, v in fd}
    result = 0.
    # Weighted sum: P(cond == k) * H(x restricted to rows where cond == k).
    for k, v in fd.iteritems():
        result += v * entropy(x[cond == k])
    return result
Example no. 7
0
 def _build_tree(self, X, y, used_feat):
     """Recursively grow an ID3-style decision tree node.

     Returns either a leaf (built from the class frequencies of *y*)
     or ``{feature_index: {value: subtree, ..., '__default__': leaf}}``.
     *used_feat* is the set of feature indices already consumed on the
     path from the root; it is deep-copied so siblings are unaffected.
     """
     # Stop conditions: pure node, only one column left, or too few samples.
     if (len(np.unique(y)) == 1 or X.shape[1] == 1
             or len(y) < self._min_split):
         return self._build_leaf(FreqDict(y))
     remaining = copy.deepcopy(used_feat)
     feat = self._choose_feature(X, y, remaining)
     # No usable feature left -> close the branch with a leaf.
     if feat is None:
         return self._build_leaf(FreqDict(y))
     remaining.add(feat)
     # '__default__' handles unseen feature values at prediction time.
     children = {'__default__': self._build_leaf(FreqDict(y))}
     for value in np.unique(X[:, feat]):
         mask = X[:, feat] == value
         children[value] = self._build_tree(X[mask], y[mask], remaining)
     return {feat: children}
Example no. 8
0
    def fit(self, X, y):
        """Fit, or incrementally update, a naive-Bayes model with
        add-one (Laplace) smoothing.

        Parameters
        ----------
        X : 2-D array of discrete feature values, one sample per row.
        y : 1-D array of class labels aligned with the rows of X.

        On the first call the per-column value vocabulary and the count
        tables are built from scratch; on later calls the new batch's
        counts are folded into the stored tables.  Log probabilities are
        recomputed at the end of every call.
        """
        assert self.__check_valid(X, y), 'input is invalid.'
        if self._is_trained is False:
            # First fit: record dimensions and the set of observed values
            # per feature column (the model's vocabulary).
            self._nFeat = X.shape[1]
            self._nClass = len(np.unique(y))
            self.feat_set = dict()
            for icol in range(X.shape[1]):
                self.feat_set[icol] = list(np.unique(X[:, icol]))
        nSize = X.shape[0]
        # Class-label frequencies for this batch.  NOTE(review): FreqDict
        # appears to iterate as (key, count) pairs -- confirm against its
        # definition.
        freq_y = {k: v for k, v in FreqDict(list(y))}
        # cond_freq_feat[class][column][value] -> occurrences of `value`
        # in `column` among samples of `class`.
        cond_freq_feat = {
            k: {i: defaultdict(int)
                for i in range(X.shape[1])}
            for k in freq_y.keys()
        }
        # Seed every known (class, column, value) cell with 1: the
        # add-one smoothing term.
        for c in freq_y.keys():
            for icol in self.feat_set.keys():
                for feat_val in self.feat_set[icol]:
                    cond_freq_feat[c][icol][feat_val] = 1
        # Accumulate this batch's raw counts on top of the smoothing seed.
        for irow in range(X.shape[0]):
            for icol in range(X.shape[1]):
                cond_freq_feat[y[irow]][icol][X[irow, icol]] += 1
        self._nSize += nSize
        if self._is_trained is False:
            # First fit: adopt the freshly built tables wholesale.
            self._parameter['freq_y'] = freq_y
            self._parameter['cond_freq_feat'] = cond_freq_feat
            self._parameter['proba_y'] = dict()
            self._parameter['cond_proba_feat'] = {
                k: {i: defaultdict(float)
                    for i in range(X.shape[1])}
                for k in self._parameter['cond_freq_feat'].keys()
            }
        else:
            # Incremental update: merge the batch counts into the stored
            # tables.  The `- 1` strips the smoothing seed from the batch
            # table so the add-one term is not applied twice.
            for c in freq_y.keys():
                self._parameter['freq_y'][c] += freq_y[c]
            for c in self._parameter['proba_y'].keys():
                for icol in self.feat_set.keys():
                    for feat_val in self.feat_set[icol]:
                        self._parameter['cond_freq_feat'][c][icol][
                            feat_val] += cond_freq_feat[c][icol][feat_val] - 1
        # Log prior: log(count(class) / total samples seen so far).
        self._parameter['proba_y'] = {
            k: np.log(float(v) / self._nSize)
            for k, v in self._parameter['freq_y'].iteritems()
        }
        # Smoothed log likelihood:
        # log(count(value, class) / (count(class) + #values in column)).
        for c, feats in self._parameter['cond_freq_feat'].iteritems():
            for icol, feat in feats.iteritems():
                for feat_val in feat.keys():
                    self._parameter['cond_proba_feat'][c][icol][
                        feat_val] = np.log(
                            float(feat[feat_val]) /
                            (self._parameter['freq_y'][c] +
                             len(self.feat_set[icol])))

        self._is_trained = True
Example no. 9
0
 def predict(self, X):
     """Predict one class label per row of X by K-nearest-neighbour vote.

     Two search back-ends are supported, selected by ``self._search_mode``:
     'kd_tree' queries the pre-built kd-tree stored in the parameters,
     'brutal' computes the distance to every stored training point.

     Returns a list with one predicted label per input row.
     Raises ValueError for an unknown search mode.
     """
     assert self._is_trained, 'model must be trained before predict.'
     pred = list()
     if self._search_mode == 'kd_tree':
         kd_tree = self._parameter['kd_tree']
         # Cannot ask for more neighbours than points stored in the tree.
         K = min(self._K, self._parameter['kd_tree'].nSize)
         for i in xrange(X.shape[0]):
             neighbor = kd_tree.search(kd_tree.root, X[i, :], K)
             # NOTE(review): FreqDict(..., reverse=True) presumably orders
             # keys by descending frequency, so keys()[0] is the majority
             # label -- confirm against FreqDict's definition.
             fd = FreqDict([v.y for v in neighbor], reverse=True)
             pred.append(fd.keys()[0])
             self._logger.info(
                 'progress : %.2f %%\tsearch ratio : %f' %
                 (float(i) / X.shape[0] * 100, kd_tree.get_search_ratio()))
     elif self._search_mode == 'brutal':
         K = min(self._K, len(self._parameter['neighbor_y']))
         for i in xrange(X.shape[0]):
             # Euclidean distance from X[i] to every stored training row.
             dist = list()
             for irow in range(self._parameter['neighbor_X'].shape[0]):
                 dist.append(
                     np.linalg.norm(X[i, :] -
                                    self._parameter['neighbor_X'][irow, :]))
             # Indices of the K closest training rows.
             indices = np.argsort(dist)[:K]
             fd = FreqDict(list(self._parameter['neighbor_y'][indices]),
                           reverse=True)
             pred.append(fd.keys()[0])
             self._logger.info('progress: %.2f %%' %
                               (float(i) / X.shape[0] * 100))
     else:
         raise ValueError
     return pred