def _build_leaf(self, y):
    """Build a leaf node as a list of (label, count) pairs.

    Every label in self._class_label appears in the result; labels that
    do not occur in y get a count of 0, so leaves are uniform in shape.
    """
    freq = FreqDict(y)
    observed = freq.keys()
    return [(label, freq[label] if label in observed else 0)
            for label in self._class_label]
def predict(self, X):
    """Predict a class label for every row of X via K-nearest neighbors.

    Parameters
    ----------
    X : 2-D array of query points, one sample per row.

    Returns
    -------
    list of predicted labels, one per row of X.

    Raises
    ------
    AssertionError if the model has not been trained.
    ValueError if self._search_mode is neither 'kd_tree' nor 'brutal'.
    """
    assert self._is_trained, 'model must be trained before predict.'
    pred = list()
    if self._search_mode == 'kd_tree':
        kd_tree = self._parameter['kd_tree']
        # Never ask for more neighbors than the tree holds.
        K = min(self._K, self._parameter['kd_tree'].nSize)
        for i in xrange(X.shape[0]):
            neighbor = kd_tree.search(kd_tree.root, X[i, :], K)
            # Majority vote: FreqDict(..., reverse=True) appears to order
            # keys by descending frequency -- TODO confirm against FreqDict.
            fd = FreqDict([v.y for v in neighbor], reverse=True)
            pred.append(fd.keys()[0])
            self._logger.info(
                'progress : %.2f %%\tsearch ratio : %f' %
                (float(i) / X.shape[0] * 100, kd_tree.get_search_ratio()))
    elif self._search_mode == 'brutal':
        K = min(self._K, len(self._parameter['neighbor_y']))
        for i in xrange(X.shape[0]):
            # Vectorized Euclidean distance from the query to every stored
            # sample (replaces the original per-row Python loop).
            dist = np.linalg.norm(
                self._parameter['neighbor_X'] - X[i, :], axis=1)
            indices = np.argsort(dist)[:K]
            fd = FreqDict(list(self._parameter['neighbor_y'][indices]),
                          reverse=True)
            pred.append(fd.keys()[0])
            self._logger.info('progress: %.2f %%' %
                              (float(i) / X.shape[0] * 100))
    else:
        # Original raised a bare ValueError with no diagnostic message.
        raise ValueError('unknown search mode: %r' % self._search_mode)
    return pred
def entropy(x):
    """Return the Shannon entropy (in nats) of the label sequence x.

    Parameters
    ----------
    x : sequence of hashable labels (list, 1-D array, ...).

    Returns
    -------
    float entropy; 0.0 for an empty or single-valued sequence.
    """
    # stdlib Counter replaces the hand-rolled FreqDict frequency helper
    from collections import Counter
    nSize = len(x)
    if nSize == 0:
        # guard the division below; the original also yielded 0. here
        return 0.
    result = 0.
    for cnt in Counter(list(x)).values():
        prob = float(cnt) / nSize
        result += -prob * np.log(prob)
    return result
def condition_entropy(x, cond):
    """Conditional entropy H(x | cond).

    Partitions x by the values of cond and averages the entropy of each
    partition, weighted by that value's empirical probability.
    """
    assert x.shape == cond.shape, 'input is invalid.'
    total = len(x)
    # FreqDict iterates as (value, count) pairs elsewhere in this file.
    weights = {val: float(cnt) / total for val, cnt in FreqDict(list(cond))}
    result = 0.
    for val, w in weights.iteritems():
        result += w * entropy(x[cond == val])
    return result
def _build_tree(self, X, y, used_feat):
    """Recursively grow a decision tree (as a nested dict) over (X, y).

    Parameters
    ----------
    X : 2-D array of categorical feature values, one sample per row.
    y : 1-D array of class labels aligned with the rows of X.
    used_feat : set of feature column indices already consumed higher up
        the tree; this call works on a deep copy and never mutates it.

    Returns
    -------
    Either a leaf (whatever self._build_leaf returns) or a nested dict
    {feature_index: {feature_value: subtree, ..., '__default__': leaf}}.
    """
    # Emit a leaf when the node is pure (a single class remains) ...
    if len(np.unique(y)) == 1:
        # NOTE(review): _build_leaf wraps its argument in FreqDict again,
        # so passing FreqDict(y) double-wraps -- confirm FreqDict accepts
        # a FreqDict instance.
        return self._build_leaf(FreqDict(y))
    # ... or when only one feature column is left ...
    if X.shape[1] == 1:
        return self._build_leaf(FreqDict(y))
    # ... or when the node holds too few samples to be worth splitting.
    if len(y) < self._min_split:
        return self._build_leaf(FreqDict(y))
    # Private copy so sibling branches all see the caller's used_feat.
    _used_feat = copy.deepcopy(used_feat)
    choosed_feat = self._choose_feature(X, y, _used_feat)
    if choosed_feat is None:
        # No usable feature left to split on.
        return self._build_leaf(FreqDict(y))
    _used_feat.add(choosed_feat)
    root = {choosed_feat: {}}
    # Fallback leaf, presumably used at predict time for feature values
    # not observed at this node -- TODO confirm against the predict path.
    root[choosed_feat]['__default__'] = self._build_leaf(FreqDict(y))
    # One subtree per observed value of the chosen feature.
    for v in np.unique(X[:, choosed_feat]):
        indices = X[:, choosed_feat] == v
        root[choosed_feat][v] = self._build_tree(X[indices], y[indices],
                                                 _used_feat)
    return root
def fit(self, X, y):
    """Fit, or incrementally update, a naive Bayes model.

    Accumulates class frequencies and per-class conditional feature-value
    frequencies with add-one (Laplace) smoothing, then stores log
    probabilities in self._parameter. May be called repeatedly: later
    calls fold new counts into the stored ones.

    Parameters
    ----------
    X : 2-D array of categorical feature values, one sample per row.
    y : 1-D array of class labels aligned with the rows of X.
    """
    assert self.__check_valid(X, y), 'input is invalid.'
    if self._is_trained is False:
        # First fit: record feature count, class count, and the set of
        # observed values for every feature column.
        self._nFeat = X.shape[1]
        self._nClass = len(np.unique(y))
        self.feat_set = dict()
        for icol in range(X.shape[1]):
            self.feat_set[icol] = list(np.unique(X[:, icol]))
    nSize = X.shape[0]
    # NOTE: FreqDict iterates as (label, count) pairs (same convention
    # as condition_entropy in this file).
    freq_y = {k: v for k, v in FreqDict(list(y))}
    # cond_freq_feat[class][column][value] -> count of rows of that class
    # whose column holds that value.
    cond_freq_feat = {
        k: {i: defaultdict(int) for i in range(X.shape[1])}
        for k in freq_y.keys()
    }
    # Seed every cell with 1: add-one (Laplace) smoothing prior.
    for c in freq_y.keys():
        for icol in self.feat_set.keys():
            for feat_val in self.feat_set[icol]:
                cond_freq_feat[c][icol][feat_val] = 1
    for irow in range(X.shape[0]):
        for icol in range(X.shape[1]):
            cond_freq_feat[y[irow]][icol][X[irow, icol]] += 1
    self._nSize += nSize
    if self._is_trained is False:
        self._parameter['freq_y'] = freq_y
        self._parameter['cond_freq_feat'] = cond_freq_feat
        self._parameter['proba_y'] = dict()
        self._parameter['cond_proba_feat'] = {
            k: {i: defaultdict(float) for i in range(X.shape[1])}
            for k in self._parameter['cond_freq_feat'].keys()
        }
    else:
        # Incremental update: fold the new counts into the stored ones.
        # NOTE(review): a class label unseen in earlier fits would raise
        # KeyError here -- confirm callers never introduce new classes.
        for c in freq_y.keys():
            self._parameter['freq_y'][c] += freq_y[c]
        for c in self._parameter['proba_y'].keys():
            for icol in self.feat_set.keys():
                for feat_val in self.feat_set[icol]:
                    # Subtract 1 so the smoothing prior seeded into this
                    # batch's cond_freq_feat is not counted a second time.
                    self._parameter['cond_freq_feat'][c][icol][
                        feat_val] += cond_freq_feat[c][icol][feat_val] - 1
    # Class log-priors: log(count / total samples seen so far).
    self._parameter['proba_y'] = {
        k: np.log(float(v) / self._nSize)
        for k, v in self._parameter['freq_y'].iteritems()
    }
    # Conditional log-likelihoods with the smoothed denominator
    # (class count + number of distinct values of the feature column).
    for c, feats in self._parameter['cond_freq_feat'].iteritems():
        for icol, feat in feats.iteritems():
            for feat_val in feat.keys():
                self._parameter['cond_proba_feat'][c][icol][
                    feat_val] = np.log(
                        float(feat[feat_val]) /
                        (self._parameter['freq_y'][c] +
                         len(self.feat_set[icol])))
    self._is_trained = True
def predict(self, X):
    """Predict a class label for every row of X via K-nearest neighbors.

    Parameters
    ----------
    X : 2-D array of query points, one sample per row.

    Returns
    -------
    list of predicted labels, one per row of X.

    Raises
    ------
    AssertionError if the model has not been trained.
    ValueError if self._search_mode is neither 'kd_tree' nor 'brutal'.
    """
    assert self._is_trained, 'model must be trained before predict.'
    pred = list()
    if self._search_mode == 'kd_tree':
        kd_tree = self._parameter['kd_tree']
        # Never ask for more neighbors than the tree holds.
        K = min(self._K, self._parameter['kd_tree'].nSize)
        for i in xrange(X.shape[0]):
            neighbor = kd_tree.search(kd_tree.root, X[i, :], K)
            # Majority vote: FreqDict(..., reverse=True) appears to order
            # keys by descending frequency -- TODO confirm against FreqDict.
            fd = FreqDict([v.y for v in neighbor], reverse=True)
            pred.append(fd.keys()[0])
            self._logger.info(
                'progress : %.2f %%\tsearch ratio : %f' %
                (float(i) / X.shape[0] * 100, kd_tree.get_search_ratio()))
    elif self._search_mode == 'brutal':
        K = min(self._K, len(self._parameter['neighbor_y']))
        for i in xrange(X.shape[0]):
            # Vectorized Euclidean distance from the query to every stored
            # sample (replaces the original per-row Python loop).
            dist = np.linalg.norm(
                self._parameter['neighbor_X'] - X[i, :], axis=1)
            indices = np.argsort(dist)[:K]
            fd = FreqDict(list(self._parameter['neighbor_y'][indices]),
                          reverse=True)
            pred.append(fd.keys()[0])
            self._logger.info('progress: %.2f %%' %
                              (float(i) / X.shape[0] * 100))
    else:
        # Original raised a bare ValueError with no diagnostic message.
        raise ValueError('unknown search mode: %r' % self._search_mode)
    return pred