            self.plotMidText((self.xOff, self.yOff), cntrPt, str(key))
        self.yOff += 1.0 / self.totalD

    def createPlot(self):
        inTree = self._parameter['tree']
        fig = plt.figure(1, facecolor='white')
        fig.clf()
        axprops = dict(xticks=[], yticks=[])
        self.decisionNode = dict(boxstyle='sawtooth', fc='0.8')
        self.leafNone = dict(boxstyle='round4', fc='0.8')
        self.arrow_args = dict(arrowstyle='<-')
        self.ax1 = plt.subplot(111, frameon=False, **axprops)
        self.totalW = float(self.get_num_leafs(inTree))
        self.totalD = float(self.get_tree_depth(inTree))
        self.xOff = -0.5 / self.totalW
        self.yOff = 1.0
        self.plotTree(inTree, (0.5, 1.0), '')
        plt.show()


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()

    dt = DecisionTreeClassifier(min_split=1, is_prune=False)
    dt.fit(trainset[0], trainset[1])
    predict = dt.predict(testset[0])
    performance = accuracy_score(testset[1], predict)
    print 'test accuracy:', performance
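# The style dicts set up in createPlot (decisionNode, leafNone, arrow_args) are
# consumed by the node-drawing helper that plotTree calls, which is not shown in
# this excerpt. A minimal sketch of such a helper follows; the name plotNode and
# its signature are illustrative assumptions, not necessarily what this class
# actually defines.
def plotNode(ax, nodeTxt, centerPt, parentPt, nodeType, arrow_args):
    # Draw one boxed node at centerPt with an arrow from parentPt, using
    # matplotlib's annotate in axes-fraction coordinates.
    ax.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                xytext=centerPt, textcoords='axes fraction',
                va='center', ha='center', bbox=nodeType, arrowprops=arrow_args)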
        info = self._S
        if cumulative is False:
            return info
        else:
            return np.cumsum(info)

    def plot(self, X):
        nSize = X.shape[0]
        nFeat = X.shape[1]
        assert nFeat == 2, 'feature number should be 2.'
        for i in xrange(nSize):
            plt.plot(X[i, 0], X[i, 1], 'or')
        plt.show()


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/iris.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='binaryClass')
    trainset, testset = dataset.cross_split()

    pca = PCA(2, whiten=False)
    X = trainset[0][:, [0, 2]]
    pca.fit(X)
    _X = pca.transform(X)
    print 'eigenvalue:', pca.information_distribution(percent=True)
    pca.plot(_X)

    pca = PCA(2, whiten=True)
    pca.fit(X)
    _X = pca.transform(X)
    pca.plot(_X)
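# The tail of information_distribution above returns either the raw eigenvalue
# spectrum or its cumulative sum. The cumulative form is typically used to decide
# how many components to keep; a small sketch with plain numpy, independent of
# this PCA class (the 0.95 threshold is only an example value):
def n_components_for(eigenvalues, threshold=0.95):
    # Smallest k such that the first k eigenvalues (assumed sorted in decreasing
    # order) explain at least `threshold` of the total variance.
    ratios = np.cumsum(eigenvalues) / np.sum(eigenvalues)
    return int(np.searchsorted(ratios, threshold)) + 1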
logger.info("progress: %.2f %%" % (float(i) / X.shape[0] * 100)) else: raise ValueError return pred logger = get_logger("KNN") if __name__ == "__main__": from base.time_scheduler import TimeScheduler scheduler = TimeScheduler() # KNN for classification task path = os.getcwd() + "/../dataset/electricity-normalized.arff" loader = DataLoader(path) dataset = loader.load(target_col_name="class") trainset, testset = dataset.cross_split() knn = KNNClassifier(search_mode="kd_tree") knn.fit(trainset[0], trainset[1]) predict_kd_tree = scheduler.tic_tac("kd_tree", knn.predict, X=testset[0]) knn = KNNClassifier(search_mode="brutal") knn.fit(trainset[0], trainset[1]) predict_brutal = scheduler.tic_tac("brutal", knn.predict, X=testset[0]) scheduler.print_task_schedule("brutal") scheduler.print_task_schedule("kd_tree") print accuracy_score(testset[1], predict_brutal), accuracy_score(testset[1], predict_kd_tree) # KNN for regression task # path = os.getcwd() + '/../dataset/winequality-white.csv' # loader = DataLoader(path)
        for irow in range(X.shape[0]):
            _X = X[irow]
            max_prob = None
            label = None
            for c in proba_y.keys():
                p = proba_y[c]
                for icol, feat in cond_proba_y[c].iteritems():
                    p += feat[_X[icol]]
                if max_prob is None or p > max_prob:
                    max_prob = p
                    label = c
            assert label is not None, 'label should not be None. There must be some error, please check.'
            pred.append(label)
        return np.array(pred)


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/dataset_21_car.arff'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='class')
    trainset, testset = dataset.cross_split()

    nb = NaiveBayes()
    nb.fit(trainset[0], trainset[1])
    predict = nb.predict(testset[0])
    acc = accuracy_score(testset[1], predict)
    print acc

    nb.dump('NB.model')
    # nb = NaiveBayes.load('NB.model')
    # predict = nb.predict(testset[0])
    # print accuracy_score(testset[1], predict)
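# In the scoring loops above, p starts from the class prior term and adds one
# per-feature conditional term looked up by raw feature value; since the terms
# are summed rather than multiplied, they are presumably log-probabilities.
# Looking a term up with feat[_X[icol]] raises a KeyError for a feature value
# never seen in training; a common guard is a smoothed fallback. Sketch only
# (proba_y and cond_proba_y as above, unseen_log_prob is a hypothetical constant):
def score_class(x, c, proba_y, cond_proba_y, unseen_log_prob=-20.0):
    # Score one sample x against class c, falling back to unseen_log_prob for
    # feature values that have no entry in the fitted conditional table.
    p = proba_y[c]
    for icol, feat in cond_proba_y[c].iteritems():
        p += feat.get(x[icol], unseen_log_prob)
    return p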
        else:
            is_valid = False
            nFeat = X.shape[1]
            if nFeat == self._nFeat:
                is_valid = True
        return is_valid

    def predict(self, X):
        models = self._parameter['trees']
        pred = np.zeros(X.shape[0])
        for model in models:
            pred += np.array(model.predict(X))
        return pred


if __name__ == '__main__':
    path = os.getcwd() + '/../dataset/winequality-white.csv'
    loader = DataLoader(path)
    dataset = loader.load(target_col_name='quality')
    trainset, testset = dataset.cross_split()

    gbdt = GradientBoostingDecisionTree(10)
    gbdt.fit(trainset[0], trainset[1])
    predict = gbdt.predict(testset[0])
    print 'GBDT mean error:', mean_error(testset[1], predict)

    dt = DecisionTreeRegressor()
    dt.fit(trainset[0], trainset[1])
    predict = dt.predict(testset[0])
    print 'DecisionTree mean error:', mean_error(testset[1], predict)
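# predict above simply sums the outputs of all fitted trees. The fitting side is
# not part of this excerpt; the usual gradient-boosting construction for squared
# error trains each new tree on the residuals of the current ensemble, which is
# what makes the plain sum in predict the ensemble prediction. A minimal sketch
# under that assumption (fit_gbdt is an illustrative name, not the project's
# method; DecisionTreeRegressor is used as in the demo above):
def fit_gbdt(X, y, n_estimators):
    trees = []
    residual = np.array(y, dtype=float)
    for _ in xrange(n_estimators):
        tree = DecisionTreeRegressor()
        tree.fit(X, residual)                    # fit what the ensemble still gets wrong
        residual -= np.array(tree.predict(X))    # update residuals for the next round
        trees.append(tree)
    return trees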