def _tree_prune(self, tree, Xval, yval):
    """Reduced-error pruning of the fitted decision tree, in place.

    Walks ``tree`` bottom-up: children are pruned first, then each internal
    branch is tentatively collapsed to its '__default__' leaf, and the
    collapse is kept only when accuracy on the validation set strictly
    improves (scored via ``self.predict`` over ``self._parameter['tree']``).

    Parameters
    ----------
    tree : dict
        (Sub-)tree to prune; any non-dict value is a leaf and is skipped.
    Xval : validation features fed to ``self.predict``.
    yval : validation labels compared against the predictions.
    """
    if not isinstance(tree, dict):
        return
    feat = tree.keys()[0]
    for feat_val in tree[feat].keys():
        if not isinstance(tree[feat][feat_val], dict):
            continue
        subtree = tree[feat][feat_val]
        # Bugfix: prune the children BEFORE judging this node.  The original
        # recursed after the prune decision, so an accepted prune left it
        # re-walking a subtree already detached from self._parameter['tree']
        # -- predictions could not change there, making the pass pure waste.
        self._tree_prune(subtree, Xval, yval)
        pred_no_prone = self.predict(Xval)
        perf_no_prone = accuracy_score(yval, pred_no_prone)
        num_leaf_no_prone = self.get_num_leafs(self._parameter['tree'])
        # Tentatively collapse this branch to its '__default__' leaf.
        tree[feat][feat_val] = subtree[subtree.keys()[0]]['__default__']
        pred_with_prone = self.predict(Xval)
        perf_with_prone = accuracy_score(yval, pred_with_prone)
        num_leaf_with_prone = self.get_num_leafs(self._parameter['tree'])
        if perf_no_prone < perf_with_prone:
            improve = perf_with_prone - perf_no_prone
            # improve /= num_leaf_no_prone - num_leaf_with_prone - 1
            self._logger.info(
                'tree prune, validation precision improve %f' % (improve))
        else:
            # Prune hurt (or tied): restore the original subtree.
            tree[feat][feat_val] = subtree
def _tree_prune(self, tree, X, y):
    """Post-prune ``tree`` in place, keeping a collapse only when it
    strictly improves accuracy on the validation pair (X, y).

    Leaves (non-dict nodes) are left untouched; dict branches are pruned
    bottom-up via recursion before this node itself is evaluated.
    """
    if not isinstance(tree, dict):
        return  # leaf: nothing to prune
    feat = tree.keys()[0]
    children = tree[feat]
    for feat_val in children.keys():
        branch = children[feat_val]
        if not isinstance(branch, dict):
            continue
        # Prune the branch bottom-up before judging it at this level.
        self._tree_prune(branch, X, y)
        baseline = accuracy_score(y, self.predict(X))
        # Tentatively replace the branch with its '__default__' leaf.
        children[feat_val] = branch[branch.keys()[0]]['__default__']
        pruned_perf = accuracy_score(y, self.predict(X))
        if baseline < pruned_perf:
            logger.info('tree prune, validation precision improve %f'
                        % (pruned_perf - baseline))
        else:
            # No strict gain: put the original branch back.
            children[feat_val] = branch
def _tree_prune(self, tree, Xval, yval):
    """Prune the decision tree bottom-up using reduced-error pruning.

    Each internal branch is replaced by its majority-class '__default__'
    leaf and the replacement is kept only if validation accuracy strictly
    improves; otherwise the branch is restored.

    Parameters
    ----------
    tree : dict
        (Sub-)tree to prune in place; non-dict values are leaves.
    Xval, yval : validation features / labels used to score candidates.
    """
    if not isinstance(tree, dict):
        return
    feat = tree.keys()[0]
    for feat_val in tree[feat].keys():
        subtree = tree[feat][feat_val]
        if not isinstance(subtree, dict):
            continue
        # Fix: descend first (bottom-up).  The original recursed after the
        # prune decision, so when a prune was accepted it kept walking a
        # subtree already detached from self._parameter['tree'] -- every
        # comparison there tied and the pass was wasted computation.
        self._tree_prune(subtree, Xval, yval)
        perf_no_prone = accuracy_score(yval, self.predict(Xval))
        num_leaf_no_prone = self.get_num_leafs(self._parameter['tree'])
        # Try collapsing the branch to its '__default__' leaf.
        tree[feat][feat_val] = subtree[subtree.keys()[0]]['__default__']
        perf_with_prone = accuracy_score(yval, self.predict(Xval))
        num_leaf_with_prone = self.get_num_leafs(self._parameter['tree'])
        if perf_no_prone < perf_with_prone:
            improve = perf_with_prone - perf_no_prone
            # improve /= num_leaf_no_prone - num_leaf_with_prone - 1
            self._logger.info(
                'tree prune, validation precision improve %f' % (improve))
        else:
            # Validation accuracy did not improve: undo the collapse.
            tree[feat][feat_val] = subtree
word = sentence[pivot] feat = {} feat['w'] = word for i in xrange(1, self.windows + 1): if pivot - i < 0: feat['w-%d' % i] = self.start_symbol else: feat['w-%d' % i] = sentence[pivot - i] for i in xrange(1, self.windows + 1): if pivot + i >= len(sentence): feat['w+%d' % i] = self.end_symbol else: feat['w+%d' % i] = sentence[pivot + i] features.append(feat) return features if __name__ == '__main__': data = treebank.tagged_sents()[:200] trainset, testset = data[:180], data[180:] model = MaxentTagger(max_iter=20, min_freq=3) model.fit(trainset) label = [] for tagged_sent in testset: label.extend([v[1] for v in tagged_sent]) X = [] for tagged_sent in testset: X.append([v[0] for v in tagged_sent]) pred = model.predict(X) print 'test accuracy:', accuracy_score(label, pred)
self.plotMidText((self.xOff, self.yOff,), cntrPt, str(key)) self.yOff += 1.0 / self.totalD def createPlot(self): inTree = self._parameter['tree'] fig = plt.figure(1, facecolor='white') fig.clf() axprops = dict(xticks=[], yticks=[]) self.decisionNode = dict(boxstyle='sawtooth', fc='0.8') self.leafNone = dict(boxstyle='round4', fc='0.8') self.arrow_args = dict(arrowstyle='<-') self.ax1 = plt.subplot(111, frameon=False, **axprops) self.totalW = float(self.get_num_leafs(inTree)) self.totalD = float(self.get_tree_depth(inTree)) self.xOff = -0.5 / self.totalW self.yOff = 1.0 self.plotTree(inTree, (0.5, 1.0), '') plt.show() if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() dt = DecisionTreeClassifier(min_split=1, is_prune=False) dt.fit(trainset[0], trainset[1]) predict = dt.predict(testset[0]) performance = accuracy_score(testset[1], predict) print 'test accuracy:', performance
scheduler = TimeScheduler() # KNN for classification task path = os.getcwd() + '/../dataset/electricity-normalized.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() knn = KNNClassifier(search_mode='kd_tree') knn.fit(trainset[0], trainset[1]) predict_kd_tree = scheduler.tic_tac('kd_tree', knn.predict, X=testset[0]) knn = KNNClassifier(search_mode='brutal') knn.fit(trainset[0], trainset[1]) predict_brutal = scheduler.tic_tac('brutal', knn.predict, X=testset[0]) scheduler.print_task_schedule('brutal') scheduler.print_task_schedule('kd_tree') print accuracy_score(testset[1], predict_brutal), accuracy_score(testset[1], predict_kd_tree) # KNN for regression task # path = os.getcwd() + '/../dataset/winequality-white.csv' # loader = DataLoader(path) # dataset = loader.load(target_col_name='quality') # trainset, testset = dataset.cross_split() # knn = KNNRegressor(search_mode='brutal') # knn.fit(trainset[0], trainset[1]) # predict_brutal = scheduler.tic_tac('brutal', knn.predict, X=testset[0]) # knn = KNNRegressor(search_mode='kd_tree') # knn.fit(trainset[0], trainset[1]) # predict_kd_tree = scheduler.tic_tac('kd_tree', knn.predict, X=testset[0]) # scheduler.print_task_schedule('brutal') # scheduler.print_task_schedule('kd_tree') # print mean_error(testset[1], predict_brutal), mean_error(testset[1], predict_kd_tree)
assert self._is_trained, 'model must be trained before predict.' nSize = X.shape[0] if len(X.shape) == 2 else 1 pred = [np.zeros(self._nClass) for i in xrange(nSize)] for model, col_sample_ix in zip(self._parameter['forest'], self._parameter['col_sample_ix']): proba = model.predict_proba(X[:, col_sample_ix]) for i in xrange(nSize): ix = np.argmax(proba[i]) pred[i][ix] += proba[i][ix] return np.array(pred) def predict(self, X): pred_proba = self.predict_proba(X) pred = np.argmax(pred_proba, axis=1) pred = [self._class_label[i] for i in pred] return np.array(pred) if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() dt = DecisionTreeClassifier() dt.fit(trainset[0], trainset[1]) predict = dt.predict(testset[0]) print accuracy_score(testset[1], predict) rf = RandomForest(100, 0.9) rf.fit(trainset[0], trainset[1]) predict = rf.predict(testset[0]) print accuracy_score(testset[1], predict)
for irow in range(X.shape[0]): _X = X[irow] max_prob = None label = None for c in proba_y.keys(): p = proba_y[c] for icol, feat in cond_proba_y[c].iteritems(): p += feat[_X[icol]] if max_prob < p or max_prob is None: max_prob = p label = c assert label is not None, 'label should be None. There must be some error. please check.' pred.append(label) return np.array(pred) if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() nb = NaiveBayes() nb.fit(trainset[0], trainset[1]) predict = nb.predict(testset[0]) acc = accuracy_score(testset[1], predict) print acc # nb.dump('NB.model') # nb = NaiveBayes.load('NB.model') # predict = nb.predict(testset[0]) # print accuracy_score(testset[1], predict)
for irow in range(X.shape[0]): _X = X[irow] max_prob = None label = None for c in proba_y.keys(): p = proba_y[c] for icol, feat in cond_proba_y[c].iteritems(): p += feat[_X[icol]] if max_prob < p or max_prob is None: max_prob = p label = c assert label is not None, 'label should be None. There must be some error. please check.' pred.append(label) return np.array(pred) if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() nb = NaiveBayes() nb.fit(trainset[0], trainset[1]) predict = nb.predict(testset[0]) acc = accuracy_score(testset[1], predict) print acc nb.dump('NB.model') # nb = NaiveBayes.load('NB.model') # predict = nb.predict(testset[0]) # print accuracy_score(testset[1], predict)
return True else: is_valid = False nFeat = X.shape[1] nClass = len(np.unique(y)) if nFeat == self._nFeat and nClass == self._nClass: is_valid = True return is_valid if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() nb = NaiveBayes() nb.fit(trainset[0], trainset[1]) p1 = nb.predict(testset[0]) print 'NaiveBayes accuracy:', accuracy_score(testset[1], p1) base_learner = NaiveBayes() ada = AdaBoost(base_learner, 100) ada.fit(trainset[0], trainset[1]) prediction = ada.predict(testset[0]) performance = accuracy_score(testset[1], prediction) print 'AdaBoost accuracy:', performance # ada.dump('ada.model') # ada = AdaBoost.load('ada.model') # prediction = ada.predict(testset[0]) # performance = accuracy_score(testset[1], prediction) # print performance
scheduler = TimeScheduler() # KNN for classification task path = os.getcwd() + '/../dataset/electricity-normalized.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() knn = KNNClassifier(search_mode='kd_tree') knn.fit(trainset[0], trainset[1]) predict_kd_tree = scheduler.tic_tac('kd_tree', knn.predict, X=testset[0]) knn = KNNClassifier(search_mode='brutal') knn.fit(trainset[0], trainset[1]) predict_brutal = scheduler.tic_tac('brutal', knn.predict, X=testset[0]) scheduler.print_task_schedule('brutal') scheduler.print_task_schedule('kd_tree') print accuracy_score(testset[1], predict_brutal), accuracy_score( testset[1], predict_kd_tree) # KNN for regression task # path = os.getcwd() + '/../dataset/winequality-white.csv' # loader = DataLoader(path) # dataset = loader.load(target_col_name='quality') # trainset, testset = dataset.cross_split() # knn = KNNRegressor(search_mode='brutal') # knn.fit(trainset[0], trainset[1]) # predict_brutal = scheduler.tic_tac('brutal', knn.predict, X=testset[0]) # knn = KNNRegressor(search_mode='kd_tree') # knn.fit(trainset[0], trainset[1]) # predict_kd_tree = scheduler.tic_tac('kd_tree', knn.predict, X=testset[0]) # scheduler.print_task_schedule('brutal') # scheduler.print_task_schedule('kd_tree') # print mean_error(testset[1], predict_brutal), mean_error(testset[1], predict_kd_tree)
nSize = X.shape[0] if len(X.shape) == 2 else 1 pred = [np.zeros(self._nClass) for i in xrange(nSize)] for model, col_sample_ix in zip(self._parameter['forest'], self._parameter['col_sample_ix']): proba = model.predict_proba(X[:, col_sample_ix]) for i in xrange(nSize): ix = np.argmax(proba[i]) pred[i][ix] += proba[i][ix] return np.array(pred) def predict(self, X): pred_proba = self.predict_proba(X) pred = np.argmax(pred_proba, axis=1) pred = [self._class_label[i] for i in pred] return np.array(pred) if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() dt = DecisionTreeClassifier() dt.fit(trainset[0], trainset[1]) predict = dt.predict(testset[0]) print accuracy_score(testset[1], predict) rf = RandomForest(100, 0.9) rf.fit(trainset[0], trainset[1]) predict = rf.predict(testset[0]) print accuracy_score(testset[1], predict)
pred[i] = max_label return np.array(pred) def __check_valid(self, X, y): if self._is_trained is False: return True else: is_valid = False nFeat = X.shape[1] nClass = len(np.unique(y)) if nFeat == self._nFeat and nClass == self._nClass: is_valid = True return is_valid if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() nb = DecisionTreeClassifier(is_prune=False) nb.fit(trainset[0], trainset[1]) p1 = nb.predict(testset[0]) print accuracy_score(testset[1], p1) nb = DecisionTreeClassifier(is_prune=False) ada = AdaBoost(nb, 100) ada.fit(trainset[0], trainset[1]) prediction = ada.predict(testset[0]) performance = accuracy_score(testset[1], prediction) print performance
), cntrPt, str(key)) self.yOff += 1.0 / self.totalD def createPlot(self): inTree = self._parameter['tree'] fig = plt.figure(1, facecolor='white') fig.clf() axprops = dict(xticks=[], yticks=[]) self.decisionNode = dict(boxstyle='sawtooth', fc='0.8') self.leafNone = dict(boxstyle='round4', fc='0.8') self.arrow_args = dict(arrowstyle='<-') self.ax1 = plt.subplot(111, frameon=False, **axprops) self.totalW = float(self.get_num_leafs(inTree)) self.totalD = float(self.get_tree_depth(inTree)) self.xOff = -0.5 / self.totalW self.yOff = 1.0 self.plotTree(inTree, (0.5, 1.0), '') plt.show() if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() dt = DecisionTreeClassifier(min_split=1, is_prune=False) dt.fit(trainset[0], trainset[1]) predict = dt.predict(testset[0]) performance = accuracy_score(testset[1], predict) print 'test accuracy:', performance
logger.warning('feature number must be 2.') return logger.info('start plotting...') pred = self._predict(X) h = 0.02 # step size in the mesh x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = self._predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) plt.scatter(X[:, 0], X[:, 1], c=pred, cmap=plt.cm.Paired) plt.contour(xx, yy, Z, cmap=plt.cm.Paired) plt.show() logger = get_logger(SVM.__name__) if __name__ == '__main__': path = os.getcwd() + '/../dataset/iris.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='binaryClass') trainset, testset = dataset.cross_split() X = trainset[0][:, [0, 1]] y = trainset[1] svm = SVM(kernel_type='rbf', sigma=0.3) svm.fit(X, y) predict = svm.predict(testset[0][:, [0, 1]]) print 'test accuracy:', accuracy_score(testset[1], predict) svm.plot(X)
for i in xrange(nSize): ix = np.argmax(proba[i]) pred[i][ix] += proba[i][ix] return np.array(pred) def predict(self, X): pred_proba = self.predict_proba(X) pred = np.argmax(pred_proba, axis=1) pred = [self._class_label[i] for i in pred] return np.array(pred) if __name__ == '__main__': path = os.getcwd() + '/../dataset/dataset_21_car.arff' loader = DataLoader(path) dataset = loader.load(target_col_name='class') trainset, testset = dataset.cross_split() dt = DecisionTreeClassifier() dt.fit(trainset[0], trainset[1]) predict = dt.predict(testset[0]) print 'DecisionTree accuracy:', accuracy_score(testset[1], predict) rf = RandomForest(100, 0.9) rf.fit(trainset[0], trainset[1]) predict = rf.predict(testset[0]) print 'RandomForest accuracy', accuracy_score(testset[1], predict) # rf.dump('rf.model') # rf = RandomForest.load('rf.model') # predict = rf.predict(testset[0]) # print 'RandomForest accuracy', accuracy_score(testset[1], predict)