def test_classifier_cv(self, classifier, name):
    """
    Performs cross-validation of the given classifier over the folds in
    self.folds (10-fold, according to the original description).

    For every (train_index, test_index) pair the classifier is fitted on
    the training slice of self.trainData / self.trainLabels, used to
    predict the test slice, and the evaluation row produced by
    evaluate.printStatsCoarseInt / evaluate.ins is appended to
    self.evalData. Feature importances are recorded per fold through
    self.get_feature_importances.

    @param name {string} Name of the classifier or run
    @param classifier {sklearn Classifier} Classifier instance
    """
    folds = self.load_folds()
    if not folds:
        # Best-effort: only report the problem; the loop below still runs on
        # whatever self.folds holds.
        logger.error("Folds is None?")
    # Does not handle root nodes separately as the folds do not distinguish
    # between these.
    # So what we do here: collect all results, i.e. all csv rows for all
    # folds, and print two files with 10 lines each.
    logger.info("test_classifier: %s", classifier)
    # NOTE(review): iterates self.folds rather than the value returned by
    # load_folds() -- presumably load_folds() also populates self.folds;
    # confirm against its implementation.
    for foldNum, (train_index, test_index) in enumerate(self.folds):
        logger.debug("Current fold: train_index: %s", train_index)
        logger.debug("Current fold: test_index: %s", test_index)
        classifier.fit(self.trainData[train_index],
                       self.trainLabels[train_index])
        pred = classifier.predict(self.trainData[test_index])
        allEval = evaluate.printStatsCoarseInt(
            self.trainLabels[test_index], pred)
        allEval = evaluate.ins(['Classifier', 'Mode', 'Fold'],
                               [name, 'CV', foldNum], allEval)
        self.evalData.append(allEval)
        # Mangle name to include other columns. foldNum is an int, so it
        # must be converted explicitly before string concatenation.
        self.get_feature_importances(
            name + '-CV-Fold-' + str(foldNum), classifier)
def test_classifier_toClassify(self, classifier, name):
    """
    Trains classifier on EnsembleLearner::trainData and predicts labels for
    EnsembleLearner::testData. The predictions are saved as a tree to the
    location specified by EnsembleLearner::toClassifyOutFile.

    If EnsembleLearner::toClassifyIsTest is set, the labels in
    EnsembleLearner::testLabels are used to evaluate the predictions, both
    over all nodes and over the nodes selected by
    EnsembleLearner::testRootInd; the combined evaluation row is appended
    to EnsembleLearner::evalData.

    @param name {string} Name of the classifier or run
    @param classifier {sklearn Classifier} Classifier instance
    """
    logger.info("Started classifying file")
    logger.debug("len(data): %s", len(self.trainData))
    logger.debug("len(labels): %s", len(self.trainLabels))
    classifier.fit(self.trainData, self.trainLabels)
    self.get_feature_importances(name + '-Single', classifier)
    pred = classifier.predict(self.testData)
    # The context manager guarantees the output file is closed even if
    # building or serialising a tree raises.
    with open(self.toClassifyOutFile, "w") as out:
        trees = self.apply_prediction_to_tree(
            self.toClassify, pred.tolist())
        for tree in trees:
            # NOTE(review): writing the utf-8 encoded result to a text-mode
            # file assumes Python 2 string semantics; under Python 3 this
            # write would raise TypeError -- confirm the target runtime.
            out.write(tree.pprint(margin=999999, nodesep='').encode('utf-8'))
            out.write('\n')
    logger.info("Finished classifying file")
    if self.toClassifyIsTest:
        # Evaluate the subset selected by testRootInd separately ("root")
        # and merge it into the evaluation over all predictions.
        rootPred = numpy.asarray(pred)[self.testRootInd]
        rootGold = numpy.asarray(self.testLabels)[self.testRootInd]
        rootEval = evaluate.printStatsCoarseInt(rootGold, rootPred, "root")
        allEval = evaluate.printStatsCoarseInt(self.testLabels, pred)
        allEval.update(rootEval)
        allEval = evaluate.ins(['Classifier', 'Mode', 'Fold'],
                               [name, 'Single', 'N/A'], allEval)
        self.evalData.append(allEval)