Ejemplo n.º 1
0
    def test_classifier_cv(self, classifier, name):
        """
        Performs cross-validation on the given classifier, one round per
        (train_index, test_index) pair in EnsembleLearner::folds.

        For every fold the classifier is fitted on the training slice of
        EnsembleLearner::trainData, evaluated on the test slice, and the
        resulting evaluation row is appended to EnsembleLearner::evalData.
        Feature importances are collected per fold under the name
        "<name>-CV-Fold-<fold number>".

        @param name {string} Name of the classifier or run
        @param classifier {sklearn Classifier} Classifier instance
        """
        folds = self.load_folds()
        if not folds:
            # Best-effort: only report the problem, as the original did.
            # NOTE(review): the loop below reads self.folds, not the value
            # returned here - presumably load_folds() also stores the folds
            # on the instance; confirm against load_folds().
            logger.error("Folds is None?")
        # Root nodes are not handled separately as the folds do not
        # distinguish between these.
        # All results (i.e. one evaluation row per fold) are collected in
        # self.evalData.
        logger.info("test_classifier: %s", classifier)
        for fold_num, (train_index, test_index) in enumerate(self.folds):
            logger.debug("Current fold: train_index: %s", train_index)
            logger.debug("Current fold: test_index: %s", test_index)
            classifier.fit(self.trainData[train_index],
                           self.trainLabels[train_index])
            pred = classifier.predict(self.trainData[test_index])

            fold_eval = evaluate.printStatsCoarseInt(
                self.trainLabels[test_index],
                pred)
            fold_eval = evaluate.ins(['Classifier', 'Mode', 'Fold'],
                                     [name, 'CV', fold_num],
                                     fold_eval)
            self.evalData.append(fold_eval)
            # Mangle the name to include mode and fold number. The fold
            # number is an int and must be converted explicitly; plain
            # string + int concatenation raises TypeError.
            self.get_feature_importances(
                name + '-CV-Fold-' + str(fold_num),
                classifier)
Ejemplo n.º 2
0
    def test_classifier_toClassify(self, classifier, name):
        """
        Trains classifier on EnsembleLearner::trainData and
        predicts labels for EnsembleLearner::testData.
        The predictions are applied to EnsembleLearner::toClassify and the
        resulting trees are written, one per line, to the location
        specified by EnsembleLearner::toClassifyOutFile.

        If EnsembleLearner::toClassifyIsTest is set, the labels
        in EnsembleLearner::testLabels are used to evaluate the
        predictions (once for the root nodes selected by
        EnsembleLearner::testRootInd, once for all nodes) and the merged
        evaluation row is appended to EnsembleLearner::evalData.

        @param name {string} Name of the classifier or run
        @param classifier {sklearn Classifier} Classifier instance
        """
        logger.info("Started classifying file")
        logger.debug("len(data): %s", len(self.trainData))
        logger.debug("len(labels): %s", len(self.trainLabels))
        classifier.fit(self.trainData, self.trainLabels)
        self.get_feature_importances(name + '-Single', classifier)
        pred = classifier.predict(self.testData)
        # Build the trees before touching the output file so that a failure
        # here does not truncate an already existing file.
        trees = self.apply_prediction_to_tree(
            self.toClassify, pred.tolist())
        # The context manager closes the handle even if writing fails.
        with open(self.toClassifyOutFile, "w") as out:
            for tree in trees:
                # NOTE(review): writing UTF-8 encoded bytes to a text-mode
                # handle presumably targets Python 2 - confirm before
                # running this on Python 3.
                line = tree.pprint(margin=999999, nodesep='')
                out.write(line.encode('utf-8'))
                out.write('\n')
        logger.info("Finished classifying file")
        if self.toClassifyIsTest:
            # Evaluate the root nodes separately, then merge their stats
            # into the evaluation over all nodes.
            rootPred = numpy.asarray(pred)[self.testRootInd]
            rootGold = numpy.asarray(self.testLabels)[self.testRootInd]
            rootEval = evaluate.printStatsCoarseInt(rootGold,
                                                    rootPred,
                                                    "root")
            allEval = evaluate.printStatsCoarseInt(self.testLabels,
                                                   pred)
            allEval.update(rootEval)
            allEval = evaluate.ins(['Classifier', 'Mode', 'Fold'],
                                   [name, 'Single', 'N/A'],
                                   allEval)
            self.evalData.append(allEval)