Beispiel #1
0
    def cross_validate(self, data=None, num_folds=10):
        """
        Run N-fold cross-validation and report the results.

        For every (train, test) index pair produced by ``folds_indexes`` the
        model is trained on the training rows and evaluated on the test rows.
        Class labels are read from the index of ``data`` and the feature
        matrix from its values, so ``data`` is expected to be a pandas-like
        object exposing ``.iloc``, ``.index`` and ``.values``.

        :param data: dataset to cross-validate on; ``self.data`` is used
            when omitted.
        :param num_folds: number of folds to split the dataset into.
        :raises ValueError: if ``data`` has fewer rows than ``num_folds``.
        :return: ``self``.

        Side effects: prints the per-fold accuracy, the mean accuracy, the
        confusion-matrix statistics and the confusion matrix itself; resets
        ``self.svm_units`` to an empty list after every fold (including the
        last one); saves the confusion matrix under ``../experiments/``.
        """
        data = self.data if data is None else data
        if num_folds > len(data):
            message = 'Not enough data to make {} folds'.format(num_folds)
            raise ValueError(message)

        splits = folds_indexes(data, num_folds)
        fold_scores = []
        # the confusion matrix is built over every distinct label in the index
        all_labels = np.unique(data.index.values)
        matrix = ConfusionMatrix(all_labels)

        for fold_no, split in enumerate(splits):
            train_rows, held_out_rows = split
            print('Fold {}'.format(fold_no))

            # slice both subsets before training, using the fold's indexes
            fit_subset = data.iloc[train_rows]
            eval_subset = data.iloc[held_out_rows]
            self.train(fit_subset)

            # score the freshly trained model on the held-out fold
            predicted = self.predict(eval_subset.values)
            expected = eval_subset.index.values
            fold_score = accuracy_score(predicted, expected)
            matrix.update(predicted, expected)
            fold_scores.append(fold_score)

            # drop the trained units so the next fold starts from scratch
            self.svm_units = []
            print(fold_score)

        mean_score = np.mean(fold_scores)
        print('Mean Accuracy : {}'.format(mean_score))
        print('***** Detailed Summary *****')
        print(matrix.statistics())
        print('***** Confusion Matrix *****')
        print(matrix)
        matrix.save(folder='../experiments/')
        return self