Example #1
import abc

import numpy as np
from sklearn.model_selection import ParameterGrid, StratifiedKFold

# split_dataset, class_accuracy and avg_class_accuracy are helper
# functions defined elsewhere in the module this snippet comes from.


class classifier(metaclass=abc.ABCMeta):
    """
    an abstract base class that models a classifier
    """

    def __init__(self, param_grid=None, n_folds=None,
                 n_class_samples=None, n_test_samples=None, n_tests=1, name="classifier"):
        self.name = name
        self.param_grid = param_grid
        self.best_param_set = None
        self.n_folds = n_folds
        # the number of validation or test samples per class
        self.n_test_samples = n_test_samples
        # the number of training samples per class
        self.n_class_samples = n_class_samples
        self.n_tests = n_tests

    def fit(self, X, y):
        self.__call__(X, y)

    def __call__(self, X, y):
        """
        given a dataset X,y we split it, in order to do cross validation,
        according to the procedure explained below:
        if n_folds is not None, then we do cross validation
        based on stratified folds
        if n_class_samples is not None, then we do cross validation
        using only <n_class_samples> training samples per class
        if n_test_samples is not None, then we do cross validation
        using only <n_test_samples> cross validaition samples per class
        assumes that each datapoint is in a column of X
        """
        n_classes = len(set(y))
        if self.n_folds is not None:
            # generate stratified folds; StratifiedKFold.split expects
            # samples in rows, hence the transpose
            skf = StratifiedKFold(n_splits=self.n_folds, shuffle=False)
            self.folds = list(skf.split(X.T, y))

        elif self.n_class_samples is not None:
            # expand scalar per-class sample counts into one count per class
            if not isinstance(self.n_class_samples, list):
                self.n_class_samples = (np.ones(n_classes) *
                                        self.n_class_samples).astype(int)
            if self.n_test_samples is not None:
                self.n_test_samples = (np.ones(n_classes) *
                                       self.n_test_samples).astype(int)

            self.folds = []
            for i in range(self.n_tests):
                # draw a fresh random train/test split for each test run
                train_idx, test_idx = split_dataset(self.n_class_samples,
                                                    self.n_test_samples, y)
                self.folds.append((train_idx, test_idx))

        self.cross_validate(X, y)

    def cross_validate(self, X, y):

        print("fitting {} to the training set".format(self.name))
        if self.param_grid is not None:
            param_sets = list(ParameterGrid(self.param_grid))
            param_scores = []
            for param_set in param_sets:

                print("--------------")
                print("training the classifier...")
                print("parameter set:")
                for k, v in param_set.items():
                    print("{}: {}".format(k, v))

                param_score = self.evaluate(X, y, param_set=param_set)
                param_scores.append(param_score)

            # keep the parameter set with the highest cross-validation score
            p = int(np.argmax(param_scores))
            self.best_param_set = param_sets[p]
            print("best parameter set:", self.best_param_set)
            print("best score:", param_scores[p])
        else:
            self.evaluate(X, y)

    def evaluate(self, X, y, param_set=None):
        """
        evaluate the performance of the classifier
        trained with the parameters in <param_set>
        """
        cv_scores = []
        for train_index, test_index in self.folds:
            # data points are columns, so index the columns of X
            X_train, X_test = X[:, train_index], X[:, test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.train(X_train, y_train, param_set=param_set)

            y_pred = np.array(self.predict(X_test))
            cv_scores.append(class_accuracy(y_pred, y_test))
            print("average class accuracy:",
                  avg_class_accuracy(y_pred, y_test))

        avg_cv_score = np.mean(cv_scores)
        print("accuracy:", avg_cv_score)
        return avg_cv_score

    @abc.abstractmethod
    def train(self, X_train, y_train, param_set=None):
        """train the classifier"""
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, X_test):
        """predict labels in X_test"""
        raise NotImplementedError
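
To make the abstract class concrete, a subclass only has to implement train and predict. Below is a minimal usage sketch: the nearest-centroid subclass, the toy data, and the stand-in implementations of class_accuracy and avg_class_accuracy are illustrative assumptions, not the original module's helpers (split_dataset is not needed for the k-fold path exercised here).

import numpy as np


def class_accuracy(y_pred, y_test):
    # assumed semantics: overall fraction of correctly predicted samples
    return np.mean(y_pred == y_test)


def avg_class_accuracy(y_pred, y_test):
    # assumed semantics: mean of per-class accuracies (macro-averaged recall)
    return np.mean([np.mean(y_pred[y_test == c] == c)
                    for c in np.unique(y_test)])


class nearest_centroid(classifier):
    """
    a toy concrete subclass: predicts the class whose training
    centroid is closest in Euclidean distance
    """

    def train(self, X_train, y_train, param_set=None):
        self.classes_ = np.unique(y_train)
        # one centroid per class; data points are columns of X_train
        self.centroids_ = np.stack(
            [X_train[:, y_train == c].mean(axis=1) for c in self.classes_])

    def predict(self, X_test):
        # pairwise distances, shape (n_classes, n_test_points)
        dists = np.linalg.norm(self.centroids_[:, :, None] -
                               X_test[None, :, :], axis=1)
        return self.classes_[np.argmin(dists, axis=0)]


# toy data: 40 two-dimensional points (one per column), two classes
rng = np.random.RandomState(0)
X = np.hstack([rng.randn(2, 20) - 2, rng.randn(2, 20) + 2])
y = np.array([0] * 20 + [1] * 20)

clf = nearest_centroid(n_folds=4, name="nearest centroid")
clf.fit(X, y)

Because fit delegates to __call__, the call above builds four stratified folds, trains the subclass on each training split, and prints the per-fold and average accuracies computed by the stand-in helpers.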