def test_mass_univariate_classification_gnb_2d(self):
    """Gaussian naive Bayes accuracy on a small two-feature problem.

    ``X`` is one flat record of twelve values; ``features`` and
    ``samples`` tag every entry with a feature index (1 or 2) and a
    sample index (1 to 6), and ``labels`` gives one class per sample.
    The classifier is asked to score several feature sets and the
    returned values are compared with the expected fractions.
    """
    X = array([-1, 1, -2, -1, -3, -2, 1, 1, 2, 1, 3, 2])
    features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
    samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
    labels = array([1, 1, 1, 2, 2, 2])
    params = {'labels': labels, 'features': features, 'samples': samples}
    clf = MassUnivariateClassifier.load(params, "gaussnaivebayes", cv=0)

    # single (key, value) record; list() materializes the pairs so the
    # call behaves the same whether zip returns a list or an iterator
    data = self.sc.parallelize(list(zip([1], [X])))

    # NOTE: the value is extracted with kv[1] rather than a
    # tuple-parameter lambda, which is a SyntaxError on Python 3

    # first feature predicts perfectly
    result = clf.classify(data, [[1]]).map(lambda kv: kv[1]).collect()
    assert_array_almost_equal(result[0], [1.0])

    # second feature gets one wrong
    result = clf.classify(data, [[2]]).map(lambda kv: kv[1]).collect()
    assert_array_almost_equal(result[0], [5.0 / 6.0])

    # two features together predict perfectly
    result = clf.classify(data, [[1, 2]]).map(lambda kv: kv[1]).collect()
    assert_array_almost_equal(result[0], [1.0])

    # test iteration over multiple feature sets
    result = clf.classify(data, [[1, 2], [2]]).map(lambda kv: kv[1]).collect()
    assert_array_almost_equal(result[0], [1.0, 5.0 / 6.0])
Beispiel #2
0
    def test_mass_univariate_classification_gnb_2d(self):
        """Gaussian naive Bayes accuracy on a small two-feature problem.

        ``X`` is one flat record of twelve values; ``features`` and
        ``samples`` tag every entry with a feature index (1 or 2) and a
        sample index (1 to 6), and ``labels`` gives one class per sample.
        Several feature sets are scored and compared with the expected
        fractions.
        """
        X = array([-1, 1, -2, -1, -3, -2, 1, 1, 2, 1, 3, 2])
        features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
        samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels,
                  'features': features,
                  'samples': samples}
        clf = MassUnivariateClassifier.load(params, "gaussnaivebayes", cv=0)

        # single (key, value) record; list() materializes the pairs so the
        # call behaves the same whether zip returns a list or an iterator
        data = self.sc.parallelize(list(zip([1], [X])))

        # NOTE: values are pulled out with kv[1]; the former
        # tuple-parameter lambda does not parse on Python 3

        # first feature predicts perfectly
        result = clf.classify(data, [[1]]).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [1.0])

        # second feature gets one wrong
        result = clf.classify(data, [[2]]).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [5.0 / 6.0])

        # two features together predict perfectly
        result = clf.classify(data, [[1, 2]]).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [1.0])

        # test iteration over multiple feature sets
        result = clf.classify(data,
                              [[1, 2], [2]]).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [1.0, 5.0 / 6.0])
Beispiel #3
0
    def test_mass_univariate_classification_ttest_2d(self):
        """T-test classifier output on a two-feature problem matches scipy.

        ``X`` is one flat record of twelve values; ``features`` and
        ``samples`` tag every entry with a feature index (1 or 2) and a
        sample index (1 to 6), and ``labels`` gives one class per sample.
        The reference statistic is computed with ``scipy.stats.ttest_ind``
        on the first three versus the last three samples of each feature.
        """
        X = array([-1, -2, -0.1, -2, -0.1, -2.1, 1, 1.1, 1, 1, 1.1, 2])
        features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
        samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels,
                  'features': features,
                  'samples': samples}

        clf = MassUnivariateClassifier.load(params, "ttest")

        # should match direct calculation using scipy

        # single (key, value) record; list() materializes the pairs so the
        # call behaves the same whether zip returns a list or an iterator
        data = self.sc.parallelize(list(zip([1], [X])))

        # values of each feature, in sample order
        first = X[features == 1]
        second = X[features == 2]

        # test first feature only
        # (kv[1] replaces a tuple-parameter lambda, invalid on Python 3)
        result = clf.classify(data, [[1]]).map(lambda kv: kv[1]).collect()
        ground_truth = ttest_ind(first[:3], first[3:])
        # element 0 of scipy's result is the t statistic
        assert_array_almost_equal(result[0], ground_truth[0])

        # test both features
        result = clf.classify(data, [[1, 2]]).map(lambda kv: kv[1]).collect()
        ground_truth = ttest_ind(
            vstack((first[:3], second[:3])).T,
            vstack((first[3:], second[3:])).T)
        assert_array_almost_equal(result[0][0], ground_truth[0])
    def test_mass_univariate_classification_ttest_1d(self):
        """T-test classifier output on a one-feature problem matches scipy.

        ``X`` holds six values and ``labels`` assigns each of them to
        class 1 or 2; the reference statistic is computed directly with
        ``scipy.stats.ttest_ind`` on the two label groups.
        """
        X = array([-1, -0.1, -0.1, 1, 1, 1.1])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels}

        clf = MassUnivariateClassifier.load(params, "ttest")

        # should match direct calculation using scipy
        # single (key, value) record; list() materializes the pairs so the
        # call behaves the same whether zip returns a list or an iterator
        data = self.sc.parallelize(list(zip([1], [X])))
        # kv[1] replaces a tuple-parameter lambda, invalid on Python 3
        result = clf.classify(data).map(lambda kv: kv[1]).collect()
        ground_truth = ttest_ind(X[labels == 1], X[labels == 2])
        # element 0 of scipy's result is the t statistic
        assert_array_almost_equal(result[0], ground_truth[0])
Beispiel #5
0
    def test_mass_univariate_classification_ttest_1d(self):
        """T-test classifier output on a one-feature problem matches scipy.

        ``X`` holds six values and ``labels`` assigns each of them to
        class 1 or 2; the reference statistic is computed directly with
        ``scipy.stats.ttest_ind`` on the two label groups.
        """
        X = array([-1, -0.1, -0.1, 1, 1, 1.1])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels}

        clf = MassUnivariateClassifier.load(params, "ttest")

        # should match direct calculation using scipy
        # single (key, value) record; list() materializes the pairs so the
        # call behaves the same whether zip returns a list or an iterator
        data = self.sc.parallelize(list(zip([1], [X])))
        # kv[1] replaces a tuple-parameter lambda, invalid on Python 3
        result = clf.classify(data).map(lambda kv: kv[1]).collect()
        ground_truth = ttest_ind(X[labels == 1], X[labels == 2])
        # element 0 of scipy's result is the t statistic
        assert_array_almost_equal(result[0], ground_truth[0])
    def test_mass_univariate_classification_gnb_1d(self):
        """Gaussian naive Bayes accuracy on two one-feature problems.

        ``X1`` and ``X2`` each hold six values labelled by ``labels``;
        they differ only in the third value (-1.2 versus 1.2). The test
        expects a score of 1.0 for ``X1`` and 5/6 for ``X2``.
        """
        X1 = array([-1, -1, -1.2, 1, 1, 1.2])
        X2 = array([-1, -1, 1.2, 1, 1, 1.2])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels}

        clf = MassUnivariateClassifier.load(params, "gaussnaivebayes", cv=0)

        # NOTE: kv[1] replaces a tuple-parameter lambda (invalid on
        # Python 3); list() materializes the zip pairs for the same reason

        # should predict perfectly
        data = self.sc.parallelize(list(zip([1], [X1])))
        result = clf.classify(data).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [1.0])

        # should predict all but one correctly
        data = self.sc.parallelize(list(zip([1], [X2])))
        result = clf.classify(data).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [5.0 / 6.0])
Beispiel #7
0
    def test_mass_univariate_classification_gnb_1d(self):
        """Gaussian naive Bayes accuracy on two one-feature problems.

        ``X1`` and ``X2`` each hold six values labelled by ``labels``;
        they differ only in the third value (-1.2 versus 1.2). The test
        expects a score of 1.0 for ``X1`` and 5/6 for ``X2``.
        """
        X1 = array([-1, -1, -1.2, 1, 1, 1.2])
        X2 = array([-1, -1, 1.2, 1, 1, 1.2])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels}

        clf = MassUnivariateClassifier.load(params, "gaussnaivebayes", cv=0)

        # NOTE: kv[1] replaces a tuple-parameter lambda (invalid on
        # Python 3); list() materializes the zip pairs for the same reason

        # should predict perfectly
        data = self.sc.parallelize(list(zip([1], [X1])))
        result = clf.classify(data).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [1.0])

        # should predict all but one correctly
        data = self.sc.parallelize(list(zip([1], [X2])))
        result = clf.classify(data).map(lambda kv: kv[1]).collect()
        assert_array_almost_equal(result[0], [5.0 / 6.0])
    def test_mass_univariate_classification_ttest_2d(self):
        """T-test classifier output on a two-feature problem matches scipy.

        ``X`` is one flat record of twelve values; ``features`` and
        ``samples`` tag every entry with a feature index (1 or 2) and a
        sample index (1 to 6), and ``labels`` gives one class per sample.
        The reference statistic is computed with ``scipy.stats.ttest_ind``
        on the first three versus the last three samples of each feature.
        """
        X = array([-1, -2, -0.1, -2, -0.1, -2.1, 1, 1.1, 1, 1, 1.1, 2])
        features = array([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2])
        samples = array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
        labels = array([1, 1, 1, 2, 2, 2])
        params = {'labels': labels, 'features': features, 'samples': samples}

        clf = MassUnivariateClassifier.load(params, "ttest")

        # should match direct calculation using scipy

        # single (key, value) record; list() materializes the pairs so the
        # call behaves the same whether zip returns a list or an iterator
        data = self.sc.parallelize(list(zip([1], [X])))

        # values of each feature, in sample order
        first = X[features == 1]
        second = X[features == 2]

        # test first feature only
        # (kv[1] replaces a tuple-parameter lambda, invalid on Python 3)
        result = clf.classify(data, [[1]]).map(lambda kv: kv[1]).collect()
        ground_truth = ttest_ind(first[:3], first[3:])
        # element 0 of scipy's result is the t statistic
        assert_array_almost_equal(result[0], ground_truth[0])

        # test both features
        result = clf.classify(data, [[1, 2]]).map(lambda kv: kv[1]).collect()
        ground_truth = ttest_ind(vstack((first[:3], second[:3])).T,
                                 vstack((first[3:], second[3:])).T)
        assert_array_almost_equal(result[0][0], ground_truth[0])
Beispiel #9
0
import os
import argparse
import glob
from numpy import array
from thunder.classification import MassUnivariateClassifier
from thunder.utils import load
from thunder.utils import save
from pyspark import SparkContext


if __name__ == "__main__":
    # Command-line entry point: load a data set, run a mass univariate
    # classifier over it and save the resulting scores under
    # "<outputdir>-classify".
    import ast  # local import: only needed to parse --featureset

    def parse_featureset(text):
        """Parse --featureset from a Python literal such as "[[1], [1, 2]]"."""
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            raise argparse.ArgumentTypeError(
                "invalid feature set literal: %r" % text)

    parser = argparse.ArgumentParser(description="fit a classification model")
    parser.add_argument("datafile", type=str)
    parser.add_argument("paramfile", type=str)
    parser.add_argument("outputdir", type=str)
    # choices must be a sequence of whole names: a bare string makes argparse
    # test substring containment and list single characters in the help.
    # NOTE(review): "naivebayes" is kept for backward compatibility; confirm
    # the full set of modes accepted by MassUnivariateClassifier.load.
    parser.add_argument("classifymode",
                        choices=("naivebayes", "gaussnaivebayes", "ttest"),
                        help="form of classifier")
    # default is a real None (a string default would be run through `type`),
    # and the value is parsed into nested lists instead of a 0-d string array
    parser.add_argument("--featureset", type=parse_featureset, default=None, required=False)
    parser.add_argument("--cv", type=int, default=0, required=False)
    parser.add_argument("--preprocess", choices=("raw", "dff", "dff-highpass", "sub"), default="raw", required=False)

    args = parser.parse_args()

    # pass the name as appName; the first positional argument is the master
    sc = SparkContext(appName="classify")

    data = load(sc, args.datafile, args.preprocess)
    clf = MassUnivariateClassifier.load(args.paramfile, args.classifymode, cv=args.cv)
    perf = clf.classify(data, args.featureset)

    outputdir = args.outputdir + "-classify"
    save(perf, outputdir, "perf", "matlab")
Beispiel #10
0
if __name__ == "__main__":
    # Command-line entry point: load a data set, run a mass univariate
    # classifier over it and save the resulting scores under
    # "<outputdir>-classify".
    import ast  # local import: only needed to parse --featureset

    def parse_featureset(text):
        """Parse --featureset from a Python literal such as "[[1], [1, 2]]"."""
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            raise argparse.ArgumentTypeError(
                "invalid feature set literal: %r" % text)

    parser = argparse.ArgumentParser(description="fit a classification model")
    parser.add_argument("datafile", type=str)
    parser.add_argument("paramfile", type=str)
    parser.add_argument("outputdir", type=str)
    # choices must be a sequence of whole names: a bare string makes argparse
    # test substring containment and list single characters in the help.
    # NOTE(review): "naivebayes" is kept for backward compatibility; confirm
    # the full set of modes accepted by MassUnivariateClassifier.load.
    parser.add_argument("classifymode",
                        choices=("naivebayes", "gaussnaivebayes", "ttest"),
                        help="form of classifier")
    # default is a real None (a string default would be run through `type`),
    # and the value is parsed into nested lists instead of a 0-d string array
    parser.add_argument("--featureset",
                        type=parse_featureset,
                        default=None,
                        required=False)
    parser.add_argument("--cv", type=int, default=0, required=False)
    parser.add_argument("--preprocess",
                        choices=("raw", "dff", "dff-highpass", "sub"),
                        default="raw",
                        required=False)

    args = parser.parse_args()

    # pass the name as appName; the first positional argument is the master
    sc = SparkContext(appName="classify")

    data = load(sc, args.datafile, args.preprocess)
    clf = MassUnivariateClassifier.load(args.paramfile,
                                        args.classifymode,
                                        cv=args.cv)
    perf = clf.classify(data, args.featureset)

    outputdir = args.outputdir + "-classify"
    save(perf, outputdir, "perf", "matlab")