Ejemplo n.º 1
0
    def test_ContinuousTrainTestSplitWithTiling(self):
        """Train/test split of a tiled continuous feature space.

        Synthetic input: 500 total samples, 25 tiles per group,
        240 total features."""

        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Continuous
        from numpy.random import RandomState

        # Synthetic continuous feature space: 500 samples, 25 tiles per group.
        feature_space = CreateArtificialFeatureSpace_Continuous(
            n_samples=500,
            num_features_per_signal_type=20,
            n_samples_per_group=25)

        # Seeded RNG keeps the train/test split reproducible.
        rng = RandomState(42)
        train_set, test_set = feature_space.Split(random_state=rng, quiet=True)

        # Normalize the training set in place, rank features on it, then
        # project both sets onto the selected features; the test set is
        # normalized against the training set afterwards.
        train_set.Normalize(inplace=True, quiet=True)
        weights = PearsonFeatureWeights.NewFromFeatureSpace(train_set).Threshold()
        train_set.FeatureReduce(weights, inplace=True)
        reduced_test = test_set.FeatureReduce(weights, inplace=True)
        reduced_test.Normalize(train_set, inplace=True, quiet=True)
    def test_NewShuffleSplitLeastSquares(self):
        """CONTINUOUS SHUFFLE SPLIT LEAST SQUARES

        Positive control: artificial data should regress near-perfectly under
        least-squares. Negative control: the worst-ranked quintile of features
        should show near-zero linear correlation with ground truth."""

        fs_kwargs = {
            'name': "CONTINUOUS PerSampleStatistics_TESTFS",
            'n_samples': 100,
            'num_features_per_signal_type': 5,
            'initial_noise_sigma': 5,
            'noise_gradient': 5,
            'n_samples_per_group': 1,
            'random_state': 43,
            'singularity': True,
            'clip': True,
        }

        fs = CreateArtificialFeatureSpace_Continuous(**fs_kwargs)

        ss_kwargs = {
            'n_iter': 5,
            'name': "Continuous Shuffle Split Least Squares POSITIVE CONTROL",
            'quiet': True,
            'random_state': 43,
        }
        exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(fs, **ss_kwargs)

        exp.GenerateStats()

        # len( exp ) is supposed to be the number of batch results (split
        # results). Compare by value: the previous assertIs() only passed via
        # CPython's small-int caching (identity of equal ints is not
        # guaranteed by the language).
        self.assertEqual(len(exp), ss_kwargs['n_iter'])

        # Positive control - Artificial data with defaults should corellate almost perfectly
        self.assertAlmostEqual(exp.pearson_coeff, 1.0, delta=0.02)

        # Negative control - take the bottom quintile of the artificial features
        # which ARE functions of ground truth but should score low on linear correlation,
        # e.g., sin, x^2, etc.

        # With LSTSQ regression of noise features, pearson coeffs tend to be around -0.34 +/- .045
        max_allowable_pearson_coeff = 0.4

        temp_normalized_fs = fs.Normalize(inplace=False)
        ranked_nonzero_features = \
            PearsonFeatureWeights.NewFromFeatureSpace( temp_normalized_fs ).Threshold(_all='nonzero')

        # Bottom quintile of the ranked features = the "crappy" ones.
        quintile = int(len(ranked_nonzero_features) / 5)
        crappy_features = ranked_nonzero_features.Slice(
            quintile * 4, len(ranked_nonzero_features))
        crap_featureset = fs.FeatureReduce(crappy_features, inplace=False)

        ss_kwargs['name'] = "Continuous Shuffle Split Least Squares NEGATIVE CONTROL"
        exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(
            crap_featureset, **ss_kwargs)
        exp.GenerateStats()
        exp.PerSampleStatistics()
        self.assertAlmostEqual(exp.pearson_coeff,
                               0.0,
                               delta=max_allowable_pearson_coeff)
Ejemplo n.º 3
0
    def test_NewShuffleSplitLinearMultivariateRegression(self):
        """CONTINUOUS SHUFFLE SPLIT LINEAR MULTIVARIATE METHOD

        Positive control: artificial data should regress near-perfectly with
        the linear multivariate classifier. Negative control: the worst-ranked
        quintile of features should score near zero."""

        fs_kwargs = {
            'name': "CONTINUOUS PerSampleStatistics_TESTFS",
            'n_samples': 100,
            'num_features_per_signal_type': 5,
            'initial_noise_sigma': 5,
            'noise_gradient': 5,
            'n_samples_per_group': 1,
            'random_state': 43,
            'singularity': True,
            'clip': False,
        }

        fs = CreateArtificialFeatureSpace_Continuous(**fs_kwargs)

        ss_kwargs = {
            'n_iter': 5,
            'name': "Continuous Shuffle Split Multivariate-Regression POSITIVE CONTROL",
            'quiet': True,
            'random_state': 43,
            'classifier': 'linear',
        }
        exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(fs, **ss_kwargs)

        exp.GenerateStats()

        # len( exp ) is the number of split results. Compare by value: the
        # previous assertIs() only passed via CPython small-int caching.
        self.assertEqual(len(exp), ss_kwargs['n_iter'])

        # Positive control - Artificial data with defaults should corellate almost perfectly
        self.assertAlmostEqual(exp.pearson_coeff, 1.0, delta=0.03)

        # Negative control - take the bottom quintile of the artificial features
        # which ARE functions of ground truth but should score low on linear correlation,
        # e.g., sin, x^2, etc.

        # Voting method with crap features tends to be around 0.14 +/- 0.04
        max_allowable_pearson_coeff = 0.2

        temp_normalized_fs = fs.Normalize(inplace=False)
        ranked_nonzero_features = \
            PearsonFeatureWeights.NewFromFeatureSpace( temp_normalized_fs ).Threshold(_all='nonzero')

        # Bottom quintile of the ranked features = the "crappy" ones.
        quintile = int(len(ranked_nonzero_features) / 5)
        crappy_features = ranked_nonzero_features[quintile * 4:
                                                  len(ranked_nonzero_features)]
        crap_featureset = fs.FeatureReduce(crappy_features)

        # BUGFIX: the original assignment ended with a trailing comma, which
        # made the experiment name a 1-tuple ("...",) instead of a string.
        ss_kwargs['name'] = "Continuous Shuffle Split Linear Multivariate-Regression NEGATIVE CONTROL"
        exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(
            crap_featureset, **ss_kwargs)
        exp.GenerateStats()
        self.assertAlmostEqual(exp.pearson_coeff,
                               0.0,
                               delta=max_allowable_pearson_coeff)
Ejemplo n.º 4
0
    def test_MultivariateLinearFitOnFitNoTiling(self):
        """Multivariate-linear regression "fit on fit": predict against the
        same feature set the weights were derived from, untiled samples.
        Only checks that the pipeline runs; no assertions on the result."""

        # Synthetic continuous feature space: 100 untiled, noisy samples.
        fake_continuous = CreateArtificialFeatureSpace_Continuous(
            n_samples=100,
            num_features_per_signal_type=5,
            noise_gradient=5,
            initial_noise_sigma=10,
            n_samples_per_group=1)

        # NOTE(review): return value of Normalize() is discarded and no
        # inplace=True is passed — presumably Normalize() defaults to
        # in-place here (other tests pass inplace explicitly); verify.
        fake_continuous.Normalize(quiet=True)
        # Rank features by Pearson correlation and keep the thresholded set.
        reduced_fw = PearsonFeatureWeights.NewFromFeatureSpace(
            fake_continuous).Threshold()
        reduced_fs = fake_continuous.FeatureReduce(reduced_fw)
        # Result intentionally unused — test passes if this does not raise.
        batch_result = FeatureSpaceRegression.NewMultivariateLinear(
            test_set=reduced_fs, feature_weights=reduced_fw, quiet=True)
Ejemplo n.º 5
0
    def test_LeastSquaresFitOnFitLeaveOneOutNoTiling(self):
        """Least-squares regression fit-on-fit (test_set=None -> leave-one-out),
        untiled samples. Passes if the pipeline runs without raising."""

        raw_fs = CreateArtificialFeatureSpace_Continuous(
            n_samples=100,
            num_features_per_signal_type=5,
            noise_gradient=5,
            initial_noise_sigma=10,
            n_samples_per_group=1)

        # Feature weights are computed on a normalized copy of the space.
        norm_copy = raw_fs.Normalize(inplace=False, quiet=True)
        weights = PearsonFeatureWeights.NewFromFeatureSpace(norm_copy).Threshold()
        # NOTE(review): features are reduced from the UN-normalized space
        # (raw_fs, not norm_copy) — looks intentional, but confirm.
        fit_fs = raw_fs.FeatureReduce(weights)

        # Result intentionally unused — test passes if this does not raise.
        result = FeatureSpaceRegression.NewLeastSquares(
            training_set=fit_fs,
            test_set=None,
            feature_weights=weights,
            quiet=True)
    def test_PerSampleStatistics(self):
        """Testing ContinuousClassificationExperimentResult.PerSampleStatistics()

        Goal is to check the aggregating functionality of PerSampleStatistics"""

        # Keep the feature space small (few samples, few features) so the
        # classification runs quickly.
        n_samples = 20
        fs_kwargs = {
            'name': "CONTINUOUS PerSampleStatistics_TESTFS",
            'n_samples': n_samples,
            'num_features_per_signal_type': 2,  # small on purpose, to make test fast
            'initial_noise_sigma': 75,
            'noise_gradient': 25,
            'n_samples_per_group': 1,
            'random_state': 42,
            'singularity': False,
            'clip': False,
        }

        fs = CreateArtificialFeatureSpace_Continuous(**fs_kwargs)

        # Many iterations so that (probabilistically) every sample lands in a
        # test split at least once.
        n_iter = 50
        train_size = 16
        test_size = 4
        ss_kwargs = {
            'name': "Continuous PerSampleStatistics ShuffleSplit",
            'quiet': True,
            'n_iter': n_iter,
            'train_size': train_size,
            'test_size': test_size,
            'random_state': 42,
        }
        exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(fs, **ss_kwargs)

        exp.GenerateStats()

        # Capture output from PerSampleStatistics; the test only verifies the
        # call completes without raising.
        from StringIO import StringIO
        out = StringIO()
        try:
            exp.PerSampleStatistics(output_stream=out)
        except Exception as e:
            m = 'Error in experiment.PredictedValueAnalysis: %s' % e
            self.fail(m)

        # Expected output size would be 3 header lines + 2*n_samples +
        # n_iter*test_size, but the exact line-count assertion is disabled.
        self.assertTrue(True)
Ejemplo n.º 7
0
    def test_SampleReduce( self ):
        """Exercise FeatureSpace.SampleReduce() in both keep ("leave in") and
        drop ("leave out") modes, across discrete/continuous and
        tiled/untiled feature spaces, including input validation."""
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        n_classes = 10
        #========================================================
        # Section 1: LEAVE IN, Untiled Discrete (w/ classes) FeatureSpace instances
        fs_discrete = CreateArtificialFeatureSpace_Discrete( n_samples=1000, n_classes=n_classes,
                num_features_per_signal_type=30, noise_gradient=5, initial_noise_sigma=10,
                n_samples_per_group=1, interpolatable=True)

        # Reduce to 9 classes from 10, one sample per class
        # Drop the last class:
        # Sample group ids 50, 150, ..., 850 — one per class, skipping the last.
        desired = range(50, 950, 100)

        A = fs_discrete.SampleReduce( desired )
        # Further reduce to 8 classes
        A.RemoveClass( "FakeClass-055.6", inplace=True )

        correct_samplenames = ['FakeClass-100.0_050', 'FakeClass-077.8_050', 'FakeClass-033.3_050', 'FakeClass-011.1_050', 'FakeClass+011.1_050', 'FakeClass+033.3_050', 'FakeClass+055.6_050', 'FakeClass+077.8_050']
        #The actual alphanumeric sort order is different from the value sort order
        #correct_samplenames = ['FakeClass+011.1_050', 'FakeClass+033.3_050', 'FakeClass+055.6_050', 'FakeClass+077.8_050', 'FakeClass-011.1_050', 'FakeClass-033.3_050', 'FakeClass-077.8_050', 'FakeClass-100.0_050']
        self.assertEqual( correct_samplenames, A._contiguous_sample_names )

        correct_classnames = ['FakeClass-100.0', 'FakeClass-077.8', 'FakeClass-033.3', 'FakeClass-011.1',  'FakeClass+011.1', 'FakeClass+033.3', 'FakeClass+055.6', 'FakeClass+077.8'  ]
        #correct_classnames = ['FakeClass+011.1', 'FakeClass+033.3', 'FakeClass+055.6', 'FakeClass+077.8', 'FakeClass-011.1', 'FakeClass-033.3', 'FakeClass-077.8', 'FakeClass-100.0']
        self.assertEqual( correct_classnames, A.class_names )
        del A

        #========================================================
        # Section 2: LEAVE OUT, UNTiled Feature sets, Discrete FeatureSpace instances

        UNdesired = range(50, 950, 100)
        C = fs_discrete.SampleReduce( leave_out_sample_group_ids=UNdesired )
        self.assertEqual( C.num_samples, fs_discrete.num_samples - len( UNdesired ) )

        # Single integers for leave_out_list is ok
        UNdesired = 50
        C = fs_discrete.SampleReduce( leave_out_sample_group_ids=UNdesired )
        self.assertEqual( C.num_samples, fs_discrete.num_samples - 1 )
        del C

        #========================================================
        # Section 3: LEAVE IN, Tiled Feature sets, Discrete FeatureSpace instances
        num_tiles = 4
        fs_discrete = CreateArtificialFeatureSpace_Discrete( n_samples=1000, n_classes=n_classes,
                num_features_per_signal_type=30, noise_gradient=5, initial_noise_sigma=10,
                n_samples_per_group=num_tiles, interpolatable=True)

        desired = range(5, 95, 10) # Rearrange into 9 classes
        D = fs_discrete.SampleReduce( desired )
        # Total num samples should be 9 classes, 1 sample group per class, 4 tiles per SG = 36
        self.assertEqual( num_tiles * len( desired ), D.num_samples )
        del D

        #========================================================
        # Section 4: LEAVE OUT, WITH Tiled Feature sets, Discrete FeatureSpace instances

        # You can't leave out a sample group that doesn't exist
        UNdesired = range(50000, 50010)
        self.assertRaises( ValueError, fs_discrete.SampleReduce,
                leave_out_sample_group_ids=UNdesired )

        # Can't leave out trash
        UNdesired = ['foo', 'bar']
        self.assertRaises( TypeError, fs_discrete.SampleReduce,
                leave_out_sample_group_ids=UNdesired )

        # This input is ok:
        UNdesired = range(5, 95, 10)
        E = fs_discrete.SampleReduce( leave_out_sample_group_ids=UNdesired )
        # Dropping a sample group removes all of its tiles.
        self.assertEqual( E.num_samples, fs_discrete.num_samples - len( UNdesired ) * num_tiles )
        del E

        #========================================================
        # Section 5: LEAVE IN, Untiled Continuous FeatureSpace instances
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Continuous

        fs_cont = CreateArtificialFeatureSpace_Continuous( n_samples=1000,
                num_features_per_signal_type=30, noise_gradient=5, initial_noise_sigma=10,
                n_samples_per_group=1)

        # dummyproof
        desired = ['foo', 'bar']
        self.assertRaises( TypeError, fs_cont.SampleReduce, desired )

        desired = range(50, 950)
        F = fs_cont.SampleReduce( desired )
        self.assertEqual( F.num_samples, len(desired) )
        del F

        #========================================================
        # Section 6: LEAVE OUT, Untiled Continuous FeatureSpace instances

        UNdesired = range(50, 950)
        G = fs_cont.SampleReduce( leave_out_sample_group_ids=UNdesired )
        self.assertEqual( G.num_samples, fs_cont.num_samples - len(UNdesired) )
        del G

        # single int is ok
        H = fs_cont.SampleReduce( leave_out_sample_group_ids=998 )
        self.assertEqual( H.num_samples, fs_cont.num_samples - 1 )
        del H

        #========================================================
        # Section 7: LEAVE IN, TILED Continuous FeatureSpace instances

        fs_cont = CreateArtificialFeatureSpace_Continuous( n_samples=1000,
                num_features_per_signal_type=30, noise_gradient=5, initial_noise_sigma=10,
                n_samples_per_group=num_tiles)

        desired = range(50, 95)
        I = fs_cont.SampleReduce( desired )
        # Keeping a sample group keeps all of its tiles.
        self.assertEqual( I.num_samples, len(desired) * num_tiles )
        del I

        # single int is ok, ALTHOUGH NOT SURE WHY YOU'D EVER WANT A FS WITH A SINGLE SAMPLE
        J = fs_cont.SampleReduce( 98 )
        self.assertEqual( J.num_samples, num_tiles )
        del J

        #========================================================
        # Section 8: LEAVE OUT, TILED Continuous FeatureSpace instances

        UNdesired = range(50, 95)
        K = fs_cont.SampleReduce( leave_out_sample_group_ids=UNdesired )
        self.assertEqual( K.num_samples, fs_cont.num_samples - len(UNdesired) * num_tiles )
        del K

        # single int is ok
        L = fs_cont.SampleReduce( leave_out_sample_group_ids=98 )
        self.assertEqual( L.num_samples, fs_cont.num_samples - num_tiles  )
        del L