def test_ContinuousTrainTestSplitWithTiling(self):
    """Uses a synthetic preprocessed as follows: 500 total samples, 25 tiles per group 240 total features"""
    from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Continuous
    from numpy.random import RandomState

    feature_space = CreateArtificialFeatureSpace_Continuous(
        n_samples=500, num_features_per_signal_type=20, n_samples_per_group=25)

    # Seeded generator so the split is reproducible across runs.
    rng = RandomState(42)
    training_set, test_set = feature_space.Split(random_state=rng, quiet=True)

    # Standard pipeline: normalize the training set, rank features on it,
    # then reduce both sets to the surviving features and normalize the
    # test set against the (already-normalized) training set.
    training_set.Normalize(inplace=True, quiet=True)
    weights = PearsonFeatureWeights.NewFromFeatureSpace(training_set).Threshold()
    training_set.FeatureReduce(weights, inplace=True)
    test_set.FeatureReduce(weights, inplace=True).Normalize(
        training_set, inplace=True, quiet=True)
def test_NewShuffleSplitLeastSquares(self):
    """CONTINUOUS SHUFFLE SPLIT LEAST SQUARES

    Positive control: artificial features tracking ground truth should
    correlate almost perfectly. Negative control: the bottom quintile of
    ranked features should show near-zero linear correlation."""
    fs_kwargs = {}
    fs_kwargs['name'] = "CONTINUOUS PerSampleStatistics_TESTFS"
    fs_kwargs['n_samples'] = 100
    fs_kwargs['num_features_per_signal_type'] = 5
    fs_kwargs['initial_noise_sigma'] = 5
    fs_kwargs['noise_gradient'] = 5
    fs_kwargs['n_samples_per_group'] = 1
    fs_kwargs['random_state'] = 43
    fs_kwargs['singularity'] = True
    fs_kwargs['clip'] = True
    fs = CreateArtificialFeatureSpace_Continuous(**fs_kwargs)

    ss_kwargs = {}
    ss_kwargs['n_iter'] = 5
    ss_kwargs['name'] = "Continuous Shuffle Split Least Squares POSITIVE CONTROL"
    ss_kwargs['quiet'] = True
    ss_kwargs['random_state'] = 43
    exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(fs, **ss_kwargs)
    exp.GenerateStats()

    # len( exp ) is supposed to be the number of batch results (split results).
    # BUGFIX: was assertIs, which compares object identity; it only passed
    # because CPython caches small ints. assertEqual states the real intent.
    self.assertEqual(len(exp), ss_kwargs['n_iter'])

    # Positive control - artificial data with defaults should correlate almost perfectly
    self.assertAlmostEqual(exp.pearson_coeff, 1.0, delta=0.02)

    # Negative control - take the bottom quintile of the artificial features
    # which ARE functions of ground truth but should score low on linear
    # correlation, e.g., sin, x^2, etc.
    # With LSTSQ regression of noise features, pearson coeffs tend to be
    # around -0.34 +/- .045
    max_allowable_pearson_coeff = 0.4
    temp_normalized_fs = fs.Normalize(inplace=False)
    ranked_nonzero_features = \
        PearsonFeatureWeights.NewFromFeatureSpace(
            temp_normalized_fs).Threshold(_all='nonzero')
    quintile = int(len(ranked_nonzero_features) / 5)
    crappy_features = ranked_nonzero_features.Slice(
        quintile * 4, len(ranked_nonzero_features))
    crap_featureset = fs.FeatureReduce(crappy_features, inplace=False)

    ss_kwargs['name'] = "Continuous Shuffle Split Least Squares NEGATIVE CONTROL"
    exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(
        crap_featureset, **ss_kwargs)
    exp.GenerateStats()
    exp.PerSampleStatistics()
    self.assertAlmostEqual(exp.pearson_coeff, 0.0,
                           delta=max_allowable_pearson_coeff)
def test_NewShuffleSplitLinearMultivariateRegression(self):
    """CONTINUOUS SHUFFLE SPLIT LINEAR MULTIVARIATE METHOD

    Positive control: artificial features tracking ground truth should
    correlate almost perfectly. Negative control: the bottom quintile of
    ranked features should show near-zero linear correlation."""
    fs_kwargs = {}
    fs_kwargs['name'] = "CONTINUOUS PerSampleStatistics_TESTFS"
    fs_kwargs['n_samples'] = 100
    fs_kwargs['num_features_per_signal_type'] = 5
    fs_kwargs['initial_noise_sigma'] = 5
    fs_kwargs['noise_gradient'] = 5
    fs_kwargs['n_samples_per_group'] = 1
    fs_kwargs['random_state'] = 43
    fs_kwargs['singularity'] = True
    fs_kwargs['clip'] = False
    fs = CreateArtificialFeatureSpace_Continuous(**fs_kwargs)

    ss_kwargs = {}
    ss_kwargs['n_iter'] = 5
    ss_kwargs['name'] = "Continuous Shuffle Split Multivariate-Regression POSITIVE CONTROL"
    ss_kwargs['quiet'] = True
    ss_kwargs['random_state'] = 43
    ss_kwargs['classifier'] = 'linear'
    exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(fs, **ss_kwargs)
    exp.GenerateStats()

    # BUGFIX: was assertIs, which compares object identity; it only passed
    # because CPython caches small ints. assertEqual states the real intent.
    self.assertEqual(len(exp), ss_kwargs['n_iter'])

    # Positive control - artificial data with defaults should correlate almost perfectly
    self.assertAlmostEqual(exp.pearson_coeff, 1.0, delta=0.03)

    # Negative control - take the bottom quintile of the artificial features
    # which ARE functions of ground truth but should score low on linear
    # correlation, e.g., sin, x^2, etc.
    # Voting method with crap features tends to be around 0.14 +/- 0.04
    max_allowable_pearson_coeff = 0.2
    temp_normalized_fs = fs.Normalize(inplace=False)
    ranked_nonzero_features = \
        PearsonFeatureWeights.NewFromFeatureSpace(
            temp_normalized_fs).Threshold(_all='nonzero')
    quintile = int(len(ranked_nonzero_features) / 5)
    crappy_features = ranked_nonzero_features[
        quintile * 4:len(ranked_nonzero_features)]
    crap_featureset = fs.FeatureReduce(crappy_features)

    # BUGFIX: a stray trailing comma made this assignment a 1-tuple
    # ("…NEGATIVE CONTROL",) instead of a string; removed it.
    ss_kwargs['name'] = "Continuous Shuffle Split Linear Multivariate-Regression NEGATIVE CONTROL"
    exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(
        crap_featureset, **ss_kwargs)
    exp.GenerateStats()
    self.assertAlmostEqual(exp.pearson_coeff, 0.0,
                           delta=max_allowable_pearson_coeff)
def test_MultivariateLinearFitOnFitNoTiling(self):
    """Smoke test: multivariate-linear regression of an untiled continuous
    feature space against itself (fit-on-fit) completes without error."""
    fake_continuous = CreateArtificialFeatureSpace_Continuous(
        n_samples=100, num_features_per_signal_type=5, noise_gradient=5,
        initial_noise_sigma=10, n_samples_per_group=1)

    # Normalize in place, rank features, then reduce to the survivors.
    fake_continuous.Normalize(quiet=True)
    weights = PearsonFeatureWeights.NewFromFeatureSpace(
        fake_continuous).Threshold()
    reduced = fake_continuous.FeatureReduce(weights)

    # Predict the same set that produced the weights.
    FeatureSpaceRegression.NewMultivariateLinear(
        test_set=reduced, feature_weights=weights, quiet=True)
def test_LeastSquaresFitOnFitLeaveOneOutNoTiling(self):
    """Smoke test: least-squares leave-one-out regression (test_set=None)
    on an untiled continuous feature space completes without error."""
    fake_continuous = CreateArtificialFeatureSpace_Continuous(
        n_samples=100, num_features_per_signal_type=5, noise_gradient=5,
        initial_noise_sigma=10, n_samples_per_group=1)

    # Feature weights are ranked on a normalized COPY ...
    normalized_copy = fake_continuous.Normalize(inplace=False, quiet=True)
    weights = PearsonFeatureWeights.NewFromFeatureSpace(
        normalized_copy).Threshold()
    # ... but the reduction is applied to the original, un-normalized space.
    # NOTE(review): presumably NewLeastSquares normalizes internally for the
    # leave-one-out path -- confirm against its implementation.
    reduced = fake_continuous.FeatureReduce(weights)

    # test_set=None selects the leave-one-out code path.
    FeatureSpaceRegression.NewLeastSquares(
        training_set=reduced, test_set=None, feature_weights=weights,
        quiet=True)
def test_PerSampleStatistics(self):
    """Testing ContinuousClassificationExperimentResult.PerSampleStatistics()
    Goal is to check the aggregating functionality of PerSampleStatistics"""
    # Deliberately tiny feature space (few samples, few features) so the
    # classification work stays fast.
    fs_kwargs = dict(
        name="CONTINUOUS PerSampleStatistics_TESTFS",
        n_samples=20,
        num_features_per_signal_type=2,  # small on purpose, to make test fast
        initial_noise_sigma=75,
        noise_gradient=25,
        n_samples_per_group=1,
        random_state=42,
        singularity=False,
        clip=False)
    fs = CreateArtificialFeatureSpace_Continuous(**fs_kwargs)

    # Many iterations so that (with high probability) every sample lands
    # in at least one test split.
    ss_kwargs = dict(
        name="Continuous PerSampleStatistics ShuffleSplit",
        quiet=True,
        n_iter=50,
        train_size=16,
        test_size=4,
        random_state=42)
    exp = FeatureSpaceRegressionExperiment.NewShuffleSplit(fs, **ss_kwargs)
    exp.GenerateStats()

    # Capture PerSampleStatistics output; this test only verifies the call
    # completes without raising (line-count verification is left disabled).
    from StringIO import StringIO
    captured = StringIO()
    try:
        exp.PerSampleStatistics(output_stream=captured)
    except Exception as e:
        self.fail('Error in experiment.PredictedValueAnalysis: %s' % e)
    self.assertTrue(True)
def test_SampleReduce(self):
    """Exercise FeatureSpace.SampleReduce() for discrete and continuous,
    tiled and untiled feature spaces, in both leave-in and leave-out modes,
    including input validation (trash inputs, nonexistent group ids)."""
    from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

    n_classes = 10

    #========================================================
    # Section 1: LEAVE IN, untiled discrete (w/ classes) FeatureSpace
    fs_discrete = CreateArtificialFeatureSpace_Discrete(
        n_samples=1000, n_classes=n_classes, num_features_per_signal_type=30,
        noise_gradient=5, initial_noise_sigma=10, n_samples_per_group=1,
        interpolatable=True)

    # Reduce to 9 classes from 10, one sample per class (drops the last class)
    desired = range(50, 950, 100)
    kept = fs_discrete.SampleReduce(desired)
    # Further reduce to 8 classes
    kept.RemoveClass("FakeClass-055.6", inplace=True)

    # NOTE: value sort order, which differs from alphanumeric sort order.
    correct_samplenames = [
        'FakeClass-100.0_050', 'FakeClass-077.8_050', 'FakeClass-033.3_050',
        'FakeClass-011.1_050', 'FakeClass+011.1_050', 'FakeClass+033.3_050',
        'FakeClass+055.6_050', 'FakeClass+077.8_050']
    self.assertEqual(correct_samplenames, kept._contiguous_sample_names)

    correct_classnames = [
        'FakeClass-100.0', 'FakeClass-077.8', 'FakeClass-033.3',
        'FakeClass-011.1', 'FakeClass+011.1', 'FakeClass+033.3',
        'FakeClass+055.6', 'FakeClass+077.8']
    self.assertEqual(correct_classnames, kept.class_names)
    del kept

    #========================================================
    # Section 2: LEAVE OUT, untiled discrete FeatureSpace
    unwanted = range(50, 950, 100)
    remainder = fs_discrete.SampleReduce(leave_out_sample_group_ids=unwanted)
    self.assertEqual(remainder.num_samples,
                     fs_discrete.num_samples - len(unwanted))
    # A single integer for leave_out_list is ok
    remainder = fs_discrete.SampleReduce(leave_out_sample_group_ids=50)
    self.assertEqual(remainder.num_samples, fs_discrete.num_samples - 1)
    del remainder

    #========================================================
    # Section 3: LEAVE IN, tiled discrete FeatureSpace
    num_tiles = 4
    fs_discrete = CreateArtificialFeatureSpace_Discrete(
        n_samples=1000, n_classes=n_classes, num_features_per_signal_type=30,
        noise_gradient=5, initial_noise_sigma=10,
        n_samples_per_group=num_tiles, interpolatable=True)

    # 9 sample groups: total samples = 9 groups * 4 tiles per group = 36
    desired = range(5, 95, 10)
    tiled_kept = fs_discrete.SampleReduce(desired)
    self.assertEqual(num_tiles * len(desired), tiled_kept.num_samples)
    del tiled_kept

    #========================================================
    # Section 4: LEAVE OUT, tiled discrete FeatureSpace
    # You can't leave out a sample group that doesn't exist
    unwanted = range(50000, 50010)
    self.assertRaises(ValueError, fs_discrete.SampleReduce,
                      leave_out_sample_group_ids=unwanted)
    # Can't leave out trash
    unwanted = ['foo', 'bar']
    self.assertRaises(TypeError, fs_discrete.SampleReduce,
                      leave_out_sample_group_ids=unwanted)
    # This input is ok:
    unwanted = range(5, 95, 10)
    tiled_remainder = fs_discrete.SampleReduce(
        leave_out_sample_group_ids=unwanted)
    self.assertEqual(
        tiled_remainder.num_samples,
        fs_discrete.num_samples - len(unwanted) * num_tiles)
    del tiled_remainder

    #========================================================
    # Section 5: LEAVE IN, untiled continuous FeatureSpace
    from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Continuous
    fs_cont = CreateArtificialFeatureSpace_Continuous(
        n_samples=1000, num_features_per_signal_type=30, noise_gradient=5,
        initial_noise_sigma=10, n_samples_per_group=1)

    # dummyproof
    self.assertRaises(TypeError, fs_cont.SampleReduce, ['foo', 'bar'])

    desired = range(50, 950)
    cont_kept = fs_cont.SampleReduce(desired)
    self.assertEqual(cont_kept.num_samples, len(desired))
    del cont_kept

    #========================================================
    # Section 6: LEAVE OUT, untiled continuous FeatureSpace
    unwanted = range(50, 950)
    cont_remainder = fs_cont.SampleReduce(leave_out_sample_group_ids=unwanted)
    self.assertEqual(cont_remainder.num_samples,
                     fs_cont.num_samples - len(unwanted))
    del cont_remainder
    # single int is ok
    cont_remainder = fs_cont.SampleReduce(leave_out_sample_group_ids=998)
    self.assertEqual(cont_remainder.num_samples, fs_cont.num_samples - 1)
    del cont_remainder

    #========================================================
    # Section 7: LEAVE IN, tiled continuous FeatureSpace
    fs_cont = CreateArtificialFeatureSpace_Continuous(
        n_samples=1000, num_features_per_signal_type=30, noise_gradient=5,
        initial_noise_sigma=10, n_samples_per_group=num_tiles)
    desired = range(50, 95)
    tiled_cont_kept = fs_cont.SampleReduce(desired)
    self.assertEqual(tiled_cont_kept.num_samples, len(desired) * num_tiles)
    del tiled_cont_kept
    # single int is ok, ALTHOUGH NOT SURE WHY YOU'D EVER WANT A FS WITH A SINGLE SAMPLE
    tiled_cont_kept = fs_cont.SampleReduce(98)
    self.assertEqual(tiled_cont_kept.num_samples, num_tiles)
    del tiled_cont_kept

    #========================================================
    # Section 8: LEAVE OUT, tiled continuous FeatureSpace
    unwanted = range(50, 95)
    tiled_cont_remainder = fs_cont.SampleReduce(
        leave_out_sample_group_ids=unwanted)
    self.assertEqual(
        tiled_cont_remainder.num_samples,
        fs_cont.num_samples - len(unwanted) * num_tiles)
    del tiled_cont_remainder
    # single int is ok
    tiled_cont_remainder = fs_cont.SampleReduce(
        leave_out_sample_group_ids=98)
    self.assertEqual(tiled_cont_remainder.num_samples,
                     fs_cont.num_samples - num_tiles)
    del tiled_cont_remainder