def test_NumFeaturesGridSearch(self):
    # Smoke test for FeatureSpaceClassificationExperiment.NumFeaturesGridSearch.
    # The search is run twice over the same artificial feature space: first
    # with the base options, then again with 'lda' and
    # 'pre_lda_feature_filter' switched on. Nothing is asserted; the test
    # passes when neither call raises.
    fs = CreateArtificialFeatureSpace_Discrete(
        name="DISCRETE PerSampleStatistics WITH Pred Values",
        n_samples=250,
        n_classes=10,
        num_features_per_signal_type=10,  # small on purpose, to make test fast
        noise_gradient=5,
        initial_noise_sigma=75,
        n_samples_per_group=1,
        random_state=42,
        interpolatable=True,
        singularity=False,
        clip=False
    )

    search_options = {
        'feature_space': fs,
        'quiet': False,
        'n_iter': 10,
        'random_state': 42,
        # otherwise the input fs will be modified
        'conserve_mem': False,
    }
    FeatureSpaceClassificationExperiment.NumFeaturesGridSearch(**search_options)

    # Second pass: same search, now with LDA plus the pre-LDA feature filter.
    search_options.update(lda=True, pre_lda_feature_filter=True)
    FeatureSpaceClassificationExperiment.NumFeaturesGridSearch(**search_options)
def test_PerSampleStatisticsWITHPredictedValue(self):
    """DISCRETE PerSampleStatistics with numeric predicted value

    Builds a 2-class artificial feature space with interpolatable class
    names, runs a shuffle-split experiment without and then with LDA, and
    prints the report and per-sample statistics of the LDA run. The only
    assertion is a placeholder; the test passes if nothing raises.
    """
    fs = CreateArtificialFeatureSpace_Discrete(
        name="DISCRETE PerSampleStatistics WITH Pred Values",
        n_samples=40,
        n_classes=2,
        num_features_per_signal_type=10,  # small on purpose, to make test fast
        noise_gradient=50,
        initial_noise_sigma=75,
        n_samples_per_group=1,
        random_state=42,
        interpolatable=True,
        singularity=False,
        clip=False
    )

    split_options = {
        'name': "Discrete PerSampleStatistics ShuffleSplit WITH Pred Values",
        'quiet': True,
        'n_iter': 10,
        'train_size': 8,  # per-class
        'test_size': 2,  # per-class
        'random_state': 42,
    }

    # Use case 1: straight, classic WND-CHARM train/test splits.
    # The result is discarded; this call only has to complete.
    exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
        fs, **split_options)

    # Use case 2: put LDA in the pipeline (no Fisher feature space prefilter,
    # by default).
    split_options['lda'] = True
    exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
        fs, **split_options)

    # Use cases 3 ('pre_lda_feature_filter') and 4 ('lda_features_size') were
    # disabled in the original test and are not exercised here.

    # Reports are emitted for the LDA experiment only.
    exp.Print()
    exp.PerSampleStatistics()
    self.assertTrue(True)
def test_FromDiscreteClassificationExperimentResults(self):
    """Rank Ordered Predicted values graph from an experiment result (multiple splits)

    A small artificial feature space is shuffle-split ten times, the
    resulting experiment is rendered as a rank-ordered predicted-values
    graph, and the graph is checked via self.CompareGraphs against the
    reference file named below.
    """
    reference_name = 'test_graph_rank_ordered_experiment.npy'

    # Deliberately small feature space so that multiple splits stay cheap:
    # 100 samples over 5 classes, i.e. 20 samples per class.
    small_fs = CreateArtificialFeatureSpace_Discrete(
        name="DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
        n_samples=100,
        n_classes=5,
        num_features_per_signal_type=10,
        initial_noise_sigma=50,
        noise_gradient=20,
        n_samples_per_group=1,
        interpolatable=True,
        random_state=42,
        singularity=False,
        clip=False
    )

    split_options = {
        'quiet': True,
        'n_iter': 10,
        'train_size': 18,  # per-class
        'test_size': 2,  # per-class
        'random_state': 42,
    }
    exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
        small_fs, **split_options)

    graph = PredictedValuesGraph(exp, use_averaged_results=False)
    graph.RankOrderedPredictedValuesGraph()
    self.CompareGraphs(graph, reference_name)
def test_FitOnFitClassification(self):
    # Fit-on-fit regression test: classify 'test-l.fit' against itself with
    # WND5 and compare every per-sample result with the corresponding entry
    # parsed from a reference HTML report.
    fitfile_path = wndchrm_test_dir + sep + 'test-l.fit'
    fs = FeatureSpace.NewFromFitFile(fitfile_path)
    fs.Normalize(inplace=True, quiet=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold(438)
    fw.Print(50)
    fs.FeatureReduce(fw, inplace=True)
    pychrm_split = FeatureSpaceClassification.NewWND5(fs, fs, fw, quiet=False)

    from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment
    html_path = pychrm_test_dir + sep + 'test-l_training_error_result.html'
    html_exp = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
        html_path, quiet=False)
    # single split in this html
    html_split = html_exp.individual_results[0]

    result_pairs = zip(html_split.individual_results,
                       pychrm_split.individual_results)
    for i, (html_result, pychrm_result) in enumerate(result_pairs):
        try:
            self.assertEqual(html_result, pychrm_result)
        except self.failureException as err:
            # Attach which comparison failed and both operands to the failure
            # itself; previously this text was built and then thrown away.
            outstr = "Error in comparison # {0}:\n".format(i)
            outstr += "HTML result:\n{0}\n Python API res:\n{1}".format(
                html_result, pychrm_result)
            raise self.failureException("{0}\n{1}".format(outstr, err))
def test_PerSampleStatisticsWITHPredictedValue(self):
    """DISCRETE PerSampleStatistics with numeric predicted value

    Creates a 2-class artificial feature space with interpolatable class
    names, runs one shuffle-split experiment over it, then prints the
    experiment report and its per-sample statistics. The closing assertion
    is a placeholder; the test passes if nothing raises.
    """
    feature_space_options = dict(
        name="DISCRETE PerSampleStatistics WITH Pred Values",
        n_samples=40,
        n_classes=2,
        num_features_per_signal_type=10,  # small on purpose, to make test fast
        noise_gradient=50,
        initial_noise_sigma=75,
        n_samples_per_group=1,
        random_state=42,
        interpolatable=True,
        singularity=False,
        clip=False
    )
    fs = CreateArtificialFeatureSpace_Discrete(**feature_space_options)

    shuffle_split_options = dict(
        name="Discrete PerSampleStatistics ShuffleSplit WITH Pred Values",
        quiet=True,
        n_iter=10,
        train_size=8,  # per-class
        test_size=2,  # per-class
        random_state=42
    )
    exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
        fs, **shuffle_split_options)

    # Emit the summary report first, then the per-sample breakdown.
    exp.Print()
    exp.PerSampleStatistics()
    self.assertTrue(True)
def test_PerSampleStatisticsWITHOUTPredictedValue(self):
    """DISCRETE ShuffleSplit/PerSampleStatistics w/ no predicted value

    Uses an artificial 2-class feature space created with
    interpolatable=False, runs a single-iteration shuffle split without and
    then with LDA, and prints the report and per-sample statistics of the
    LDA run. The closing assertion is a placeholder.
    """
    # Per the original author's note, the 2-class, 10-samples-per-class
    # 'test-l.fit' file can't be used here, since its class names are
    # interpolatable; an artificial feature space is generated instead.
    fs = CreateArtificialFeatureSpace_Discrete(
        name="DISCRETE PerSampleStatistics No Pred Values",
        n_samples=20,
        n_classes=2,
        num_features_per_signal_type=10,  # small on purpose, to make test fast
        noise_gradient=50,
        initial_noise_sigma=75,
        n_samples_per_group=1,
        random_state=42,
        interpolatable=False,
        singularity=False,
        clip=False
    )

    split_options = {
        'name': "Discrete PerSampleStatistics ShuffleSplit No Pred Values",
        'quiet': True,
        'n_iter': 1,
        'train_size': 8,  # per-class
        'test_size': 2,  # per-class
        'random_state': 42,
    }

    # Plain run first; its result is replaced by the LDA run below.
    exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
        fs, **split_options)

    split_options['lda'] = True
    exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
        fs, **split_options)

    # Reports are emitted for the LDA experiment only.
    exp.Print()
    exp.PerSampleStatistics()
    self.assertTrue(True)
def __init__(self, training_set, feature_weights, test_image_path,
             chart_title=None, max_num_features=300):
    """Build a two-axis chart of classification accuracy and timing.

    For every feature count from 1 to max_num_features the constructor
    (a) times, best of three, generating and classifying a single feature
    vector for test_image_path using the top-N thresholded weights, and
    (b) runs a fit-on-fit WND5 classification of the reduced training set
    to obtain a classification accuracy. Both series are then plotted
    against the number of features.

    Args:
        training_set: feature space that is reduced with FeatureReduce and
            used for both the single-sample and fit-on-fit classification.
        feature_weights: weights object whose Threshold(n) yields the
            top-n weights.
        test_image_path: path handed to FeatureVector as source_filepath.
        chart_title: title for the chart; a default is used when None.
        max_num_features: largest feature count to evaluate (inclusive).

    Sets self.figure, self.main_axes (accuracy, blue), self.timing_axes
    (seconds, red) and self.chart_title.
    """
    self.timing_axes = None
    import time
    timings = []

    from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment
    from wndcharm.SingleSamplePrediction import SingleSampleClassification
    from wndcharm.FeatureSpacePrediction import FeatureSpaceClassification

    experiment = FeatureSpaceClassificationExperiment(
        training_set, training_set, feature_weights)

    for number_of_features_to_use in range(1, max_num_features + 1):
        three_timings = []
        # Take the best of 3
        for _ in range(3):
            # Time the creation and classification of a single signature
            t1 = time.time()
            reduced_fw = feature_weights.Threshold(number_of_features_to_use)
            sig = FeatureVector(
                source_filepath=test_image_path,
                feature_names=reduced_fw.feature_names).GenerateFeatures()
            reduced_ts = training_set.FeatureReduce(reduced_fw)
            sig.Normalize(reduced_ts)

            result = SingleSampleClassification.NewWND5(
                reduced_ts, reduced_fw, sig)
            result.Print()
            # FIXME: save intermediates just in case of interruption or parallization
            # result.PickleMe()
            t2 = time.time()
            three_timings.append(t2 - t1)

        timings.append(min(three_timings))

        # now, do a fit-on-fit test to measure classification accuracy;
        # reduced_ts / reduced_fw are the ones left by the last timing run
        split_result = FeatureSpaceClassification.NewWND5(
            reduced_ts, reduced_ts, reduced_fw)
        split_result.Print()
        experiment.individual_results.append(split_result)

    import matplotlib
    # Select the Agg backend before pyplot is imported below.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    x_vals = list(range(1, max_num_features + 1))

    self.figure = plt.figure()
    self.main_axes = self.figure.add_subplot(111)
    if chart_title is None:
        self.chart_title = "Feature timing v. classification accuracy"
    else:
        self.chart_title = chart_title
    self.main_axes.set_title(self.chart_title)
    self.main_axes.set_xlabel('Number of features')

    # Left axis: accuracy in percent, drawn in blue.
    self.main_axes.set_ylabel('Classification accuracy (%)', color='b')
    classification_accuracies = [
        split_result.classification_accuracy * 100
        for split_result in experiment.individual_results
    ]
    self.main_axes.plot(
        x_vals, classification_accuracies, color='b', linewidth=2)
    for tl in self.main_axes.get_yticklabels():
        tl.set_color('b')

    # Right axis (shares x with the left one): timings in seconds, in red.
    self.timing_axes = self.main_axes.twinx()
    self.timing_axes.set_ylabel('Time to calculate features (s)', color='r')
    self.timing_axes.plot(x_vals, timings, color='r')
    for tl in self.timing_axes.get_yticklabels():
        tl.set_color('r')
def test_FitOnFit(self):
    """Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
    auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
    300 samples per class.

    Fit-on-fit marginal probabilities computed here are compared, first
    without tiling and then with tiling, against reference HTML reports
    using assert_allclose with an absolute tolerance of 0.003.
    """
    import zipfile
    from numpy.testing import assert_allclose

    def _flatten_marginal_probs(results, round_like_report=False):
        # Concatenate the 3 marginal probabilities of every result into one
        # flat array, optionally rounding to the report's 3 decimal places.
        flat = np.empty(3 * len(results))
        for i, single_result in enumerate(results):
            probs = single_result.marginal_probabilities
            if round_like_report:
                probs = [float("{0:0.3f}".format(val)) for val in probs]
            flat[i * 3:(i + 1) * 3] = probs
        return flat

    zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'
    tempdir = mkdtemp()
    try:
        # Inflate the zipped test fit into the temp dir. Extraction happens
        # inside the try so the directory is removed even if it fails, and
        # the archive handle is closed as soon as it is no longer needed.
        with zipfile.ZipFile(zipped_file_path, mode='r') as zf:
            zf.extractall(tempdir)
            fitfilepath = tempdir + sep + zf.namelist()[0]

        # Do fit on fit WITHOUT tiling and compare with fit on fit results
        # generated with wndchrm 1.60
        fs = FeatureSpace.NewFromFitFile(fitfilepath).Normalize(
            inplace=True, quiet=True)
        fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold()
        fs.FeatureReduce(fw, inplace=True)
        pychrm_res = FeatureSpaceClassification.NewWND5(fs, fs, fw)
        pychrm_res.Print()

        self.maxDiff = None

        html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_900_samples_TRAINING_ERROR.html'
        wres = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
            html_path)
        wres.Print()
        # only 1 split in fit-on-fit
        wc_batch_result = wres.individual_results[0]

        # Comparing the result objects one by one takes WAY too long
        # (assertSequenceEqual), so compare flat probability arrays instead.
        wc_result = _flatten_marginal_probs(
            wc_batch_result.individual_results)
        # HTML report only has 3 decimal places
        pc_result = _flatten_marginal_probs(
            pychrm_res.individual_results, round_like_report=True)
        assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

        # ==========================================================
        # Now do the same with tiling, reusing fs from before:
        num_samples_per_group = 30
        # Floor division keeps n_groups an integer regardless of whether
        # true division is in effect.
        n_groups = fs.num_samples // num_samples_per_group
        # Each group id repeated once per sample in the group.
        new_sg_ids = [
            group_id
            for group_id in range(n_groups)
            for _ in range(num_samples_per_group)
        ]
        fs.Update(
            tile_num_rows=5, tile_num_cols=6,
            num_samples_per_group=num_samples_per_group,
            _contiguous_sample_group_ids=new_sg_ids)._RebuildViews()
        with_tile_pychrm_result = FeatureSpaceClassification.NewWND5(
            fs, fs, fw)

        html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_30_samples_tiled_TRAINING_ERROR.html'
        with_tile_wndchrm_result = \
            FeatureSpaceClassificationExperiment.NewFromHTMLReport(
                html_path).individual_results[0]

        # With tiling, the Python side's averaged results are compared with
        # the reference report's individual results.
        wc_result = _flatten_marginal_probs(
            with_tile_wndchrm_result.individual_results)
        # HTML report only has 3 decimal places
        pc_result = _flatten_marginal_probs(
            with_tile_pychrm_result.averaged_results, round_like_report=True)
        assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)
    finally:
        rmtree(tempdir)
else: test_set = get_featureset( testing_filename ) if write_intermediates: train_set.ToFitFile() if feature_usage_fraction: if feature_usage_fraction < 0 or feature_usage_fraction > 1.0: raise Exception('Feature usage fraction must be on interval [0,1]') num_features = int( feature_usage_fraction * train_set.num_features ) if num_features: print "Using top {0} Fisher-ranked features.".format( num_features ) else: print "Using top 15% Fisher-ranked features." experiment = FeatureSpaceClassificationExperiment( training_set=train_set ) train_set.Normalize( inplace=True ) weights = FisherFeatureWeights.NewFromFeatureSpace( train_set ).Threshold( num_features ) train_set.FeatureReduce( weights, inplace=True ) if train_set != test_set: test_set.FeatureReduce( weights, inplace=True ).Normalize( train_set ) for i in range( num_splits ): split = FeatureSpaceClassification.NewWND5( train_set, test_set, weights, batch_number=i ) experiment.individual_results.append( split ) if outpath: experiment.Print( output_filepath=outpath, mode='w' ) #experiment.PerSampleStatistics( output_filepath=outpath, mode= 'a' )