Example #1
    def test_NumFeaturesGridSearch(self):

        fs_kwargs = {}
        fs_kwargs['name'] = "DISCRETE PerSampleStatistics WITH Pred Values"
        fs_kwargs['n_samples'] = n_samples = 250
        fs_kwargs['n_classes'] = 10
        fs_kwargs['num_features_per_signal_type'] = 10  # small on purpose, to make test fast
        fs_kwargs['noise_gradient'] = 5
        fs_kwargs['initial_noise_sigma'] = 75
        fs_kwargs['n_samples_per_group'] = 1
        fs_kwargs['random_state'] = 42
        fs_kwargs['interpolatable'] = True
        fs_kwargs['singularity'] = False
        fs_kwargs['clip'] = False
        fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

        ss_kwargs = {}
        ss_kwargs['feature_space'] = fs
        ss_kwargs['quiet'] = False
        ss_kwargs['n_iter'] = n_iter = 10
        ss_kwargs['random_state'] = 42
        ss_kwargs['conserve_mem'] = False  # otherwise the input fs will be modified

        FeatureSpaceClassificationExperiment.NumFeaturesGridSearch(**ss_kwargs)
        ss_kwargs['lda'] = True
        ss_kwargs['pre_lda_feature_filter'] = True
        FeatureSpaceClassificationExperiment.NumFeaturesGridSearch(**ss_kwargs)
Example #2
    def test_PerSampleStatisticsWITHPredictedValue(self):
        """DISCRETE PerSampleStatistics with numeric predicted value"""

        fs_kwargs = {}
        fs_kwargs['name'] = "DISCRETE PerSampleStatistics WITH Pred Values"
        fs_kwargs['n_samples'] = n_samples = 40
        fs_kwargs['n_classes'] = 2
        fs_kwargs['num_features_per_signal_type'] = 10  # small on purpose, to make test fast
        fs_kwargs['noise_gradient'] = 50
        fs_kwargs['initial_noise_sigma'] = 75
        fs_kwargs['n_samples_per_group'] = 1
        fs_kwargs['random_state'] = 42
        fs_kwargs['interpolatable'] = True
        fs_kwargs['singularity'] = False
        fs_kwargs['clip'] = False
        fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

        # Use case 1: Straight, classic WND-CHARM train/test splits
        ss_kwargs = {}
        ss_kwargs['name'] = "Discrete PerSampleStatistics ShuffleSplit WITH Pred Values"
        ss_kwargs['quiet'] = True
        ss_kwargs['n_iter'] = n_iter = 10
        ss_kwargs['train_size'] = train_size = 8  # per-class
        ss_kwargs['test_size'] = test_size = 2  # per-class
        ss_kwargs['random_state'] = 42
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            fs, **ss_kwargs)

        # Use case 2: Put LDA in pipeline (no fisher feature space prefilter, by default)
        ss_kwargs['lda'] = True
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            fs, **ss_kwargs)

        ## Use case 3: LDA AND Fisher feature space prefilter
        #ss_kwargs['pre_lda_feature_filter'] = True
        #exp = FeatureSpaceClassificationExperiment.NewShuffleSplit( fs, **ss_kwargs )

        ## Use case 4: LDA AND Fisher feature space prefilter, AND post-LDA dimension reduction
        #ss_kwargs['lda_features_size'] = 0.5
        #exp = FeatureSpaceClassificationExperiment.NewShuffleSplit( fs, **ss_kwargs )

        # Print calls self.GenerateStats()
        #from os import devnull
        exp.Print()  #output_stream=devnull )
        exp.PerSampleStatistics()  #output_stream=devnull )
        self.assertTrue(True)
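
The commented-out devnull hints in the snippet above suggest the report calls can be silenced by handing them an output stream. A minimal sketch of that, assuming Print() and PerSampleStatistics() accept an output_stream keyword as those comments imply (whether they expect a file object or a path is not confirmed here):

# Sketch only: suppress report output during tests, assuming an
# output_stream keyword as the commented-out hints above suggest.
from os import devnull

with open(devnull, 'w') as sink:
    exp.Print(output_stream=sink)                # stats are still generated
    exp.PerSampleStatistics(output_stream=sink)  # but nothing reaches stdout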
Example #3
    def test_FromDiscreteClassificationExperimentResults(self):
        """Rank Ordered Predicted values graph from an experiment result (multiple splits)"""

        testfilename = 'test_graph_rank_ordered_experiment.npy'

        # Make a smaller featureset to do multiple splits
        fs_kwargs = {}
        fs_kwargs['name'] = "DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT"
        fs_kwargs['n_samples'] = 100  # smaller
        fs_kwargs['n_classes'] = 5  # smaller, 20 samples per class
        fs_kwargs['num_features_per_signal_type'] = 10  # smaller
        fs_kwargs['initial_noise_sigma'] = 50
        fs_kwargs['noise_gradient'] = 20
        fs_kwargs['n_samples_per_group'] = 1
        fs_kwargs['interpolatable'] = True
        fs_kwargs['random_state'] = 42
        fs_kwargs['singularity'] = False
        fs_kwargs['clip'] = False

        small_fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

        ss_kwargs = {}
        ss_kwargs['quiet'] = True
        ss_kwargs['n_iter'] = n_iter = 10
        ss_kwargs['train_size'] = train_size = 18  # per-class
        ss_kwargs['test_size'] = test_size = 2  # per-class
        ss_kwargs['random_state'] = 42
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            small_fs, **ss_kwargs)
        graph = PredictedValuesGraph(exp, use_averaged_results=False)
        graph.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(graph, testfilename)
Example #4
    def test_FitOnFitClassification(self):

        fitfile_path = wndchrm_test_dir + sep + 'test-l.fit'
        #fs = FeatureSet.NewFromFitFile( fitfile_path )
        fs = FeatureSpace.NewFromFitFile(fitfile_path)
        fs.Normalize(inplace=True, quiet=True)
        fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold(438)
        fw.Print(50)
        fs.FeatureReduce(fw, inplace=True)
        pychrm_split = FeatureSpaceClassification.NewWND5(fs, fs, fw, quiet=False)

        from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment
        html_path = pychrm_test_dir + sep + 'test-l_training_error_result.html'
        html_exp = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
            html_path, quiet=False)
        # single split in this html
        html_split = html_exp.individual_results[0]
        for i, (html_result, pychrm_result) in enumerate( zip( html_split.individual_results,\
                pychrm_split.individual_results ) ):
            try:
                self.assertEqual(html_result, pychrm_result)
            except AssertionError:
                outstr = "Error in comparison # {0}:\n".format(i)
                outstr += "HTML result:\n{0}\n Python API res:\n{1}".format(
                    html_result, pychrm_result)
                print( outstr )
                raise
Example #5
    def test_PerSampleStatisticsWITHPredictedValue(self):
        """DISCRETE PerSampleStatistics with numeric predicted value"""

        fs_kwargs = {}
        fs_kwargs['name'] = "DISCRETE PerSampleStatistics WITH Pred Values"
        fs_kwargs['n_samples'] = n_samples = 40
        fs_kwargs['n_classes'] = 2
        fs_kwargs['num_features_per_signal_type'] = 10  # small on purpose, to make test fast
        fs_kwargs['noise_gradient'] = 50
        fs_kwargs['initial_noise_sigma'] = 75
        fs_kwargs['n_samples_per_group'] = 1
        fs_kwargs['random_state'] = 42
        fs_kwargs['interpolatable'] = True
        fs_kwargs['singularity'] = False
        fs_kwargs['clip'] = False
        fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

        ss_kwargs = {}
        ss_kwargs['name'] = "Discrete PerSampleStatistics ShuffleSplit WITH Pred Values"
        ss_kwargs['quiet'] = True
        ss_kwargs['n_iter'] = n_iter = 10
        ss_kwargs['train_size'] = train_size = 8  # per-class
        ss_kwargs['test_size'] = test_size = 2  # per-class
        ss_kwargs['random_state'] = 42
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            fs, **ss_kwargs)
        # Print calls self.GenerateStats()
        #from os import devnull
        exp.Print()  #output_stream=devnull )
        exp.PerSampleStatistics()  #output_stream=devnull )
        self.assertTrue(True)
Example #6
    def test_PerSampleStatisticsWITHOUTPredictedValue(self):
        """DISCRETE ShuffleSplit/PerSampleStatistics w/ no predicted value"""

        # CAN'T USE THIS, SINCE THE CLASS NAMES ARE INTERPOLATABLE
        # 2-class, 10 samples per class
        #fs = FeatureSet_Discrete.NewFromFitFile( '../wndchrm_tests/test-l.fit' )

        fs_kwargs = {}
        fs_kwargs['name'] = "DISCRETE PerSampleStatistics No Pred Values"
        fs_kwargs['n_samples'] = n_samples = 20
        fs_kwargs['n_classes'] = 2
        fs_kwargs['num_features_per_signal_type'] = 10  # small on purpose, to make test fast
        fs_kwargs['noise_gradient'] = 50
        fs_kwargs['initial_noise_sigma'] = 75
        fs_kwargs['n_samples_per_group'] = 1
        fs_kwargs['random_state'] = 42
        fs_kwargs['interpolatable'] = False
        fs_kwargs['singularity'] = False
        fs_kwargs['clip'] = False
        fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

        ss_kwargs = {}
        ss_kwargs['name'] = "Discrete PerSampleStatistics ShuffleSplit No Pred Values"
        ss_kwargs['quiet'] = True
        ss_kwargs['n_iter'] = n_iter = 1
        ss_kwargs['train_size'] = train_size = 8  # per-class
        ss_kwargs['test_size'] = test_size = 2  # per-class
        ss_kwargs['random_state'] = 42
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            fs, **ss_kwargs)

        ss_kwargs['lda'] = True
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            fs, **ss_kwargs)
        # Print calls self.GenerateStats()
        #from os import devnull
        exp.Print()  #output_stream=devnull )
        exp.PerSampleStatistics()  #output_stream=devnull )
        self.assertTrue(True)
Example #7
    def __init__( self, training_set, feature_weights, test_image_path,
        chart_title=None, max_num_features=300 ):

        self.timing_axes = None
        import time
        timings = []

        from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment
        from wndcharm.SingleSamplePrediction import SingleSampleClassification
        from wndcharm.FeatureSpacePrediction import FeatureSpaceClassification

        experiment = FeatureSpaceClassificationExperiment( training_set, training_set, feature_weights )
        for number_of_features_to_use in range( 1, max_num_features + 1 ):

            reduced_ts = None
            reduced_fw = None
            three_timings = []
            # Take the best of 3
            for timing in range( 3 ):
                # Time the creation and classification of a single signature
                t1 = time.time()
                reduced_fw = feature_weights.Threshold( number_of_features_to_use )
                sig = FeatureVector( source_filepath=test_image_path, feature_names=reduced_fw.feature_names ).GenerateFeatures()
                reduced_ts = training_set.FeatureReduce( reduced_fw )
                sig.Normalize( reduced_ts )
        
                result = SingleSampleClassification.NewWND5( reduced_ts, reduced_fw, sig )
                result.Print()
                # FIXME: save intermediates just in case of interruption or parallelization
                # result.PickleMe()
                t2 = time.time()
                three_timings.append( t2 - t1 )

            timings.append( min( three_timings ) )

            # now, do a fit-on-fit test to measure classification accuracy
            split_result = FeatureSpaceClassification.NewWND5( reduced_ts, reduced_ts, reduced_fw )
            split_result.Print()
            experiment.individual_results.append( split_result )

        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt

        x_vals = list( range( 1, max_num_features + 1 ) )

        self.figure = plt.figure()
        self.main_axes = self.figure.add_subplot(111)
        if chart_title is None:
            self.chart_title = "Feature timing v. classification accuracy"
        else:
            self.chart_title = chart_title
        self.main_axes.set_title( self.chart_title )
        self.main_axes.set_xlabel( 'Number of features' )
        self.main_axes.set_ylabel( 'Classification accuracy (%)', color='b' )
        classification_accuracies = \
          [ split_result.classification_accuracy * 100 for split_result in experiment.individual_results ]

        self.main_axes.plot( x_vals, classification_accuracies, color='b', linewidth=2 )
        for tl in self.main_axes.get_yticklabels():
            tl.set_color('b')    

        self.timing_axes = self.main_axes.twinx()
        self.timing_axes.set_ylabel( 'Time to calculate features (s)', color='r' )
        self.timing_axes.plot( x_vals, timings, color='r' )
        for tl in self.timing_axes.get_yticklabels():
            tl.set_color('r')    
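
A usage sketch for the constructor above. The owning class name is not shown in this snippet, so FeatureTimingVersusAccuracyGraph, train_set, fisher_weights, and the image path below are placeholders; the one firm detail is that the finished chart lives on self.figure, a standard matplotlib Figure that can be written to disk with savefig():

# Hypothetical names throughout -- substitute whatever class owns the __init__ above.
graph = FeatureTimingVersusAccuracyGraph(training_set=train_set,
                                         feature_weights=fisher_weights,
                                         test_image_path='sample_tile.tif',
                                         max_num_features=50)
graph.figure.savefig('timing_vs_accuracy.png', dpi=150)  # plain matplotlib Figure API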
Example #8
    def test_FitOnFit(self):
        """Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
        auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
        300 samples per class.
        """

        # Inflate the zipped test fit into a temp file
        import zipfile
        zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'
        zf = zipfile.ZipFile(zipped_file_path, mode='r')
        tempdir = mkdtemp()
        zf.extractall(tempdir)

        try:
            fitfilepath = tempdir + sep + zf.namelist()[0]

            # Do fit on fit WITHOUT tiling and compare with fit on fit results
            # generated with wndchrm 1.60
            fs = FeatureSpace.NewFromFitFile(fitfilepath).Normalize(
                inplace=True, quiet=True)
            #fs = FeatureSpace.NewFromFitFile( wndchrm_test_dir + sep + 'test-l.fit' )
            #fs.ToFitFile( 'temp.fit' )
            fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold()
            fs.FeatureReduce(fw, inplace=True)
            #            #fw.Print()
            #            #fs.Print(verbose=True)
            pychrm_res = FeatureSpaceClassification.NewWND5(fs, fs, fw)
            pychrm_res.Print()
            #
            #            import cProfile as pr
            #            #import profile as pr
            #            import tempfile
            #            import pstats
            #            prof = tempfile.NamedTemporaryFile()
            #            cmd = 'no_tile_pychrm_result = DiscreteBatchClassificationResult.New( reduced_fs, reduced_fs, fw )'
            #            pr.runctx( cmd, globals(), locals(), prof.name)
            #            p = pstats.Stats(prof.name)
            #            p.sort_stats('time').print_stats(20)
            #            prof.close()

            self.maxDiff = None

            html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_900_samples_TRAINING_ERROR.html'
            wres = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
                html_path)
            wres.Print()
            wc_batch_result = wres.individual_results[0]  # only 1 split in fit-on-fit

            # This takes WAY too long:
            #self.assertSequenceEqual( wc_batch_result.individual_results, pychrm_res.individual_results )
            wc_result = np.empty((3 * len(wc_batch_result.individual_results)))
            for i, single_result in enumerate(wc_batch_result.individual_results):
                wc_result[i*3 : (i+1)*3] = single_result.marginal_probabilities

            pc_result = np.empty((3 * len(pychrm_res.individual_results)))
            for i, single_result in enumerate(pychrm_res.individual_results):
                # HTML report only has 3 decimal places
                pc_result[ i*3 : (i+1)*3 ] = \
                    [ float( "{0:0.3f}".format( val ) ) for val in single_result.marginal_probabilities ]

            from numpy.testing import assert_allclose
            assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

            #wc_batch_result.Print()
            #pres.Print()

            # ==========================================================
            # Now do the same with tiling, reusing fs from before:

            num_samples_per_group = 30
            n_groups = fs.num_samples / num_samples_per_group
            new_sg_ids = [
                i for i in xrange(n_groups)
                for j in xrange(num_samples_per_group)
            ]
            fs.Update( tile_num_rows=5, tile_num_cols=6, num_samples_per_group=30,\
                    _contiguous_sample_group_ids=new_sg_ids )._RebuildViews()
            with_tile_pychrm_result = FeatureSpaceClassification.NewWND5(
                fs, fs, fw)
            html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_30_samples_tiled_TRAINING_ERROR.html'
            with_tile_wndchrm_result = \
              FeatureSpaceClassificationExperiment.NewFromHTMLReport( html_path ).individual_results[0]

            #self.assertSequenceEqual( with_tile_pychrm_result.averaged_results, with_tile_wndchrm_result.individual_results )
            wc_result = np.empty(
                (3 * len(with_tile_wndchrm_result.individual_results)))
            for i, single_result in enumerate(with_tile_wndchrm_result.individual_results):
                wc_result[i*3 : (i+1)*3] = single_result.marginal_probabilities

            pc_result = np.empty(
                (3 * len(with_tile_pychrm_result.averaged_results)))
            for i, single_result in enumerate(
                    with_tile_pychrm_result.averaged_results):
                # HTML report only has 3 decimal places
                pc_result[ i*3 : (i+1)*3 ] = \
                    [ float( "{0:0.3f}".format( val ) ) for val in single_result.marginal_probabilities ]

            assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

        finally:
            rmtree(tempdir)
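
To make the tiling bookkeeping above easier to follow, here is the same _contiguous_sample_group_ids construction run on toy numbers (illustration only, not part of the test):

# 3 image groups, 2 tiles per group -- every tile of an image shares one group id.
num_samples_per_group = 2
n_groups = 3
new_sg_ids = [i for i in range(n_groups) for j in range(num_samples_per_group)]
print(new_sg_ids)  # [0, 0, 1, 1, 2, 2]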
Example #9
else:
    test_set = get_featureset( testing_filename )
    if write_intermediates:
        train_set.ToFitFile()

if feature_usage_fraction:
    if feature_usage_fraction < 0 or feature_usage_fraction > 1.0:
        raise Exception('Feature usage fraction must be on interval [0,1]')
    num_features = int( feature_usage_fraction * train_set.num_features )

if num_features:
    print "Using top {0} Fisher-ranked features.".format( num_features )
else:
    print "Using top 15% Fisher-ranked features."

experiment = FeatureSpaceClassificationExperiment( training_set=train_set )

train_set.Normalize( inplace=True )
weights = FisherFeatureWeights.NewFromFeatureSpace( train_set ).Threshold( num_features )
train_set.FeatureReduce( weights, inplace=True )

if train_set != test_set:
    test_set.FeatureReduce( weights, inplace=True ).Normalize( train_set )

for i in range( num_splits ):
    split = FeatureSpaceClassification.NewWND5( train_set, test_set, weights, batch_number=i )
    experiment.individual_results.append( split )

if outpath:
    experiment.Print( output_filepath=outpath, mode='w' )
    #experiment.PerSampleStatistics( output_filepath=outpath, mode= 'a' )