def test_NewFromFeatureSet(self):
    """Fisher score calculation, plus the slice operator on the weights.

    Computes Fisher weights from the normalized feature space loaded from
    ``self.test_fit_path`` and compares them, value by value, against the
    reference weights stored in ``self.test_feat_weight_path`` using an
    absolute tolerance of ``self.epsilon``. Then checks that slicing the
    weights object keeps feature names and values aligned with the
    original.
    """
    fs = FeatureSpace.NewFromFitFile(self.test_fit_path).Normalize(inplace=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(fs)

    # test weights generated from test-l.fit:
    # wndchrm classify -l -f1.0 -vtest_fit-l.weights test-l.fit test-l.fit
    target_fw = FisherFeatureWeights.NewFromFile(self.test_feat_weight_path)

    for target_val, res_val in zip(target_fw.values, fw.values):
        self.assertAlmostEqual(target_val, res_val, delta=self.epsilon)

    # test slice operator: simple prefix slice
    sliced = fw[:10]
    self.assertEqual(len(sliced), 10)
    self.assertEqual(len(sliced.feature_names), 10)
    self.assertEqual(len(sliced.values), 10)
    # range() instead of xrange() so the loop runs on Python 2 and 3 alike.
    for i in range(10):
        self.assertEqual(sliced.feature_names[i], fw.feature_names[i])
        self.assertEqual(sliced.values[i], fw.values[i])

    # test slice operator: extended slice with a step
    sliced = fw[50:100:2]
    for i, j in zip(range(len(sliced)), range(50, 100, 2)):
        self.assertEqual(sliced.feature_names[i], fw.feature_names[j])
        self.assertEqual(sliced.values[i], fw.values[j])
def test_NewFromFeatureSet(self):
    """Fisher score calculation"""
    # Load the fit file and normalize it in place before scoring.
    space = FeatureSpace.NewFromFitFile(self.test_fit_path)
    space.Normalize(inplace=True)
    computed = FisherFeatureWeights.NewFromFeatureSpace(space)

    # test weights generated from test-l.fit:
    # wndchrm classify -l -f1.0 -vtest_fit-l.weights test-l.fit test-l.fit
    reference = FisherFeatureWeights.NewFromFile(self.test_feat_weight_path)

    # Pairwise comparison of reference vs. freshly computed weights.
    value_pairs = zip(reference.values, computed.values)
    for expected, actual in value_pairs:
        self.assertAlmostEqual(expected, actual, delta=self.epsilon)
def test_DiscreteTrainTestSplitWithTiling( self ):
    """Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
    auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
    300 samples per class.

    Smoke test: splits the tiled feature space into train/test sets with a
    seeded random state, normalizes the training set, reduces both sets to
    the thresholded Fisher features and normalizes the test set against
    the training set. Passes if none of these steps raises.
    """
    import zipfile
    from numpy.random import RandomState

    zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

    # Create the temp dir first so that the finally clause cleans it up
    # even if opening or extracting the archive fails.
    tempdir = mkdtemp()
    try:
        # Inflate the zipped test fit into the temp dir; the context manager
        # guarantees the archive handle is closed.
        with zipfile.ZipFile( zipped_file_path, mode='r' ) as zf:
            zf.extractall( tempdir )
            fitfilepath = tempdir + sep + zf.namelist()[0]

        fs = FeatureSpace.NewFromFitFile( fitfilepath, tile_num_rows=5, tile_num_cols=6 )

        prng = RandomState(42)
        train, test = fs.Split( random_state=prng, quiet=True )
        train.Normalize( inplace=True, quiet=True )
        fw = FisherFeatureWeights.NewFromFeatureSpace( train ).Threshold()
        train.FeatureReduce( fw, inplace=True )
        test.FeatureReduce( fw, inplace=True ).Normalize( train, inplace=True, quiet=True )
    finally:
        rmtree( tempdir )
def test_FitOnFitClassification(self):
    """Compare fit-on-fit WND5 results against a reference HTML report.

    Classifies the normalized, feature-reduced ``test-l.fit`` feature
    space against itself and checks each per-sample result for equality
    with the corresponding result parsed from
    ``test-l_training_error_result.html``.
    """
    from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment

    fitfile_path = wndchrm_test_dir + sep + 'test-l.fit'
    fs = FeatureSpace.NewFromFitFile(fitfile_path)
    fs.Normalize(inplace=True, quiet=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold(438)
    fw.Print(50)
    fs.FeatureReduce(fw, inplace=True)
    pychrm_split = FeatureSpaceClassification.NewWND5(fs, fs, fw, quiet=False)

    html_path = pychrm_test_dir + sep + 'test-l_training_error_result.html'
    html_exp = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
        html_path, quiet=False)
    # single split in this html
    html_split = html_exp.individual_results[0]

    result_pairs = zip(html_split.individual_results,
                       pychrm_split.individual_results)
    for i, (html_result, pychrm_result) in enumerate(result_pairs):
        try:
            self.assertEqual(html_result, pychrm_result)
        except Exception:
            # Show which comparison failed and both operands, then let the
            # original exception propagate unchanged. Previously this
            # message was built but never emitted.
            outstr = "Error in comparison # {0}:\n".format(i)
            outstr += "HTML result:\n{0}\n Python API res:\n{1}".format(
                html_result, pychrm_result)
            print(outstr)
            raise
def test_WND5_all_features(self):
    """Classify one precalculated sample with WND5 at three feature counts.

    For each feature count, the marginal probabilities rounded to three
    decimals must match the reference values listed below.
    """
    tolerance = 0.00001

    # Paths to the original files
    sig_path = join(test_dir, 't1_s01_c05_ij-l_precalculated.sig')
    fit_path = join(test_dir, 'test-l.fit')
    weights_path = join(test_dir, 'test_fit-l.weights')
    tif_path = join(test_dir, 't1_s01_c05_ij.tif')

    # Here are the correct values that Python API needs to return:
    # wndchrm classify -l -f0.75 test-l.fit t1_s01_c05_ij.tif
    # t1_s01_c05_ij.tif 1.6e-27 0.083 0.917 * 4cell 3.835
    # wndchrm classify -l test-l.fit t1_s01_c05_ij.tif
    # t1_s01_c05_ij.tif 3.19e-27 0.076 0.924 * 4cell 3.848
    # wndchrm classify -l -f0.05 test-l.fit t1_s01_c05_ij.tif
    # t1_s01_c05_ij.tif 1.06e-26 0.066 0.934 * 4cell 3.869
    expected_by_num_feats = {
        2189: [0.083, 0.917],
        438: [0.076, 0.924],
        146: [0.066, 0.934],
    }

    # Normalizing must leave the feature names untouched.
    training = FeatureSpace.NewFromFitFile(fit_path)
    names_before = training.feature_names
    training.Normalize()
    names_after = training.feature_names
    self.assertSequenceEqual(names_before, names_after)

    # The sample must carry the same features as the training set.
    unknown = FeatureVector(source_filepath=tif_path, long=True)
    unknown.LoadSigFile(sig_path)
    self.assertSequenceEqual(training.feature_names, unknown.feature_names)
    unknown.Normalize(training)

    full_weights = FisherFeatureWeights.NewFromFile(weights_path)

    for num_feats, expected_probs in expected_by_num_feats.items():
        top_weights = full_weights.Threshold(num_feats)
        reduced_training = training.FeatureReduce(top_weights)
        reduced_unknown = unknown.FeatureReduce(top_weights)
        outcome = SingleSampleClassification.NewWND5(
            reduced_training, top_weights, reduced_unknown)
        rounded_probs = [round(p, 3) for p in outcome.marginal_probabilities]
        for want, got in zip(expected_probs, rounded_probs):
            self.assertAlmostEqual(want, got, delta=tolerance)
def test_DiscreteTrainTestSplitNoTiling( self ):
    """Uses binucleate test set"""
    from numpy.random import RandomState

    # Load the untiled fit file and split it with a seeded generator.
    source_path = wndchrm_test_dir + sep + 'test-l.fit'
    whole_space = FeatureSpace.NewFromFitFile( source_path )
    seeded_rng = RandomState(42)
    split_pair = whole_space.Split( random_state=seeded_rng, quiet=True )
    full_train, full_test = split_pair

    # Rank features on the training portion only.
    full_train.Normalize( quiet=True )
    all_weights = FisherFeatureWeights.NewFromFeatureSpace( full_train )
    kept_weights = all_weights.Threshold()

    # Reduce both portions to the kept features, then normalize the test
    # portion against the reduced training portion.
    small_train = full_train.FeatureReduce( kept_weights )
    small_test = full_test.FeatureReduce( kept_weights )
    small_test.Normalize( small_train, quiet=True )

    # Smoke test: the classification only needs to complete without raising.
    classification = FeatureSpaceClassification.NewWND5(
        small_train, small_test, kept_weights, quiet=True )
def test_IfNotInterpolatable( self ):
    """You can't graph predicted values if the classes aren't interpolatable."""
    testfilename = 'ShouldntBeGraphable.png'  # not referenced below

    # Tiny two-class feature space explicitly flagged as non-interpolatable.
    fs_options = dict( n_samples=20, n_classes=2, random_state=42, interpolatable=False )
    small_fs = CreateArtificialFeatureSpace_Discrete( **fs_options )

    training, held_out = small_fs.Split( random_state=False, quiet=True )
    training.Normalize()
    weights = FisherFeatureWeights.NewFromFeatureSpace( training ).Threshold()
    reduced_training = training.FeatureReduce( weights )
    reduced_held_out = held_out.FeatureReduce( weights )
    held_out.Normalize( training, quiet=True )

    classification = FeatureSpaceClassification.NewWND5(
        reduced_training, reduced_held_out, weights, quiet=True )

    # Building the graph object itself is expected to raise ValueError.
    self.assertRaises( ValueError, PredictedValuesGraph, classification )
def test_TiledTrainTestSplit(self):
    """Uses a fake FeatureSpace

    Builds an artificial 10-class feature space with 4 samples per group,
    runs a train/test split through normalization, Fisher feature
    reduction and WND5 classification, and asserts that every diagonal
    entry of the resulting similarity matrix equals 1.0.
    """
    from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

    fs_kwargs = {
        'name': "DiscreteArtificialFS 10-class",
        'n_samples': 1000,
        'n_classes': 10,  # 100 samples per class
        'num_features_per_signal_type': 25,
        'initial_noise_sigma': 40,
        'noise_gradient': 20,
        'n_samples_per_group': 4,  # 25 images, 2x2 tiling scheme
        'interpolatable': True,
        'random_state': 43,
        'singularity': False,
        'clip': False,
    }
    fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

    train, test = fs.Split(random_state=False, quiet=True)
    train.Normalize(inplace=True, quiet=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(train).Threshold()
    train.FeatureReduce(fw, inplace=True)
    test.FeatureReduce(fw, inplace=True, quiet=True).Normalize(
        train, inplace=True, quiet=True)

    result = FeatureSpaceClassification.NewWND5(train, test, fw)
    result.Print()

    for class_name in result.test_set.class_names:
        # Look the value up once so the diagnostic reports exactly what
        # was compared.
        self_similarity = result.similarity_matrix[class_name][class_name]
        try:
            self.assertEqual(self_similarity, 1.0)
        except AssertionError:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("offending class: {0}, val: {1}".format(
                class_name, self_similarity))
            raise
class TestGraphs(unittest.TestCase):
    """Test WND-CHARM's graph-making functionality."""

    # Shared fixture: everything below runs once, when the class body is
    # executed, and is reused by the tests through self.batch_result.
    fs_kwargs = {
        'name': "DiscreteArtificialFS 10-class",
        'n_samples': 1000,
        'n_classes': 10,
        'num_features_per_signal_type': 25,
        'initial_noise_sigma': 40,
        'noise_gradient': 20,
        'n_samples_per_group': 1,
        'interpolatable': True,
        'random_state': 43,
        'singularity': False,
        'clip': False,
    }

    fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)
    train_set, test_set = fs.Split(random_state=False, quiet=True)
    train_set.Normalize(quiet=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(train_set).Threshold()
    reduced_train_set = train_set.FeatureReduce(fw)
    reduced_test_set = test_set.FeatureReduce(fw)
    reduced_test_set.Normalize(reduced_train_set, quiet=True)
    batch_result = FeatureSpaceClassification.NewWND5(
        reduced_train_set, reduced_test_set, fw, quiet=True)

    def setUp(self):
        """Create a scratch directory for the test."""
        self.tempdir = mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created in setUp."""
        rmtree(self.tempdir)

    @staticmethod
    def _new_small_feature_space():
        """Return the small 5-class artificial feature space used for multi-split tests."""
        small_fs_kwargs = {
            'name': "DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
            'n_samples': 100,  # smaller
            'n_classes': 5,  # smaller, 20 samples per class
            'num_features_per_signal_type': 10,  # smaller
            'initial_noise_sigma': 50,
            'noise_gradient': 20,
            'n_samples_per_group': 1,
            'interpolatable': True,
            'random_state': 42,
            'singularity': False,
            'clip': False,
        }
        return CreateArtificialFeatureSpace_Discrete(**small_fs_kwargs)

    def CompareGraphs(self, graph, testfilename):
        """Helper function to check output graphs.

        Args:
            graph: object exposing a matplotlib figure as ``graph.figure``.
            testfilename: path of the reference ``.npy`` file, passed
                as-is to ``np.load``.

        Fails the test if the current axes contain neither lines nor
        collections, or if the stacked coordinates are neither equal nor
        close (``np.allclose``) to the reference array.
        """
        # Uncomment to see what graph looks like!
        #graph.SaveToFile( testfilename + 'GRAPH.png' )

        # We used to output the graphs to a png file and do a binary diff on
        # a reference png, but there are superficial differences between
        # matplotlib versions that result in the points still being in the
        # right place, but the font is slightly larger, or the text is subtly
        # offset. So now, we interrogate the matplotlib.figure object and
        # retrieve its coordinates and check them against blessed numpy
        # arrays saved to a npy file.
        axessubplot = graph.figure.gca()

        if len(axessubplot.lines) > 0:
            # line plot
            try:
                all_coords = np.dstack(
                    tuple([group._path._vertices for group in axessubplot.lines]))
            except AttributeError:
                # older version of matplotlib didn't include leading
                # underscore in attribute "_vertices"
                all_coords = np.dstack(
                    tuple([group._path.vertices for group in axessubplot.lines]))
        elif len(axessubplot.collections) > 0:
            # scatter plot
            all_coords = np.dstack(
                tuple([group._offsets for group in axessubplot.collections]))
        else:
            self.fail("Graph doesn't have any lines nor points")

        # To bless new reference coordinates, temporarily add:
        #   np.save( testfilename, all_coords )

        reference_array = np.load(testfilename)
        # Exact equality is tried first; np.allclose is only the fallback.
        if (not np.array_equal(all_coords, reference_array)
                and not np.allclose(all_coords, reference_array)):
            errmsg = 'Reference graph "{0}" coordinates '.format(testfilename) + \
                'do not concur with coordinates generated by this test.'
            self.fail(errmsg)

    @unittest.skipIf(HasMatplotlib, "Skipped if matplotlib IS installed")
    def test_ErrMsgIfMatplotibNotInstalled(self):
        """Fail gracefully with informative message if matplotlib is not installed"""
        graph = PredictedValuesGraph(self.batch_result)
        with self.assertRaises(ImportError):
            graph.RankOrderedPredictedValuesGraph()
        with self.assertRaises(ImportError):
            graph.KernelSmoothedDensityGraph()

    @unittest.skipUnless(HasMatplotlib, "Skipped if matplotlib IS NOT installed")
    @unittest.expectedFailure
    def test_RankOrderedFromBatchClassificationResult(self):
        """Rank Ordered Predicted values graph from a single split"""
        testfilename = 'test_graph_rank_ordered_interpolated_discrete.npy'
        graph = PredictedValuesGraph(self.batch_result)
        graph.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(graph, testfilename)

    @unittest.skipUnless(HasMatplotlib, "Skipped if matplotlib IS NOT installed")
    @unittest.expectedFailure
    def test_KernelSmoothedFromBatchClassificationResult(self):
        """Kernel Smoothed Probability density graph from a single split"""
        testfilename = 'test_graph_kernel_smoothed.npy'
        graph = PredictedValuesGraph(self.batch_result)
        graph.KernelSmoothedDensityGraph()
        self.CompareGraphs(graph, testfilename)

    @unittest.skipUnless(HasMatplotlib, "Skipped if matplotlib IS NOT installed")
    def test_FromDiscreteClassificationExperimentResults(self):
        """Rank Ordered Predicted values graph from an experiment result (multiple splits)"""
        testfilename = 'test_graph_rank_ordered_experiment.npy'

        # Make a smaller featureset to do multiple splits
        small_fs = self._new_small_feature_space()

        ss_kwargs = {
            'quiet': True,
            'n_iter': 10,
            'train_size': 18,  # per-class
            'test_size': 2,  # per-class
            'random_state': 42,
        }
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            small_fs, **ss_kwargs)
        graph = PredictedValuesGraph(exp, use_averaged_results=False)
        graph.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(graph, testfilename)

    @unittest.skipUnless(HasMatplotlib, "Skipped if matplotlib IS NOT installed")
    def test_HyperparameterOptimizationGraph(self):
        """Accuracy vs. # features or samples with and without LDA feature space transform

        Smoke test only: both grid searches must run without raising; no
        reference coordinates are compared.
        """
        # Make a smaller featureset to do multiple splits
        small_fs = self._new_small_feature_space()

        ss_kwargs = {
            'quiet': False,
            'n_iter': 10,
            'train_size': 18,  # per-class
            'test_size': 2,  # per-class
            'random_state': 42,
            'show_raw': True,
            'show_lda': True,
            'param': 'features',
            'text_angle': -30,
        }

        graph = HyperparameterOptimizationGraph(small_fs)
        graph.GridSearch(**ss_kwargs)

        # Same search again, this time varying the number of samples.
        ss_kwargs['param'] = 'samples'
        graph = HyperparameterOptimizationGraph(small_fs)
        graph.GridSearch(**ss_kwargs)

    @unittest.skipUnless(HasMatplotlib, "Skipped if matplotlib IS NOT installed")
    def test_FromHTML(self):
        """Rank Ordered Predicted values graph from a zipped HTML report"""
        import zipfile

        testfilename = 'test_graph_fromHTML.npy'

        # Inflate the zipped html file into the per-test temp dir; the
        # context manager guarantees the archive handle is closed.
        zipped_file_path = pychrm_test_dir + sep + 'c_elegans_terminal_bulb.html.zip'
        with zipfile.ZipFile(zipped_file_path, mode='r') as zf:
            zf.extractall(self.tempdir)
            htmlfilepath = self.tempdir + sep + zf.namelist()[0]

        graph = PredictedValuesGraph.NewFromHTMLReport(
            htmlfilepath, use_averaged_results=False)
        graph.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(graph, testfilename)

    @unittest.skipUnless(HasMatplotlib, "Skipped if matplotlib IS NOT installed")
    def test_IfNotInterpolatable(self):
        """You can't graph predicted values if the classes aren't interpolatable."""
        small_fs = CreateArtificialFeatureSpace_Discrete(
            n_samples=20, n_classes=2, random_state=42, interpolatable=False)
        train_set, test_set = small_fs.Split(random_state=False, quiet=True)
        train_set.Normalize()
        fw = FisherFeatureWeights.NewFromFeatureSpace(train_set).Threshold()
        reduced_train_set = train_set.FeatureReduce(fw)
        reduced_test_set = test_set.FeatureReduce(fw)
        # NOTE(review): this normalizes the unreduced test_set, whereas the
        # class-level fixture normalizes reduced_test_set against
        # reduced_train_set - confirm which one is intended.
        test_set.Normalize(train_set, quiet=True)
        batch_result = FeatureSpaceClassification.NewWND5(
            reduced_train_set, reduced_test_set, fw, quiet=True)
        with self.assertRaises(ValueError):
            PredictedValuesGraph(batch_result)
def test_FitOnFit(self):
    """Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
    auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
    300 samples per class.

    Runs fit-on-fit WND5 classification twice - first without tiling,
    then with 30 samples per group - and compares the marginal
    probabilities against reference HTML reports with an absolute
    tolerance of 0.003.
    """
    import zipfile
    from numpy.testing import assert_allclose

    def flatten_marg_probs(results, round_to_report_precision):
        """Pack the 3 marginal probabilities of each result into one flat array."""
        n_classes = 3
        flat = np.empty(n_classes * len(results))
        for i, single_result in enumerate(results):
            probs = single_result.marginal_probabilities
            if round_to_report_precision:
                # HTML report only has 3 decimal places
                probs = [float("{0:0.3f}".format(val)) for val in probs]
            flat[i * n_classes:(i + 1) * n_classes] = probs
        return flat

    zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

    # Create the temp dir first so that the finally clause cleans it up
    # even if opening or extracting the archive fails.
    tempdir = mkdtemp()
    try:
        # Inflate the zipped test fit into the temp dir; the context manager
        # guarantees the archive handle is closed.
        with zipfile.ZipFile(zipped_file_path, mode='r') as zf:
            zf.extractall(tempdir)
            fitfilepath = tempdir + sep + zf.namelist()[0]

        # Do fit on fit WITHOUT tiling and compare with fit on fit results
        # generated with wndchrm 1.60
        fs = FeatureSpace.NewFromFitFile(fitfilepath).Normalize(
            inplace=True, quiet=True)
        fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold()
        fs.FeatureReduce(fw, inplace=True)
        pychrm_res = FeatureSpaceClassification.NewWND5(fs, fs, fw)
        pychrm_res.Print()

        self.maxDiff = None

        html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_900_samples_TRAINING_ERROR.html'
        wres = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
            html_path)
        wres.Print()
        wc_batch_result = wres.individual_results[0]  # only 1 split in fit-on-fit

        # Comparing the result objects directly with assertSequenceEqual
        # takes WAY too long, so compare flat probability arrays instead.
        wc_result = flatten_marg_probs(
            wc_batch_result.individual_results, False)
        pc_result = flatten_marg_probs(
            pychrm_res.individual_results, True)
        assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

        # ==========================================================
        # Now do the same with tiling, reusing fs from before:
        num_samples_per_group = 30
        # Floor division keeps n_groups an int on Python 2 and 3 alike.
        n_groups = fs.num_samples // num_samples_per_group
        new_sg_ids = [
            i for i in range(n_groups)
            for _ in range(num_samples_per_group)
        ]
        fs.Update(tile_num_rows=5, tile_num_cols=6,
                  num_samples_per_group=num_samples_per_group,
                  _contiguous_sample_group_ids=new_sg_ids)._RebuildViews()
        with_tile_pychrm_result = FeatureSpaceClassification.NewWND5(
            fs, fs, fw)

        html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_30_samples_tiled_TRAINING_ERROR.html'
        with_tile_wndchrm_result = \
            FeatureSpaceClassificationExperiment.NewFromHTMLReport(
                html_path).individual_results[0]

        # The reference's per-sample results are compared against the
        # Python API's averaged results.
        wc_result = flatten_marg_probs(
            with_tile_wndchrm_result.individual_results, False)
        pc_result = flatten_marg_probs(
            with_tile_pychrm_result.averaged_results, True)
        assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)
    finally:
        rmtree(tempdir)
train_set.ToFitFile() if feature_usage_fraction: if feature_usage_fraction < 0 or feature_usage_fraction > 1.0: raise Exception('Feature usage fraction must be on interval [0,1]') num_features = int( feature_usage_fraction * train_set.num_features ) if num_features: print "Using top {0} Fisher-ranked features.".format( num_features ) else: print "Using top 15% Fisher-ranked features." experiment = FeatureSpaceClassificationExperiment( training_set=train_set ) train_set.Normalize( inplace=True ) weights = FisherFeatureWeights.NewFromFeatureSpace( train_set ).Threshold( num_features ) train_set.FeatureReduce( weights, inplace=True ) if train_set != test_set: test_set.FeatureReduce( weights, inplace=True ).Normalize( train_set ) for i in range( num_splits ): split = FeatureSpaceClassification.NewWND5( train_set, test_set, weights, batch_number=i ) experiment.individual_results.append( split ) if outpath: experiment.Print( output_filepath=outpath, mode='w' ) #experiment.PerSampleStatistics( output_filepath=outpath, mode= 'a' ) else: experiment.Print() #experiment.PerSampleStatistics()
sys.exit(0) if from_scratch: # I preprocessed your training set and feature weights and pickled them for speed. # Pickle files are binary files that are super fast to load. # You don't need to use a pickle file though, you can make one from scratch # Here's how: # 1. Load the raw c-charm fit file full_training_set = FeatureSet_Discrete.NewFromFitFile(input_filename) # 3. Normalize the features: full_training_set.Normalize() # 4. Make Fisher scores based on the normalized training set full_fisher_weights = FisherFeatureWeights.NewFromFeatureSet( full_training_set) # 5. Take only the top 200 features reduced_fisher_weights = full_fisher_weights.Threshold(num_features) # 6. Reduce the training set feature space to contain only those top 200 features reduced_training_set = full_training_set.FeatureReduce( reduced_fisher_weights.names) # 7. Save your work: reduced_training_set.PickleMe( os.path.splitext(input_filename)[0] + ".fit.pickled") reduced_fisher_weights.PickleMe( os.path.splitext(input_filename)[0] + "_w" + str(num_features) + ".weights.pickled") else: