Beispiel #1
0
    def test_NewFromFileOfFiles( self ):
        """Pulls in the lymphoma eosin histology 5x6 tiled featureset via .sig files."""

        # Types of files containing features:
        # FIT: contains an entire FeatureSpace definition including features.
        # FOF: "File Of Files" containing a FeatureSpace structure definition only,
        #      listing paths to files of pre-calculated features (.sig) or the
        #      tiff images themselves so features can be calculated
        # SIG: A text file containing pre-calculated features for a single sample.

        # Test dataset: subset of the IICBU2008 lymphoma dataset. 2 channels (H+E),
        #    3 classes ('CLL', 'FL', 'MCL'), 10 images per class per channel,
        #    5x6 tiling grid = 30 samples per image resulting in 
        #    2 x 3 x 10 X 30 = 1800 total samples available

        # Files containing features included in this test suite:
        # 1. lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip:
        #    A zip archive containing a single FIT file with features pre-calculated.
        # 2. lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip:
        #    Contains 1800 SIG files, plus 4 FOF files (items 2-5 below):
        #       "lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv"
        #       "lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv"
        #       "lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv"
        #       "lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv"

        # List of possible feature sources:
        #    1. Single channel FIT (Eosin only)
        #    2. Single channel FOF (Eosin only) referencing to 30 tiffs (requires global sampling options -t5x6 -l to grab sigs)
        #    3. Single channel FOF (Eosin only) referencing 900 sig files
        #    4. Double channel FOF (Eosin+Haemotoxylin) referencing 60 tiffs (requires global sampling options -t5x6 -l to grab sigs)
        #    5. Double channel FOF (Eosin+Haemotoxylin) referencing 1800 sig files.

        #=============================================
        # BEGIN CODE TO CREATE TESTDATA ZIP PACKAGE

        #import zipfile
        #import zlib
        #path = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/TESTDATA_lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip'
        #zf = zipfile.ZipFile( path, mode='w' )
        #import os
        #classes = 'CLL', 'FL', 'MCL',
        #channels = 'haemotoxylin', 'eosin'
        #from collections import defaultdict
        #sig_tracker = defaultdict(int)
        #samplegroupid_tracker = {}
        #samplegroup_counter = 0
        #
        #eosin_tif_fof = [] # 30 lines
        #eosin_sig_fof = [] # 900 lines
        #double_tif_fof = [] # 30 lines, 2 feature set columns
        #double_sig_fof = [] # 900 lines, 2 feature set columns
        #
        #for _channel in channels:
        #    zf.write( './' + _channel, compress_type=zipfile.ZIP_DEFLATED )
        #    for _class in classes:
        #        zf.write( './' + _channel + '/' + _class, compress_type=zipfile.ZIP_DEFLATED )
        #        for root, dirs, files in os.walk( _channel + '/' + _class ):
        #            for _file in files:
        #                if _file.endswith( '.tif' ):
        #                    # Strip off the _H.tif or _E.tif
        #                    samplename = _file[:-6]
        #                    eosinpath = './eosin/' + _class + '/' + samplename + '_E.tif'
        #                    haemopath = './haemotoxylin/' + _class + '/' + samplename + '_H.tif'
        #                    if _channel == 'eosin':
        #                        eosin_tif_fof.append( eosinpath + '\t' + _class )
        #                        double_tif_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t}')
        #                elif _file.endswith( '.sig' ):
        #                    zf.write( './' + _channel + '/' + _class + '/' + _file, compress_type=zipfile.ZIP_DEFLATED )
        #                    if _channel == 'eosin':
        #                        # Strip off the _H-t5x6_0_0-l.sig
        #                        samplename = _file[:-17] + '.tif'
        #                        eosinpath = './eosin/' + _class + '/' + _file
        #                        haemopath = './haemotoxylin/' + _class + '/' + _file.replace( '_E-t5x6_', '_H-t5x6_' )
        #                        # count samples from 0:
        #                        samplesequenceid = str( sig_tracker[ samplename ] )
        #                        sig_tracker[ samplename ] += 1
        #                        if samplename not in samplegroupid_tracker:
        #                            samplegroupid_tracker[ samplename ] = samplegroup_counter
        #                            samplegroup_counter += 1
        #                        samplegroupid = str( samplegroupid_tracker[ samplename ] )
        #                        eosin_sig_fof.append( eosinpath + '\t' + _class )
        #                        double_sig_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t')
        #
        #fof_dir = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/'
        #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', 'w') as out:
        #    for _ in eosin_tif_fof:
        #        out.write( _ + '\n')
        #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', 'w') as out:
        #    for _ in eosin_sig_fof:
        #        out.write( _ + '\n')
        #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', 'w') as out:
        #    for _ in double_tif_fof:
        #        out.write( _ + '\n')
        #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', 'w') as out:
        #    for _ in double_sig_fof:
        #        out.write( _ + '\n')
        #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.printdir()
        #zf.close()

        # END CODE TO CREATE TESTDATA ZIP PACKAGE
        #=============================================

        # Inflate the zipped test fit into a temp file
        import zipfile
        
        zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip'
        zf1 = zipfile.ZipFile( zipped_file_path, mode='r' )
        tempdir = mkdtemp()
        zf1.extractall( tempdir )

        # for comparison:
        zf2 = zipfile.ZipFile( pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip', mode='r')
        zf2.extractall( tempdir )

        try:
            kwargs = {}
            kwargs['pathname'] = tempdir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv'
            kwargs['quiet'] = True
            # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird.
            kwargs['long'] = True
            kwargs['tile_num_rows'] = 6
            kwargs['tile_num_cols'] = 5
            fs_fof = FeatureSpace.NewFromFileOfFiles( **kwargs )

            kwargs['pathname'] = tempdir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2features.fit'
            fs_fit = FeatureSpace.NewFromFitFile( **kwargs )

            # Fit file has less significant figures than Signature files, and it's not
            # consistent how many there are. Seems like fit file just lops off numbers
            # at the end. Example: (signatures on top, fit on bottom)
            #
            # Example:
            # -  17.232246,  # sig
            # ?         --
            #
            # +  17.2322,    # fit
            # -  -63.549056, # sig
            # ?         ^^^
            #
            # +  -63.5491,   # fit
            # ?         ^
            #
            # -  223.786977, # sig
            # ?        ---
            #
            # +  223.787,    # fit

            # More of the same:
            #(Pdb) fs_fof.data_matrix[0,-5:]
            #array([   0.935442,   14.005003,  -43.562076,  127.394914,    0.628772])
            #(Pdb) fs_fit.data_matrix[0,-5:]
            #array([   0.935442,   14.005   ,  -43.5621  ,  127.395   ,    0.628772])

            # default is rtol=1e-07, atol=0
            #np.testing.assert_allclose( actual=fs_fit.data_matrix, desired=fs_fof.data_matrix,
            #        rtol=1e-03, atol=0 )
            #np.testing.assert_array_almost_equal_nulp( fs_fit.data_matrix, fs_fof.data_matrix )
            for row_num, (fit_row, fof_row) in enumerate( zip( fs_fit.data_matrix, fs_fof.data_matrix )):
                retval = compare( fit_row, fof_row )
                if retval == False:
                    print "error in sample row", row_num
                    print "FIT: ", fs_fit._contiguous_sample_names[row_num], "FOF", fs_fof._contiguous_sample_names[row_num]
                self.assertTrue( retval )


            # Test sorting; scramble the FOF then load and check:

            sorted_fof = tempdir + sep + \
                    'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv'

            with open( sorted_fof) as fof:
                lines = fof.readlines()

            from random import shuffle
            shuffle(lines)

            unsorted_fof = tempdir + sep + \
                    'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l_UNSORTED.fof.tsv'

            with open( unsorted_fof, 'w' ) as fof:
                for line in lines:
                    fof.write( line )

            kwargs = {}
            kwargs['pathname'] = unsorted_fof
            kwargs['quiet'] = True
            # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird.
            kwargs['long'] = True
            kwargs['tile_num_rows'] = 6
            kwargs['tile_num_cols'] = 5
            fs_fof = FeatureSpace.NewFromFileOfFiles( **kwargs )
            # Check again
            for row_num, (fit_row, fof_row) in enumerate( zip( fs_fit.data_matrix, fs_fof.data_matrix )):
                retval = compare( fit_row, fof_row )
                if retval == False:
                    print "error in sample row", row_num
                    print "FIT: ", fs_fit._contiguous_sample_names[row_num], "FOF", fs_fof._contiguous_sample_names[row_num]
                self.assertTrue( retval )

            # TESTING TAKE TILES:
            self.assertRaises( ValueError, fs_fof.TakeTiles, tuple() )
            self.assertRaises( ValueError, fs_fof.TakeTiles, (45, 46, 47,) )
            self.assertRaises( TypeError, fs_fof.TakeTiles, 'crap' )

            # take middle 4
            wanted_tiles = ( 14, 15, 20, 21 )

            took = fs_fof.TakeTiles( wanted_tiles, inplace=False )
            num_sample_groups = len( set( fs_fof._contiguous_sample_group_ids ) )
            self.assertEqual( took.num_samples_per_group, len( wanted_tiles ) )
            self.assertEqual( took.num_samples, len( wanted_tiles ) * num_sample_groups )

#            mid4 = 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_MIDDLE_4_TILES_t5x6-l.fof.tsv'
#            # fake out wndcharm by putting empty tiffs in the temp dir
#            # we don't need them, the sigs are in there already.
#            with open( mid4) as fof:
#                lines = fof.readlines()
#                names, classes, paths, opts = zip( *[ _.split('\t') for _ in lines ] )
#                for _path in paths:
#                    with open( tempdir + sep + _path, 'w' ):
#                        pass
#            took_via_fof = FeatureSpace.NewFromFileOfFiles( mid4, num_samples_per_group=4 )
#
#            for row_num, (fit_row, fof_row) in enumerate( zip( took.data_matrix, took_via_fof.data_matrix )):
#                retval = compare( fit_row, fof_row )
#                if retval == False:
#                    print "error in sample row", row_num
#                    print "FIT: ", took._contiguous_sample_names[row_num], "FOF", took_via_fof._contiguous_sample_names[row_num]
#                self.assertTrue( retval )


        finally:
            rmtree( tempdir )
Beispiel #2
0
# Parse the command line and unpack the options used by the rest of the script.
args = parser.parse_args()

input_filename = args.classifier_file_path[0]
outpath = args.output_filepath
num_splits = args.n
num_bins = args.b
dump_pickle = args.D

# Choose the FeatureSpace constructor from the input file's suffix, then load.
if input_filename.endswith( ".fit" ):
    load_feature_space = FeatureSpace.NewFromFitFile
elif input_filename.endswith( ".fit.pickled" ):
    load_feature_space = FeatureSpace.NewFromPickleFile
elif input_filename.endswith( ".fof" ):
    load_feature_space = FeatureSpace.NewFromFileOfFiles
else:
    raise Exception( 'The classifier must either end in .fit, .fit.pickled, or .fof' )
full_set = load_feature_space( input_filename )


# 'unset' means -D was not given at all; any other value requests a pickle.
if dump_pickle != 'unset':
    # A truthy value is the user-supplied pickle name; a falsy value means -D
    # was used as a bare flag, so PickleMe is called with no arguments and
    # uses its default name pattern.
    pickle_args = ( dump_pickle, ) if dump_pickle else ()
    full_set.PickleMe( *pickle_args )

# Number of features per bin, truncated to a whole number.
num_features = len( full_set.feature_names )
num_features_per_bin = int( float( num_features ) / float( num_bins ) )
bin_offset = 0
Beispiel #3
0
    def test_ParallelTiling(self):
        """Compare features calculated with n_jobs set against reference .sig files.

        Original author's summary: specify bounding box to FeatureVector, calc
        features, then compare with C++ implementation-calculated feats.

        The reference archive is extracted into one temp dir and the test image
        is copied into both that dir and a second, empty one. A one-line FOF is
        written for each copy, both are loaded with the same sampling options
        (long=True, 6 tile columns x 5 tile rows), and the two data matrices are
        compared row by row. Both temp dirs are removed afterwards.
        """

        import zipfile
        from shutil import copy
        from tempfile import NamedTemporaryFile

        refdir = mkdtemp(prefix='ref')
        targetdir = mkdtemp(prefix='target')

        try:
            reference_feats = pychrm_test_dir + sep + 'lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E_t6x5_REFERENCE_SIGFILES.zip'
            # Context manager so the archive handle is closed after extraction.
            with zipfile.ZipFile(reference_feats, mode='r') as zf:
                zf.extractall(refdir)

            img_filename = "lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E.tif"
            orig_img_filepath = pychrm_test_dir + sep + img_filename

            # copy the tiff to the tempdir so the .sig files end up there too
            copy(orig_img_filepath, targetdir)
            copy(orig_img_filepath, refdir)

            # Write one single-sample FOF per directory. delete=False keeps
            # each file on disk after the with-block so it can be loaded below;
            # the rmtree calls in the finally clause clean them up.
            with NamedTemporaryFile(mode='w',
                                    dir=refdir,
                                    prefix='ref',
                                    delete=False) as temp:
                ref_fof = temp.name
                temp.write('reference_samp\ttest_class\t{}\t{{}}\n'.format(
                    refdir + sep + img_filename))
            with NamedTemporaryFile(mode='w',
                                    dir=targetdir,
                                    prefix='target',
                                    delete=False) as temp:
                target_fof = temp.name
                temp.write(
                    'test_samp\ttest_class\t{}\t{{}}\n'.format(targetdir +
                                                               sep +
                                                               img_filename))

            global_sampling_options = \
                FeatureVector( long=True, tile_num_cols=6, tile_num_rows=5 )

            # Should just load reference sigs
            ref_fs = FeatureSpace.NewFromFileOfFiles(
                ref_fof,
                quiet=False,
                global_sampling_options=global_sampling_options)
            # NOTE(review): n_jobs is passed a bool rather than a count --
            # confirm this is what NewFromFileOfFiles expects.
            target_fs = FeatureSpace.NewFromFileOfFiles(
                target_fof,
                n_jobs=True,
                quiet=False,
                global_sampling_options=global_sampling_options)

            #from numpy.testing import assert_allclose
            #self.assertTrue( assert_allclose( ref_fs.data_matrix, target_fs.data_matrix ) )
            from wndcharm.utils import compare
            for row_num, (ref_row, test_row) in enumerate(
                    zip(ref_fs.data_matrix, target_fs.data_matrix)):
                retval = compare(ref_row, test_row)
                if not retval:
                    # Labels name the two feature spaces actually compared
                    # here (reference vs. target).
                    print("error in sample row %s" % row_num)
                    print("REF:  %s TARGET %s" % (
                        ref_fs._contiguous_sample_names[row_num],
                        target_fs._contiguous_sample_names[row_num]))
                self.assertTrue(retval)
        finally:
            rmtree(refdir)
            rmtree(targetdir)