def test_train_test_split(self):
    """Check the outputs of `TrainingSelector.train_test_split`.

    Verifies the length of the descriptor feature matrix, the columns
    present in the split for the default call and for
    `method='cluster-I'`, and the types returned with `return_evr=True`.
    """
    selector = dtr.TrainingSelector(img_filepaths=self.img_filepaths)

    # the feature matrix is expected to have one entry per input image
    feature_matrix = selector.descr_feature_matrix
    self.assertEqual(len(feature_matrix), len(self.img_filepaths))

    # `method` argument: the default call is expected to provide both the
    # 'train' and 'img_cluster' columns...
    default_split = selector.train_test_split()
    for column in ('train', 'img_cluster'):
        self.assertIn(column, default_split)

    # ...whereas 'cluster-I' is expected to provide 'train' only
    cluster_i_split = selector.train_test_split(method='cluster-I')
    self.assertIn('train', cluster_i_split)
    self.assertNotIn('img_cluster', cluster_i_split)

    # `return_evr` argument: the call returns a pair made of the split data
    # frame and the variance ratio as a float
    split_with_evr, variance_ratio = selector.train_test_split(
        return_evr=True)
    self.assertIsInstance(split_with_evr, pd.DataFrame)
    self.assertIsInstance(variance_ratio, float)
def test_init(self):
    """Check how `TrainingSelector.__init__` resolves its arguments."""
    # when `img_filepaths` is provided, `img_dir` and
    # `img_filename_pattern` must not alter the resulting `img_filepaths`
    reference = dtr.TrainingSelector(img_filepaths=self.img_filepaths)
    ignored_kws_variants = (
        {'img_dir': 'foo'},
        {'img_filename_pattern': 'foo'},
        {'img_dir': 'foo', 'img_filename_pattern': 'foo'},
    )
    for ignored_kws in ignored_kws_variants:
        self.assertEqual(
            reference.img_filepaths,
            dtr.TrainingSelector(img_filepaths=self.img_filepaths,
                                 **ignored_kws).img_filepaths)

    # without `img_filepaths`, an inexistent `img_dir` or a non-tif
    # `img_filename_pattern` must yield an empty `img_filepaths` attribute
    from_missing_dir = dtr.TrainingSelector(img_dir='foo')
    self.assertEqual(len(from_missing_dir.img_filepaths), 0)

    from_bad_pattern = dtr.TrainingSelector(img_dir=self.img_dir,
                                            img_filename_pattern='foo')
    self.assertEqual(len(from_bad_pattern.img_filepaths), 0)

    # a valid `img_dir` alone must find at least one image
    from_valid_dir = dtr.TrainingSelector(img_dir=self.img_dir)
    self.assertGreater(len(from_valid_dir.img_filepaths), 0)

    # an integer passed as `gabor_num_orientations` must be stored as a
    # tuple once `__init__` has run
    with_int_orientations = dtr.TrainingSelector(self.img_filepaths,
                                                 gabor_num_orientations=8)
    self.assertIsInstance(with_int_orientations.gabor_num_orientations,
                          tuple)
def train_test_split(ctx, img_filepaths, img_dir, img_filename_pattern,
                     gabor_frequencies, gabor_num_orientations,
                     response_bins_per_axis, num_color_bins, method,
                     num_components, num_img_clusters, train_prop,
                     output_filepath):
    # Build a `TrainingSelector` from the image/descriptor arguments, compute
    # the train/test split and dump it as CSV to `output_filepath`
    # ('split.csv' when no path is given). Progress is reported through the
    # logger stored under `ctx.obj['LOGGER']`.
    logger = ctx.obj['LOGGER']

    selector = dtr.TrainingSelector(
        img_filepaths=img_filepaths,
        img_dir=img_dir,
        img_filename_pattern=img_filename_pattern,
        gabor_frequencies=gabor_frequencies,
        gabor_num_orientations=gabor_num_orientations,
        response_bins_per_axis=response_bins_per_axis,
        num_color_bins=num_color_bins)

    num_imgs = len(selector.img_filepaths)
    if img_dir is None:
        logger.info("Loaded %d images", num_imgs)
    else:
        logger.info("Loaded %d images from %s", num_imgs, img_dir)

    # forward only the options that were actually set, so that the defaults
    # of `TrainingSelector.train_test_split` apply to the remaining ones
    optional_kws = {
        'method': method,
        'num_components': num_components,
        'num_img_clusters': num_img_clusters,
        'train_prop': train_prop,
    }
    tts_kws = {
        name: value
        for name, value in optional_kws.items() if value is not None
    }

    split_df, evr = selector.train_test_split(return_evr=True, **tts_kws)
    logger.info("Variance ratio explained by PCA: %f", evr)

    dst_filepath = 'split.csv' if output_filepath is None else output_filepath
    split_df.to_csv(dst_filepath)
    logger.info("Dumped train/test split to %s", dst_filepath)