def _prepare_data(self, X, y=None, **kwargs):
    """Split the data for the current outer fold and record basic statistics.

    The fold registered under ``self.outer_fold_id`` in
    ``self.cross_validaton_info.outer_folds`` supplies the train and test
    indices. The train split is stored in ``self._validation_X``,
    ``self._validation_y`` and ``self._validation_kwargs``; the test split
    is stored in ``self._test_X``, ``self._test_y`` and
    ``self._test_kwargs``. Sample counts are written to
    ``self.result_object``, and for classifier pipelines the class
    distributions of both splits are written as well.

    Args:
        X: Data handed to ``PhotonDataHelper.split_data``.
        y: Targets handed to ``PhotonDataHelper.split_data``.
            NOTE(review): ``.shape[0]`` is read from the split targets, so
            the default ``y=None`` presumably fails here - confirm.
        **kwargs: Additional arguments; passed to ``split_data`` as a dict.
    """
    outer_fold = self.cross_validaton_info.outer_folds[self.outer_fold_id]
    logger.info(
        "Preparing data for outer fold " + str(outer_fold.fold_nr) + "..."
    )

    # Read both index sets up front, then cut the two splits.
    train_indices, test_indices = outer_fold.train_indices, outer_fold.test_indices
    train_split = PhotonDataHelper.split_data(X, y, kwargs, indices=train_indices)
    self._validation_X, self._validation_y, self._validation_kwargs = train_split
    test_split = PhotonDataHelper.split_data(X, y, kwargs, indices=test_indices)
    self._test_X, self._test_y, self._test_kwargs = test_split

    # Record the sample counts on the result object.
    results = self.result_object
    results.number_samples_validation = self._validation_y.shape[0]
    results.number_samples_test = self._test_y.shape[0]

    # Class distributions are only recorded for classifier pipelines.
    if self._pipe._estimator_type != "classifier":
        return
    results.class_distribution_validation = FoldInfo.data_overview(self._validation_y)
    results.class_distribution_test = FoldInfo.data_overview(self._test_y)
def setUp(self):
    """Build the classification fixture used by the inner-fold tests.

    Sets up a StandardScaler -> PCA -> RidgeClassifier pipeline, a fixed
    hyperparameter configuration, a 4-split ``KFold`` whose folds are
    registered under ``self.outer_fold_id``, and grid-search optimization
    settings with accuracy as the best-config metric.
    """
    super(InnerFoldTests, self).setUp()
    self.pipe = PhotonPipeline([
        ("StandardScaler", PipelineElement("StandardScaler")),
        ("PCA", PipelineElement("PCA")),
        ("RidgeClassifier", PipelineElement("RidgeClassifier")),
    ])
    self.config = {
        "PCA__n_components": 5,
        "RidgeClassifier__solver": "svd",
        "RidgeClassifier__random_state": 42,
    }
    self.outer_fold_id = "TestID"
    self.inner_cv = KFold(n_splits=4)
    # Pass return_X_y by keyword: newer scikit-learn releases reject it as
    # a positional argument, while the keyword form works on old ones too.
    self.X, self.y = load_breast_cancer(return_X_y=True)
    self.cross_validation = Hyperpipe.CrossValidation(
        self.inner_cv, None, True, 0.2, True, False)
    # One FoldInfo per inner split, keyed by fold index; the second
    # FoldInfo argument is the index shifted to start at 1.
    self.cross_validation.inner_folds = {
        self.outer_fold_id: {
            i: FoldInfo(i, i + 1, train, test)
            for i, (train, test) in enumerate(self.inner_cv.split(self.X, self.y))
        }
    }
    self.optimization = Hyperpipe.Optimization(
        "grid_search", {}, ["accuracy", "recall", "specificity"], "accuracy", None)
def setUp(self):
    """Prepare pipeline, config, inner folds and optimization settings.

    The pipeline chains StandardScaler, PCA and RidgeClassifier. The
    breast-cancer data is split by a 4-fold ``KFold`` and the resulting
    folds are stored under ``self.outer_fold_id`` on the cross-validation
    info object.
    """
    super(InnerFoldTests, self).setUp()

    # Each pipeline step is named after the element it wraps.
    step_names = ("StandardScaler", "PCA", "RidgeClassifier")
    self.pipe = PhotonPipeline(
        [(step, PipelineElement(step)) for step in step_names]
    )
    self.config = {
        "PCA__n_components": 5,
        "RidgeClassifier__solver": "svd",
        "RidgeClassifier__random_state": 42,
    }

    self.outer_fold_id = "TestID"
    self.inner_cv = KFold(n_splits=4)
    self.X, self.y = load_breast_cancer(return_X_y=True)
    self.cross_validation = Hyperpipe.CrossValidation(
        self.inner_cv, None, True, 0.2, True, False, False, None
    )

    # Register one FoldInfo per inner split, keyed by its 0-based index;
    # the second FoldInfo argument is the same index shifted by one.
    folds_by_index = {}
    splits = self.inner_cv.split(self.X, self.y)
    for index, (train, test) in enumerate(splits):
        folds_by_index[index] = FoldInfo(index, index + 1, train, test)
    self.cross_validation.inner_folds = {self.outer_fold_id: folds_by_index}

    metrics = ["accuracy", "recall", "specificity"]
    self.optimization = Hyperpipe.Optimization(
        "grid_search", {}, metrics, "accuracy", None
    )
def _generate_inner_folds(self):
    """Generate the inner folds from the validation split and register them.

    The folds come from ``FoldInfo.generate_folds`` using the inner CV
    strategy and the ``self._validation_*`` data. They are stored on
    ``self.inner_folds`` and, keyed by ``fold_id``, under
    ``self.outer_fold_id`` in ``self.cross_validaton_info.inner_folds``.
    """
    self.inner_folds = FoldInfo.generate_folds(
        self.cross_validaton_info.inner_cv,
        self._validation_X,
        self._validation_y,
        self._validation_kwargs,
    )

    # Index the generated folds by their id for lookup via the outer fold.
    folds_by_id = {}
    for fold in self.inner_folds:
        folds_by_id[fold.fold_id] = fold
    self.cross_validaton_info.inner_folds[self.outer_fold_id] = folds_by_id
def test_no_cv_strategy_eval_final_performance_false(self):
    """Without a CV strategy and without final evaluation nothing is held out.

    ``generate_folds`` is called with ``None`` as CV strategy,
    ``eval_final_performance=False`` and a test size of 0.15. The test
    expects a single fold whose training indices cover all subjects and
    whose test indices are empty.
    """
    folds = FoldInfo.generate_folds(
        None,
        self.X,
        self.y,
        self.kwargs,
        eval_final_performance=False,
        test_size=0.15,
    )

    # Exactly one outer fold is expected.
    self.assertEqual(len(folds), 1)

    # All subjects land in the training part; the test part stays empty.
    only_fold = folds[0]
    self.assertEqual(len(only_fold.train_indices), self.num_subjects)
    self.assertEqual(len(only_fold.test_indices), 0)
def base_assertions(self, cv, nr_of_folds, eval_final_performance=True):
    """Generate folds for ``cv`` and check their number and sizes.

    Args:
        cv: Cross-validation strategy handed to ``FoldInfo.generate_folds``.
        nr_of_folds: Number of folds the strategy is expected to produce.
        eval_final_performance: Forwarded to ``FoldInfo.generate_folds``.

    Returns:
        The generated fold list, so callers can run further checks on it.
    """
    fold_list = FoldInfo.generate_folds(
        cv, self.X, self.y, self.kwargs,
        eval_final_performance=eval_final_performance)

    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(len(fold_list), nr_of_folds)

    # Every fold is expected to hold out 1/nr_of_folds of the subjects and
    # train on the rest; the sizes do not change between folds.
    expected_test_size = self.num_subjects / nr_of_folds
    expected_train_size = (nr_of_folds - 1) * expected_test_size
    for generated_fold in fold_list:
        self.assertEqual(len(generated_fold.train_indices), expected_train_size)
        self.assertEqual(len(generated_fold.test_indices), expected_test_size)

    # we always start with 1 for the investigator
    self.assertEqual(fold_list[0].fold_nr, 1)
    return fold_list
def setUp(self):
    """Build the regression fixture used by the outer-fold tests.

    Sets up a 5-split inner ``ShuffleSplit``, a single-split outer
    ``ShuffleSplit`` with 20% test size, the cross-validation and
    optimization info objects, and a StandardScaler -> PCA ->
    DecisionTreeRegressor pipeline. The outer split is registered under
    ``self.outer_fold_id``.
    """
    super(OuterFoldTests, self).setUp()

    self.fold_nr_inner_cv = 5
    self.inner_cv = ShuffleSplit(n_splits=self.fold_nr_inner_cv, random_state=42)
    self.outer_cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    self.cv_info = Hyperpipe.CrossValidation(
        inner_cv=self.inner_cv,
        outer_cv=self.outer_cv,
        eval_final_performance=True,
        test_size=0.2,
        calculate_metrics_per_fold=True,
        calculate_metrics_across_folds=False,
        learning_curves=False,
        learning_curves_cut=None,
    )

    # NOTE(review): load_boston is no longer shipped by recent scikit-learn
    # releases - confirm the pinned version still provides it.
    self.X, self.y = load_boston(return_X_y=True)

    # Store the outer split(s) under the fixed test id; every split is
    # written to the same key.
    self.outer_fold_id = "TestFoldOuter1"
    outer_folds = {}
    for train, test in self.outer_cv.split(self.X, self.y):
        outer_folds[self.outer_fold_id] = FoldInfo(0, 1, train, test)
    self.cv_info.outer_folds = outer_folds

    self.config_num = 2
    self.optimization_info = Hyperpipe.Optimization(
        metrics=["mean_absolute_error", "mean_squared_error"],
        best_config_metric="mean_absolute_error",
        optimizer_input="grid_search",
        optimizer_params={},
        performance_constraints=None,
    )

    scaler = PipelineElement("StandardScaler")
    pca = PipelineElement("PCA", {"n_components": [4, 7]})
    tree = PipelineElement("DecisionTreeRegressor", random_state=42)
    self.elements = [scaler, pca, tree]

    # Pipeline steps are named after the elements themselves.
    steps = []
    for element in self.elements:
        steps.append((element.name, element))
    self.pipe = PhotonPipeline(steps)
def test_data_overview(self):
    """``data_overview`` of the integer-cast groups maps "0".."9" to 10 each."""
    groups_as_int = self.kwargs["groups"].astype(int)
    data_count = FoldInfo.data_overview(groups_as_int)

    # Ten string keys "0" to "9", each expected with a count of 10.
    expected_outcome = dict.fromkeys(map(str, range(10)), 10)
    self.assertDictEqual(expected_outcome, data_count)
def test_class_distribution_info(self):
    """The count ``data_overview`` reports under "1" matches ``np.unique``.

    The reference value is the second entry of the counts returned by
    ``np.unique(..., return_counts=True)``.
    """
    label_counts = np.unique(self.__y, return_counts=True)[1]
    overview = FoldInfo.data_overview(self.__y)
    self.assertEqual(label_counts[1], overview["1"])