def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Split the dataset into train/test/validation by fixed proportions.

    Uses a 70% train, 20% test, 10% validation split of the rows in ``dataset_df``.

    :param dataset_df: The full dataset as a dataframe.
    :return: A ``DatasetSplits`` object holding the three partitions.
    """
    # Proportions are fixed for this model; they must sum to 1.0.
    split_proportions = dict(proportion_train=0.7, proportion_test=0.2, proportion_val=0.1)
    return DatasetSplits.from_proportions(df=dataset_df, **split_proportions)
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Split the dataset into 80% train, 5% validation, and 15% test.

    A fixed ``random_seed`` of 0 makes the split reproducible across runs.

    :param dataset_df: The full dataset as a dataframe.
    :return: A ``DatasetSplits`` object holding the three partitions.
    """
    return DatasetSplits.from_proportions(
        dataset_df,
        proportion_train=0.8,
        proportion_val=0.05,
        proportion_test=0.15,
        random_seed=0,
    )
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Split the dataset, optionally honoring a predefined test set.

    If ``self.test_set_ids_csv`` is set, the series listed in that CSV form the
    test set; any series sharing a subject with a test-set series is dropped to
    avoid subject leakage, and the remaining series are shuffled (seed 42) and
    split roughly 8:1 into train and validation. Otherwise, an 80/10/10
    proportional split grouped by subject is used.

    :param dataset_df: The full dataset; expected to have ``series`` and
        ``subject`` columns.
    :return: A ``DatasetSplits`` object holding the three partitions.
    :raises ValueError: If any series in the test-set CSV is absent from
        ``dataset_df``.
    """
    if self.test_set_ids_csv:
        test_set_ids_csv = self.local_dataset / self.test_set_ids_csv
        # NOTE(review): assumes the CSV has a "series" column — verify against the file format.
        test_series = pd.read_csv(test_set_ids_csv).series
        all_series = dataset_df.series.values
        # Every requested test series must exist in the dataset, else fail fast.
        check_all_test_series = all(test_series.isin(all_series))
        if not check_all_test_series:
            raise ValueError(f"Not all test series from {test_set_ids_csv} were found in the dataset.")
        test_set_subjects = dataset_df[dataset_df.series.isin(test_series)].subject.values
        # Exclude every series belonging to a test-set subject, not just the listed series,
        # so no subject appears in both test and train/val.
        train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values
        # Shuffle with a dedicated RNG: same permutation as random.seed(42) + random.shuffle,
        # but without mutating the process-global random state for other callers.
        random.Random(42).shuffle(train_and_val_series)
        # Hold out ~1/9 of the remaining series for validation (train:val ≈ 8:1).
        num_val_samples = math.floor(len(train_and_val_series) / 9)
        val_series = train_and_val_series[:num_val_samples]
        train_series = train_and_val_series[num_val_samples:]
        logging.info(f"Dropped {len(all_series) - (len(test_series) + len(train_and_val_series))} series "
                     f"due to subject overlap with test set.")
        return DatasetSplits.from_subject_ids(dataset_df,
                                              train_ids=train_series,
                                              val_ids=val_series,
                                              test_ids=test_series,
                                              subject_column="series",
                                              group_column="subject")
    else:
        return DatasetSplits.from_proportions(dataset_df,
                                              proportion_train=0.8,
                                              proportion_val=0.1,
                                              proportion_test=0.1,
                                              subject_column="series",
                                              group_column="subject",
                                              shuffle=True)
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Create a reproducible 70/20/10 train/test/validation split.

    The split is keyed on the configured subject column and uses a fixed
    ``random_seed`` of 1 so repeated runs produce identical partitions.

    :param dataset_df: The full dataset as a dataframe.
    :return: A ``DatasetSplits`` object holding the three partitions.
    """
    return DatasetSplits.from_proportions(
        df=dataset_df,
        proportion_train=0.7,
        proportion_test=0.2,
        proportion_val=0.1,
        random_seed=1,
        subject_column=self.subject_column,
    )
def test_grouped_splits(group_column: str) -> None:
    """A grouped proportional split must partition both subjects and groups.

    Builds a 0.5/0.4/0.1 split grouped by ``group_column`` and checks that
    train/test/val are disjoint and jointly cover the data on both the subject
    column and the grouping column.
    """
    df = _get_test_df()[0]
    train_p, test_p, val_p = 0.5, 0.4, 0.1
    splits = DatasetSplits.from_proportions(df, train_p, test_p, val_p,
                                            group_column=group_column)
    partition = [splits.train, splits.test, splits.val]
    for column in (CSV_SUBJECT_HEADER, group_column):
        _check_is_partition(df, partition, column)
def test_grouped_k_fold_cross_validation_splits(group_column: str) -> None:
    """K-fold CV folds derived from a grouped split must stay consistent.

    Checks that every fold partitions the data on both subject and group
    columns, keeps the original test set untouched, and that the per-fold
    validation sets jointly partition the original train+val data.
    """
    df = _get_test_df()[0]
    splits = DatasetSplits.from_proportions(df, 0.5, 0.4, 0.1,
                                            group_column=group_column)
    n_splits = 7  # mutually prime with numbers of subjects and groups
    collected_val_sets = []
    for fold in splits.get_k_fold_cross_validation_splits(n_splits):
        fold_partition = [fold.train, fold.test, fold.val]
        for column in (CSV_SUBJECT_HEADER, group_column):
            _check_is_partition(df, fold_partition, column)
        # The test set must be identical across all folds.
        assert fold.test.equals(splits.test)
        collected_val_sets.append(fold.val)
    # ensure validation folds partition the original train+val set
    train_val = pd.concat([splits.train, splits.val])
    for column in (CSV_SUBJECT_HEADER, group_column):
        _check_is_partition(train_val, collected_val_sets, column)