def test_split_by_subject_ids_invalid(splits: List[List[str]]) -> None:
    """Check that invalid subject-ID split combinations are rejected with a ValueError."""
    dataset_df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME), dtype=str)
    train, val, test = splits[0], splits[1], splits[2]
    with pytest.raises(ValueError):
        DatasetSplits.from_subject_ids(dataset_df, train_ids=train, val_ids=val, test_ids=test)
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Return a fixed split: subjects '0' and '1' train, '2' validation, '5' test."""
    train_subjects = ['0', '1']
    val_subjects = ['2']
    test_subjects = ['5']
    return DatasetSplits.from_subject_ids(df=dataset_df,
                                          train_ids=train_subjects,
                                          test_ids=test_subjects,
                                          val_ids=val_subjects)
def test_restrict_subjects3() -> None:
    """Restricting with ",0,+" keeps train intact, empties val, and folds val into test."""
    df, test_subjects, train_subjects, val_subjects = _get_test_df()
    base_splits = DatasetSplits.from_subject_ids(df, train_subjects, test_subjects, val_subjects)
    restricted = base_splits.restrict_subjects(",0,+")
    # Train is untouched, val is emptied, test absorbs the former val subjects.
    assert len(restricted.train.subject.unique()) == len(train_subjects)
    assert len(restricted.val.subject.unique()) == 0
    assert len(restricted.test.subject.unique()) == len(test_subjects) + len(val_subjects)
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Split the dataset by series, grouping by subject so that no subject
    spans the train/val/test boundary.

    If ``self.test_set_ids_csv`` is set, the test set is taken from that CSV's
    ``series`` column and the remaining series are shuffled deterministically
    into train/val (roughly 8:1); otherwise an 80/10/10 proportional split is
    used.

    :param dataset_df: Full dataset with at least ``series`` and ``subject`` columns.
    :return: The train/val/test ``DatasetSplits``.
    :raises ValueError: If any test series from the CSV is missing from the dataset.
    """
    if self.test_set_ids_csv:
        # Fixed test set: series IDs listed in the CSV under self.local_dataset.
        test_set_ids_csv = self.local_dataset / self.test_set_ids_csv
        test_series = pd.read_csv(test_set_ids_csv).series
        all_series = dataset_df.series.values
        # Every requested test series must exist in the dataset.
        check_all_test_series = all(test_series.isin(all_series))
        if not check_all_test_series:
            raise ValueError(f"Not all test series from {test_set_ids_csv} were found in the dataset.")
        # Any subject with at least one series in the test set is excluded from
        # train/val entirely, to prevent subject-level leakage.
        test_set_subjects = dataset_df[dataset_df.series.isin(test_series)].subject.values
        train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values
        # Fixed seed so the shuffle (and hence the split) is reproducible.
        random.seed(42)
        random.shuffle(train_and_val_series)
        # 1/9 of the remaining series go to validation (≈10% of the original
        # dataset when the test set is ≈10%), the rest to training.
        num_val_samples = math.floor(len(train_and_val_series) / 9)
        val_series = train_and_val_series[:num_val_samples]
        train_series = train_and_val_series[num_val_samples:]
        logging.info(f"Dropped {len(all_series) - (len(test_series) + len(train_and_val_series))} series "
                     f"due to subject overlap with test set.")
        # Split on series IDs, but group by subject so no subject is split.
        return DatasetSplits.from_subject_ids(dataset_df,
                                              train_ids=train_series,
                                              val_ids=val_series,
                                              test_ids=test_series,
                                              subject_column="series",
                                              group_column="subject")
    else:
        # No fixed test set: fall back to an 80/10/10 proportional split,
        # still grouped by subject.
        return DatasetSplits.from_proportions(dataset_df,
                                              proportion_train=0.8,
                                              proportion_val=0.1,
                                              proportion_test=0.1,
                                              subject_column="series",
                                              group_column="subject",
                                              shuffle=True)
def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Return a fixed split: subjects 1-3 train, 4-5 validation, 6 test."""
    train_subjects = [1, 2, 3]
    val_subjects = [4, 5]
    test_subjects = [6]
    return DatasetSplits.from_subject_ids(df=dataset_df,
                                          train_ids=train_subjects,
                                          val_ids=val_subjects,
                                          test_ids=test_subjects)
def test_split_by_subject_ids() -> None:
    """Each split partition must equal the rows of the dataset whose subject is in that split's IDs."""
    df, test_subjects, train_subjects, val_subjects = _get_test_df()
    splits = DatasetSplits.from_subject_ids(df, train_subjects, test_subjects, val_subjects)
    partitions = [splits.train, splits.test, splits.val]
    expected_ids = [train_subjects, test_subjects, val_subjects]
    for partition, subject_ids in zip(partitions, expected_ids):
        pd.testing.assert_frame_equal(partition, df[df.subject.isin(subject_ids)])
def test_get_k_fold_cross_validation_splits() -> None:
    """Cross-validation folds share the test set and only ever contain known subjects."""
    # Run twice to check the dataset splits have deterministic randomness.
    for _ in range(2):
        df, test_subjects, train_subjects, val_subjects = _get_test_df()
        splits = DatasetSplits.from_subject_ids(df, train_subjects, test_subjects, val_subjects)
        folds = splits.get_k_fold_cross_validation_splits(n_splits=5)
        assert len(folds) == 5
        # The test partition is identical across all folds.
        assert all([fold.test.equals(splits.test) for fold in folds])
        # No fold introduces a subject that is absent from the original dataset.
        all_subjects = set(df.subject.unique())
        for fold in folds:
            fold_subjects = set(list(fold.train.subject.unique())
                                + list(fold.test.subject.unique())
                                + list(fold.val.subject.unique()))
            assert len(fold_subjects.difference(all_subjects)) == 0
def get_model_train_test_dataset_splits(
        self, dataset_df: pd.DataFrame) -> DatasetSplits:
    """Split the dataset with a fixed test set and a random 10% validation set.

    :param dataset_df: Full dataset with a ``subject`` column.
    :return: The train/val/test ``DatasetSplits``.
    """
    # The first 24 subject IDs are the designated test subjects in this dataset.
    test = list(range(0, 24))
    train_val = list(
        dataset_df[~dataset_df.subject.isin(test)].subject.unique())
    # NOTE(review): numpy.random.choice is unseeded here, so the train/val
    # assignment differs between runs — confirm that is intentional.
    val = numpy.random.choice(train_val, int(len(train_val) * 0.1), replace=False)
    # Membership test against a set instead of scanning the numpy array once
    # per subject (which was O(n^2) overall).
    val_subjects = set(val)
    train = [x for x in train_val if x not in val_subjects]
    return DatasetSplits.from_subject_ids(df=dataset_df,
                                          test_ids=test,
                                          val_ids=val,
                                          train_ids=train)