Ejemplo n.º 1
0
def test_split_by_subject_ids_invalid(splits: List[List[str]]) -> None:
    """Splitting with an invalid set of subject-id lists must raise a ValueError."""
    dataset = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME), dtype=str)
    train, val, test = splits
    with pytest.raises(ValueError):
        DatasetSplits.from_subject_ids(dataset, train_ids=train, val_ids=val, test_ids=test)
 def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
     """Fixed example split: subjects '0'/'1' train, '2' val, '5' test."""
     train, val, test = ['0', '1'], ['2'], ['5']
     return DatasetSplits.from_subject_ids(df=dataset_df,
                                           train_ids=train,
                                           val_ids=val,
                                           test_ids=test)
Ejemplo n.º 3
0
def test_restrict_subjects3() -> None:
    """restrict_subjects(",0,+") keeps train intact, empties val, and folds val's subjects into test."""
    dataset, test_ids, train_ids, val_ids = _get_test_df()
    restricted = DatasetSplits.from_subject_ids(
        dataset, train_ids, test_ids, val_ids).restrict_subjects(",0,+")
    assert len(restricted.train.subject.unique()) == len(train_ids)
    assert len(restricted.val.subject.unique()) == 0
    assert len(restricted.test.subject.unique()) == len(test_ids) + len(val_ids)
Ejemplo n.º 4
0
    def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
        """Split the dataset into train/val/test partitions at series granularity.

        If ``self.test_set_ids_csv`` is set, the test set is the fixed list of series
        read from that CSV; the remaining series are shuffled deterministically and
        split ~8:1 into train and val. Any series belonging to a subject that also
        has a series in the test set is dropped, so no subject straddles partitions.
        Otherwise an 80/10/10 proportional split (grouped by subject) is used.

        :param dataset_df: Full dataset with at least ``series`` and ``subject`` columns.
        :return: The resulting ``DatasetSplits``.
        :raises ValueError: If any series listed in the test-set CSV is absent from ``dataset_df``.
        """
        if self.test_set_ids_csv:
            test_set_ids_csv = self.local_dataset / self.test_set_ids_csv
            test_series = pd.read_csv(test_set_ids_csv).series

            all_series = dataset_df.series.values
            # Vectorized membership check instead of iterating the boolean Series in Python.
            if not test_series.isin(all_series).all():
                raise ValueError(f"Not all test series from {test_set_ids_csv} were found in the dataset.")

            # Exclude every series of any subject that owns a test-set series,
            # so subjects never appear in both test and train/val.
            test_set_subjects = dataset_df[dataset_df.series.isin(test_series)].subject.values
            train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values
            # Private seeded RNG: same shuffle order as random.seed(42) + random.shuffle,
            # but without clobbering the module-global `random` state for other callers.
            random.Random(42).shuffle(train_and_val_series)
            # Reserve ~1/9 of the non-test series for validation (floor division).
            num_val_samples = len(train_and_val_series) // 9
            val_series = train_and_val_series[:num_val_samples]
            train_series = train_and_val_series[num_val_samples:]

            logging.info(f"Dropped {len(all_series) - (len(test_series) + len(train_and_val_series))} series "
                         f"due to subject overlap with test set.")
            return DatasetSplits.from_subject_ids(dataset_df,
                                                  train_ids=train_series,
                                                  val_ids=val_series,
                                                  test_ids=test_series,
                                                  subject_column="series",
                                                  group_column="subject")
        else:
            return DatasetSplits.from_proportions(dataset_df,
                                                  proportion_train=0.8,
                                                  proportion_val=0.1,
                                                  proportion_test=0.1,
                                                  subject_column="series",
                                                  group_column="subject",
                                                  shuffle=True)
 def get_model_train_test_dataset_splits(
         self, dataset_df: pd.DataFrame) -> DatasetSplits:
     """Hard-coded example split over integer subject ids 1..6."""
     train, val, test = [1, 2, 3], [4, 5], [6]
     return DatasetSplits.from_subject_ids(df=dataset_df,
                                           train_ids=train,
                                           val_ids=val,
                                           test_ids=test)
Ejemplo n.º 6
0
def test_split_by_subject_ids() -> None:
    """Each partition must contain exactly the rows whose subject is in its id list."""
    dataset, test_ids, train_ids, val_ids = _get_test_df()
    splits = DatasetSplits.from_subject_ids(dataset, train_ids, test_ids, val_ids)

    for partition, ids in ((splits.train, train_ids),
                           (splits.test, test_ids),
                           (splits.val, val_ids)):
        pd.testing.assert_frame_equal(partition, dataset[dataset.subject.isin(ids)])
def test_get_k_fold_cross_validation_splits() -> None:
    """K-fold splits are deterministic, keep the test partition fixed, and add no new subjects."""
    # Run twice to confirm the randomness is deterministic across invocations.
    for _ in range(2):
        dataset, test_ids, train_ids, val_ids = _get_test_df()
        splits = DatasetSplits.from_subject_ids(dataset, train_ids, test_ids, val_ids)
        folds = splits.get_k_fold_cross_validation_splits(n_splits=5)
        assert len(folds) == 5
        all_subjects = set(dataset.subject.unique())
        for fold in folds:
            # The test partition is identical in every fold.
            assert fold.test.equals(splits.test)
            # Every subject in the fold already existed in the original dataset.
            fold_subjects = (set(fold.train.subject.unique())
                             | set(fold.test.subject.unique())
                             | set(fold.val.subject.unique()))
            assert len(fold_subjects.difference(all_subjects)) == 0
Ejemplo n.º 8
0
    def get_model_train_test_dataset_splits(
            self, dataset_df: pd.DataFrame) -> DatasetSplits:
        """Split with a fixed test set (subjects 0..23) and a random ~10% validation set."""
        # The first 24 subject IDs are the designated test subjects in this dataset.
        test_subjects = list(range(24))
        remaining = list(
            dataset_df[~dataset_df.subject.isin(test_subjects)].subject.unique())

        # NOTE(review): numpy's global RNG is not seeded here, so the train/val
        # assignment differs between runs — confirm this is intended.
        val_subjects = numpy.random.choice(remaining,
                                           int(len(remaining) * 0.1),
                                           replace=False)
        train_subjects = [subject for subject in remaining if subject not in val_subjects]

        return DatasetSplits.from_subject_ids(df=dataset_df,
                                              test_ids=test_subjects,
                                              val_ids=val_subjects,
                                              train_ids=train_subjects)