Example #1
0
    def test_different_seeds_for_subset_of_trials(self):
        """Reading a shuffled subset with two different seeds should
        produce a different trial order (at least one pair of trials
        must differ)."""
        n_trials = 25
        column_names = [f"col_{i}" for i in range(4)]

        trials = [
            pd.DataFrame(np.random.rand(10, len(column_names)),
                         columns=column_names)
            for _ in range(n_trials)
        ]
        self.write_dataframes_to_file(trials, self.hdf_path)

        dataset_a = dataset.read_dataset(self.hdf_path,
                                         shuffle=True,
                                         n_trials=4,
                                         seed=90)
        dataset_b = dataset.read_dataset(self.hdf_path,
                                         shuffle=True,
                                         n_trials=4,
                                         seed=0)

        # A single mismatching pair is enough to prove the orders differ;
        # any() short-circuits like the original early-break loop did.
        different_order = any(not df_a.equals(df_b)
                              for df_a, df_b in zip(dataset_a, dataset_b))
        self.assertTrue(different_order)
Example #2
0
    def test_cols_that_do_not_exist(self):
        """Requesting columns that are absent from every stored trial
        must raise KeyError."""
        trial_count = 5
        trials = [pd.DataFrame(np.random.rand(10, 4))
                  for _ in range(trial_count)]
        self.write_dataframes_to_file(trials, self.hdf_path)

        missing_cols = ["inexisting_col_a", "inexisting_col_b"]
        with self.assertRaises(KeyError):
            dataset.read_dataset(self.hdf_path,
                                 cols=missing_cols,
                                 add_class_columns=False)
Example #3
0
    def test_n_trials_is_none(self):
        """When n_trials is omitted, every trial stored in the file is
        read back."""
        expected_trials = 5
        trials = [pd.DataFrame(np.random.rand(10, 4))
                  for _ in range(expected_trials)]
        self.write_dataframes_to_file(trials, self.hdf_path)

        result = dataset.read_dataset(self.hdf_path)
        self.assertEqual(expected_trials, len(result))
Example #4
0
    def test_n_trials_larger_than_available(self):
        """Asking for more trials than the file holds caps the result at
        the number of stored trials instead of failing."""
        stored_trials = 5
        trials = [pd.DataFrame(np.random.rand(10, 4))
                  for _ in range(stored_trials)]
        self.write_dataframes_to_file(trials, self.hdf_path)

        # Request five times what was written.
        result = dataset.read_dataset(self.hdf_path,
                                      n_trials=stored_trials * 5)
        self.assertEqual(stored_trials, len(result))
Example #5
0
    def test_datasets_sizes(self):
        """Each generated split file (train/val/test) must hold exactly
        the number of trials requested via self.args."""
        gps.create_passive_datasets_for_training(**self.args)

        split_paths = [
            self.train_hdf_path, self.val_hdf_path, self.test_hdf_path
        ]
        expected_sizes = [
            self.args["n_simulations_train"],
            self.args["n_simulations_val"],
            self.args["n_simulations_test"],
        ]
        for path, expected_trials in zip(split_paths, expected_sizes):
            # Local renamed from `dataset` to `split`: the old name
            # shadowed the `dataset` module used by sibling tests.
            split = read_dataset(path)
            self.assertEqual(len(split), expected_trials)
Example #6
0
    def test_reading_all_trials_with_different_seeds_but_without_shuffle(self):
        """With shuffle=False the seed argument must be ignored, so two
        reads with different seeds return identical trials."""
        n_trials = 25
        column_names = [f"col_{i}" for i in range(4)]

        trials = [
            pd.DataFrame(np.random.rand(10, len(column_names)),
                         columns=column_names)
            for _ in range(n_trials)
        ]
        self.write_dataframes_to_file(trials, self.hdf_path)

        # shuffle=False (also the default) should make read_dataset
        # ignore the seed argument entirely.
        dataset_a = dataset.read_dataset(self.hdf_path,
                                         shuffle=False,
                                         seed=90)
        dataset_b = dataset.read_dataset(self.hdf_path,
                                         shuffle=False,
                                         seed=0)

        for df_a, df_b in zip(dataset_a, dataset_b):
            pd.testing.assert_frame_equal(df_a, df_b)
Example #7
0
    def test_reproducibility_for_subset_of_trials(self):
        """Two shuffled reads of the same subset with the same seed must
        return the trials in the same order."""
        n_trials = 25
        column_names = [f"col_{i}" for i in range(4)]

        trials = [
            pd.DataFrame(np.random.rand(10, len(column_names)),
                         columns=column_names)
            for _ in range(n_trials)
        ]
        self.write_dataframes_to_file(trials, self.hdf_path)

        # Same seed both times — the shuffle must be reproducible.
        dataset_a = dataset.read_dataset(self.hdf_path,
                                         shuffle=True,
                                         n_trials=4,
                                         seed=90)
        dataset_b = dataset.read_dataset(self.hdf_path,
                                         shuffle=True,
                                         n_trials=4,
                                         seed=90)

        for df_a, df_b in zip(dataset_a, dataset_b):
            pd.testing.assert_frame_equal(df_a, df_b)
Example #8
0
    def test_cols_is_none(self):
        """cols=None reads back every stored column."""
        trial_count = 5
        column_names = [f"col_{i}" for i in range(4)]
        trials = [
            pd.DataFrame(np.random.rand(10, 4), columns=column_names)
            for _ in range(trial_count)
        ]
        self.write_dataframes_to_file(trials, self.hdf_path)

        result = dataset.read_dataset(self.hdf_path,
                                      cols=None,
                                      add_class_columns=False)
        self.assertTrue(self.are_columns_in_all_trials(result, column_names))
Example #9
0
    def test_add_class_columns(self):
        """add_class_columns=True appends the mass/force class columns to
        the explicitly requested ones."""
        trial_count = 5
        requested_cols = [f"col_{i}" for i in range(4)]
        stored_cols = (requested_cols + list(MASS_CLASS_COLS) +
                       list(FORCE_CLASS_COLS))

        trials = [
            pd.DataFrame(np.random.rand(10, len(stored_cols)),
                         columns=stored_cols)
            for _ in range(trial_count)
        ]
        self.write_dataframes_to_file(trials, self.hdf_path)

        result = dataset.read_dataset(self.hdf_path,
                                      cols=requested_cols,
                                      add_class_columns=True)
        # Every trial should contain the requested AND the class columns.
        self.assertTrue(self.are_columns_in_all_trials(result, stored_cols))
Example #10
0
    def test_duplicated_cols_are_only_read_once(self):
        """Duplicate entries in `cols` must not produce duplicated columns
        in the returned trials."""
        n_trials = 5
        cols = ["col_" + str(i) for i in range(4)]
        dfs = [
            pd.DataFrame(np.random.rand(10, 4), columns=cols)
            for _ in range(n_trials)
        ]
        self.write_dataframes_to_file(dfs, self.hdf_path)

        read_dataset = dataset.read_dataset(self.hdf_path,
                                            cols=cols + cols,
                                            add_class_columns=False)
        self.assertTrue(self.are_columns_in_all_trials(read_dataset, cols))
        # all() replaces reduce(lambda x, y: x and y, [...]): clearer, and
        # it does not raise TypeError on an empty dataset. Comparing
        # against len(cols) instead of the literal 4 keeps the check in
        # sync with the fixture.
        self.assertTrue(
            all(len(df.columns) == len(cols) for df in read_dataset))
Example #11
0
    def test_empty_dataset(self):
        """A split requested with zero trials must not create its file;
        non-empty splits must contain the requested number of trials."""
        args_copy = self.args.copy()
        args_copy["n_simulations_train"] = 0

        gps.create_passive_datasets_for_training(**args_copy)

        split_paths = [
            self.train_hdf_path, self.val_hdf_path, self.test_hdf_path
        ]
        expected_sizes = [
            args_copy["n_simulations_train"],
            args_copy["n_simulations_val"],
            args_copy["n_simulations_test"],
        ]
        for path, expected_trials in zip(split_paths, expected_sizes):
            if expected_trials > 0:
                # Local renamed from `dataset` to `split`: the old name
                # shadowed the `dataset` module used by sibling tests.
                split = read_dataset(path)
                self.assertEqual(len(split), expected_trials)
            else:
                # Empty split: the file must not exist at all.
                self.assertFalse(os.path.exists(path))
Example #12
0
    def test_do_not_add_class_columns_when_cols_unspecified(self):
        """add_class_columns=True combined with cols=None must not raise
        even though no class columns were ever written to the file."""
        trial_count = 5
        column_names = [f"col_{i}" for i in range(4)]

        trials = [
            pd.DataFrame(np.random.rand(10, len(column_names)),
                         columns=column_names)
            for _ in range(trial_count)
        ]
        self.write_dataframes_to_file(trials, self.hdf_path)

        try:
            result = dataset.read_dataset(self.hdf_path,
                                          cols=None,
                                          add_class_columns=True)
        except KeyError:
            self.fail("read_dataset failed unexpectedly")

        self.assertTrue(self.are_columns_in_all_trials(result, column_names))
Example #13
0
    def test_datasets_sizes_with_previously_existing_dataset(self):
        """Regenerating datasets over existing files must replace them:
        the final trial counts match only the second run's request."""
        # Create initial datasets with default key_prefix
        gps.create_passive_datasets_for_training(**self.args)

        # Create new datasets with a different key_prefix and number of trials
        # If the previous dataset isn't deleted: either there will be more trials (added to the
        # same path) or an incorrect number of trials.
        args_copy = self.args.copy()
        args_copy["trial_hdf_key_prefix"] = "newtrials_"
        args_copy["n_simulations_train"] = 1
        args_copy["n_simulations_val"] = 1
        args_copy["n_simulations_test"] = 1
        gps.create_passive_datasets_for_training(**args_copy)

        split_paths = [
            self.train_hdf_path, self.val_hdf_path, self.test_hdf_path
        ]
        expected_sizes = [
            args_copy["n_simulations_train"],
            args_copy["n_simulations_val"],
            args_copy["n_simulations_test"],
        ]
        for path, expected_trials in zip(split_paths, expected_sizes):
            # Local renamed from `dataset` to `split`: the old name
            # shadowed the `dataset` module used by sibling tests.
            split = read_dataset(path)
            self.assertEqual(len(split), expected_trials)