Example #1
0
    def test_fth_random_cv(self):
        """Check the result table returned by ``cv.random_cv`` for the fth CV function.

        The test verifies, in order:

        1. the exact column names (and their order) of the result frame,
        2. that the randomly sampled parameter columns differ between the two
           CV iterations,
        3. that all remaining, deterministic columns match the expected
           values.
        """
        bucket = 1000
        dim = 20
        cv_folds = 2
        cv_iterations = 2

        def cv_function(data_df, params, random_state):
            # Adapter with the three-argument signature handed to
            # ``cv.random_cv``; path and fold count are fixed by this test.
            return fth.fasttext_cv_independent_associations(
                data_df, params, self.ft_path,
                cv_folds=cv_folds, random_state=random_state)

        input_df = data_tools.load_data_frame(self.ft_cv_test_path)
        input_df['text'] = input_df['text'].apply(lambda s: s.strip().lower())
        fixed_params = {'-bucket': bucket, '-dim': dim}
        cv_results = cv.random_cv(
            input_df, cv_function, cv_iterations, fixed_params,
            fth.get_hyperparameter_distributions(), 3)

        # (column name, expected value per CV iteration), in the expected
        # column order. The values given for the randomly sampled and the
        # score columns are placeholders only: those columns are dropped
        # before the frames are compared.
        column_spec = [
            ('bucket', bucket),
            ('dim', dim),
            ('epoch', .1),
            ('lr', .2),
            ('wordNgrams', .3),
            ('ws', .4),
            ('mean_test_score', 1.0),
            ('stdev_test_score', 0.0),
            ('mean_train_score', 1.0),
            ('stdev_train_score', 0.0),
            ('split_0_test_score', 1.0),
            ('split_0_train_score', 1.0),
            ('split_0_n_test', 20),
            ('split_0_pos_test', 0.5),
            ('split_0_n_train', 20),
            ('split_0_pos_train', 0.5),
            ('split_1_test_score', 1.0),
            ('split_1_train_score', 1.0),
            ('split_1_n_test', 20),
            ('split_1_pos_test', 0.5),
            ('split_1_n_train', 20),
            ('split_1_pos_train', 0.5),
        ]
        expected_col_names = [name for name, _ in column_spec]
        assert expected_col_names == list(cv_results.columns)

        # These parameters are chosen randomly, so their values cannot be
        # tested directly; only check that they differ between the CV runs.
        random_col_names = ['epoch', 'lr', 'wordNgrams', 'ws']
        for column in random_col_names:
            assert cv_results.loc[0, column] != cv_results.loc[1, column]

        # Columns linked to test performance are ignored since they cannot be
        # tested for random parameter choices.
        ignore_params = [
            'mean_test_score',
            'stdev_test_score',
            'mean_train_score',
            'stdev_train_score',
            'split_0_test_score',
            'split_0_train_score',
            'split_1_test_score',
            'split_1_train_score',
        ]

        expected_df = pandas.DataFrame(
            {name: [value] * cv_iterations for name, value in column_spec},
            columns=expected_col_names)

        # Compare only the deterministic columns of both frames.
        excluded = random_col_names + ignore_params
        assert_frame_equal(cv_results.drop(excluded, axis=1),
                           expected_df.drop(excluded, axis=1))
Example #2
0
 def test_random_fth_parameter_sampler_random(self):
     """Two consecutive draws from one ``fth`` parameter sampler must differ."""
     sampler = cv.get_random_parameter_sampler(
         fth.get_hyperparameter_distributions(), 3)
     first_draw = next(sampler)
     second_draw = next(sampler)
     assert first_draw != second_draw
Example #3
0
 def test_random_cos_parameter_sampler_random_seed_argument(self):
     """Different arguments to the ``cos`` distributions must change the sample.

     Builds the ``cos`` hyperparameter distributions twice with different
     arguments (1 and 12; presumably random seeds, going by the test name)
     and checks that the first sample drawn from each sampler differs.
     """
     dist1 = cos.get_hyperparameter_distributions(1)
     sample1 = next(cv.get_random_parameter_sampler(dist1, 3))
     # Both distributions must come from ``cos``: comparing against the
     # distributions of a different module would make the inequality below
     # hold trivially and would not exercise the argument at all.
     dist2 = cos.get_hyperparameter_distributions(12)
     sample2 = next(cv.get_random_parameter_sampler(dist2, 3))
     assert sample1 != sample2
Example #4
0
 def test_random_fth_parameter_sampler_reproducibility_seed_argument(self):
     """The same argument to the ``fth`` distributions must reproduce the sample.

     Builds the ``fth`` hyperparameter distributions twice with the same
     argument and checks that the first sample drawn from each freshly
     created sampler is equal.
     """

     def first_sample(seed):
         # Fresh distributions and a fresh sampler for every call, so no
         # state is shared between the two draws being compared.
         distributions = fth.get_hyperparameter_distributions(seed)
         return next(cv.get_random_parameter_sampler(distributions, 3))

     assert first_sample(1) == first_sample(1)