Example #1
0
 def test_document_scores_reciprocal(self):
     """Reciprocal-distance scoring of the document-level fixture."""
     # Load the document scores fixture with match distances attached.
     df = load_data_frame(self.document_scores_file,
                          class_labels=False,
                          match_distance=True)
     result = reciprocal_distance(df)
     # Expected reciprocal scores for the three fixture rows.
     pd.testing.assert_series_equal(result,
                                    pd.Series([1.0, 0.5, 1 / 3]),
                                    check_names=False)
Example #2
0
 def test_paragraph_scores_reciprocal(self):
     """Reciprocal-distance scoring of the paragraph-level fixture."""
     # Load the paragraph scores fixture with match distances attached.
     df = load_data_frame(self.paragraph_scores_file,
                          class_labels=False,
                          match_distance=True)
     result = reciprocal_distance(df)
     # Expected reciprocal scores for the four fixture rows.
     pd.testing.assert_series_equal(result,
                                    pd.Series([1.0, 0.1, 0.01, 0.001]),
                                    check_names=False)
Example #3
0
 def test_fasttext_cv_independent_associations(self):
     """Two-fold fastText CV on the toy data yields the expected frame."""
     n_folds = 2
     data_df = dt.load_data_frame(self.cv_test_path)
     # Normalize the text column before training.
     data_df['text'] = data_df['text'].apply(lambda t: t.strip().lower())
     cv_results = fth.fasttext_cv_independent_associations(
         data_df, {
             '-bucket': 1000,
             '-dim': 20
         },
         self.ft_path,
         cv_folds=n_folds,
         random_state=np.random.RandomState(3))

     # Columns: four summary statistics, then six entries per fold.
     summary_cols = [
         'mean_test_score',
         'stdev_test_score',
         'mean_train_score',
         'stdev_train_score',
     ]
     split_cols = []
     for split in range(n_folds):
         split_cols += [
             'split_%d_test_score' % split,
             'split_%d_train_score' % split,
             'split_%d_n_test' % split,
             'split_%d_pos_test' % split,
             'split_%d_n_train' % split,
             'split_%d_pos_train' % split,
         ]
     expected_col_names = summary_cols + split_cols

     # One CV run: perfect scores with no spread, and each fold holds
     # 20 rows of which half are positives.
     n_runs = 1
     expected_values = [
         [1.0] * n_runs,
         [0.0] * n_runs,
         [1.0] * n_runs,
         [0.0] * n_runs,
     ] + [
         [1.0] * n_runs,
         [1.0] * n_runs,
         [20] * n_runs,
         [0.5] * n_runs,
         [20] * n_runs,
         [0.5] * n_runs,
     ] * n_folds
     expected_df = pd.DataFrame(dict(zip(expected_col_names,
                                         expected_values)),
                                columns=expected_col_names)
     assert_frame_equal(cv_results, expected_df)
Example #4
0
 def test_reproducibility_associations(self):
     """Identical random seeds must produce identical CV splits."""
     df = data_tools.load_data_frame(self.test_case_df_path)
     # Two runs with the same seed should be indistinguishable.
     runs = [
         cv.cv_independent_associations(
             df, cv_folds=3, random_state=np.random.RandomState(0))
         for _ in range(2)
     ]
     for (train_a, test_a), (train_b, test_b) in zip(*runs):
         np.testing.assert_array_equal(train_a, train_b)
         np.testing.assert_array_equal(test_a, test_b)
         # Sanity-check the split sizes as well.
         assert len(train_a) == 4
         assert len(test_a) == 2
         assert len(train_b) == 4
         assert len(test_b) == 2
Example #5
0
    def test_cos_random_cv_bad_param(self):
        """An unknown hyperparameter name must surface as a TypeError."""
        n_folds = 2
        n_iterations = 2

        def cv_function(data_df, params, random_state):
            # Adapter binding the fixed fastText/CV settings for random_cv.
            return cos.cv_independent_associations(data_df,
                                                   params,
                                                   cv_folds=n_folds,
                                                   random_state=random_state,
                                                   fasttext_epochs=5,
                                                   fasttext_bucket=1000,
                                                   fasttext_dim=20)

        data_df = data_tools.load_data_frame(self.cos_cv_test_path,
                                             match_distance=True)
        # Normalize the text column before training.
        data_df['text'] = data_df['text'].apply(lambda t: t.strip().lower())
        # 'sentence_weightXXXX' is deliberately not a valid parameter name.
        with raises(TypeError, match="got an unexpected keyword argument"):
            _ = cv.random_cv(data_df, cv_function, n_iterations,
                             {'sentence_weightXXXX': 1},
                             cos.get_hyperparameter_distributions(), 3)
Example #6
0
 def test_randomness_associations(self):
     """Unseeded CV runs should produce different shuffled splits.

     Two runs without a fixed random_state are expected to differ.  Since
     they may coincide by chance, the comparison is retried a bounded
     number of times before the test is failed.
     """
     test_case_df = data_tools.load_data_frame(self.test_case_df_path)
     max_attempts = 20
     for _ in range(max_attempts):
         try:
             run1 = cv.cv_independent_associations(test_case_df,
                                                   cv_folds=3)
             run2 = cv.cv_independent_associations(test_case_df,
                                                   cv_folds=3)
             for first, second in zip(run1, run2):
                 train_first, test_first = first
                 train_second, test_second = second
                 # At least one element must differ between the runs.
                 assert not all(x in train_first for x in train_second)
                 assert not all(x in test_first for x in test_second)
             break
         except AssertionError:
             # Splits happened to match; try again.
             continue
     else:
         # All attempts produced identical splits: no randomness observed.
         fail('Failed since no randomness in shuffled splits was found.')
Example #7
0
 def test_distance_scorer_exception(self):
     """_distance_scorer must reject input loaded without match distances."""
     df = load_data_frame(self.paragraph_scores_file,
                          class_labels=False,
                          match_distance=False)
     with raises(ValueError):
         _distance_scorer(df, None)
Example #8
0
    def test_cos_random_cv(self):
        """Random hyperparameter search over the co-occurrence scorer CV."""
        paragraph_weight = 3
        cv_folds = 2
        cv_iterations = 2

        def cv_function(data_df, params, random_state):
            # Adapter binding the fixed fastText/CV settings for random_cv.
            return cos.cv_independent_associations(data_df,
                                                   params,
                                                   cv_folds=cv_folds,
                                                   random_state=random_state,
                                                   fasttext_epochs=5,
                                                   fasttext_bucket=1000,
                                                   fasttext_dim=20)

        test_df = data_tools.load_data_frame(self.cos_cv_test_path,
                                             match_distance=True)
        # Normalize the text column before training.
        test_df['text'] = test_df['text'].apply(lambda s: s.strip().lower())
        cv_results = cv.random_cv(test_df, cv_function, cv_iterations,
                                  {'paragraph_weight': paragraph_weight},
                                  cos.get_hyperparameter_distributions(), 3)

        # Columns: hyperparameters, four summary statistics, then six
        # entries per fold.
        param_cols = [
            'decay_rate',
            'distance_ceiling',
            'distance_offset',
            'document_weight',
            'paragraph_weight',
            'score_cutoff',
            'weighting_exponent',
        ]
        summary_cols = [
            'mean_test_score',
            'stdev_test_score',
            'mean_train_score',
            'stdev_train_score',
        ]
        split_cols = []
        for split in range(cv_folds):
            split_cols += [
                'split_%d_test_score' % split,
                'split_%d_train_score' % split,
                'split_%d_n_test' % split,
                'split_%d_pos_test' % split,
                'split_%d_n_train' % split,
                'split_%d_pos_train' % split,
            ]
        expected_col_names = param_cols + summary_cols + split_cols
        assert expected_col_names == list(cv_results.columns)

        # Randomly sampled parameters cannot be pinned to a value; only
        # check that they differ between the two CV iterations.
        random_col_names = [
            c for c in param_cols if c != 'paragraph_weight'
        ]
        for rand in random_col_names:
            assert cv_results.loc[0, rand] != cv_results.loc[1, rand]

        # Performance columns depend on the random parameter draws, so
        # they are excluded from the exact comparison.
        ignore_params = summary_cols + [
            'split_0_test_score',
            'split_0_train_score',
            'split_1_test_score',
            'split_1_train_score',
        ]

        # Placeholder values for the random columns (dropped below); the
        # remaining columns carry the exact expected per-fold statistics.
        expected_values = [
            [.444] * cv_iterations,
            [.333] * cv_iterations,
            [.222] * cv_iterations,
            [.111] * cv_iterations,
            [paragraph_weight] * cv_iterations,
            [.111] * cv_iterations,
            [.111] * cv_iterations,
            [1.0] * cv_iterations,
            [0.0] * cv_iterations,
            [1.0] * cv_iterations,
            [0.0] * cv_iterations,
        ] + [
            [1.0] * cv_iterations,
            [1.0] * cv_iterations,
            [24] * cv_iterations,
            [0.5] * cv_iterations,
            [24] * cv_iterations,
            [0.5] * cv_iterations,
        ] * cv_folds
        expected_df = pandas.DataFrame(dict(zip(expected_col_names,
                                                expected_values)),
                                       columns=expected_col_names)
        drop_cols = random_col_names + ignore_params
        assert_frame_equal(cv_results.drop(drop_cols, axis=1),
                           expected_df.drop(drop_cols, axis=1))
Example #9
0
    def test_fth_random_cv(self):
        """Random hyperparameter search over the fastText CV helper."""
        bucket = 1000
        dim = 20
        cv_folds = 2
        cv_iterations = 2

        def cv_function(data_df, params, random_state):
            # Adapter binding the fixed model path and fold count.
            return fth.fasttext_cv_independent_associations(
                data_df,
                params,
                self.ft_path,
                cv_folds=cv_folds,
                random_state=random_state)

        test_df = data_tools.load_data_frame(self.ft_cv_test_path)
        # Normalize the text column before training.
        test_df['text'] = test_df['text'].apply(lambda s: s.strip().lower())
        cv_results = cv.random_cv(test_df, cv_function, cv_iterations, {
            '-bucket': bucket,
            '-dim': dim
        }, fth.get_hyperparameter_distributions(), 3)

        # Columns: hyperparameters, four summary statistics, then six
        # entries per fold.
        param_cols = ['bucket', 'dim', 'epoch', 'lr', 'wordNgrams', 'ws']
        summary_cols = [
            'mean_test_score',
            'stdev_test_score',
            'mean_train_score',
            'stdev_train_score',
        ]
        split_cols = []
        for split in range(cv_folds):
            split_cols += [
                'split_%d_test_score' % split,
                'split_%d_train_score' % split,
                'split_%d_n_test' % split,
                'split_%d_pos_test' % split,
                'split_%d_n_train' % split,
                'split_%d_pos_train' % split,
            ]
        expected_col_names = param_cols + summary_cols + split_cols
        assert expected_col_names == list(cv_results.columns)

        # Randomly sampled parameters cannot be pinned to a value; only
        # check that they differ between the two CV iterations.
        random_col_names = ['epoch', 'lr', 'wordNgrams', 'ws']
        for rand in random_col_names:
            assert cv_results.loc[0, rand] != cv_results.loc[1, rand]

        # Performance columns depend on the random parameter draws, so
        # they are excluded from the exact comparison.
        ignore_params = summary_cols + [
            'split_0_test_score',
            'split_0_train_score',
            'split_1_test_score',
            'split_1_train_score',
        ]

        # Placeholder values for the random columns (dropped below); the
        # remaining columns carry the exact expected per-fold statistics.
        expected_values = [
            [1000] * cv_iterations,
            [20] * cv_iterations,
            [.1] * cv_iterations,
            [.2] * cv_iterations,
            [.3] * cv_iterations,
            [.4] * cv_iterations,
            [1.0] * cv_iterations,
            [0.0] * cv_iterations,
            [1.0] * cv_iterations,
            [0.0] * cv_iterations,
        ] + [
            [1.0] * cv_iterations,
            [1.0] * cv_iterations,
            [20] * cv_iterations,
            [0.5] * cv_iterations,
            [20] * cv_iterations,
            [0.5] * cv_iterations,
        ] * cv_folds
        expected_df = pandas.DataFrame(dict(zip(expected_col_names,
                                                expected_values)),
                                       columns=expected_col_names)
        drop_cols = random_col_names + ignore_params
        assert_frame_equal(cv_results.drop(drop_cols, axis=1),
                           expected_df.drop(drop_cols, axis=1))