Code example #1
    def test_metrics_helper_population_sds(self):
        df_new_features = pd.read_csv(join(self.test_dir, 'data', 'files', 'train.csv'))
        # compute the metrics when not specifying the population SDs
        computed_metrics1 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'])
        expected_metrics1 = pd.Series({'N': 500.0,
                                       'R2': 0.65340566606389394,
                                       'RMSE': 0.47958315233127197,
                                       'SMD': 0.036736365006090885,
                                       'adj_agr': 100.0,
                                       'corr': 0.82789026370069529,
                                       'exact_agr': 77.0,
                                       'h_max': 6.0,
                                       'h_mean': 3.4199999999999999,
                                       'h_min': 1.0,
                                       'h_sd': 0.81543231461565147,
                                       'kappa': 0.6273493195074531,
                                       'sys_max': 6.0,
                                       'sys_mean': 3.4500000000000002,
                                       'sys_min': 1.0,
                                       'sys_sd': 0.81782496620652367,
                                       'wtkappa': 0.82732732732732728})

        # and now compute them specifying the population SDs
        computed_metrics2 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'],
                                                    population_human_score_sd=0.5,
                                                    population_system_score_sd=0.4)
        # the only number that should change is the SMD
        expected_metrics2 = expected_metrics1.copy()
        expected_metrics2['SMD'] = 0.066259

        assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index())
        assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index())
Code example #2
File: test_analyzer.py  Project: copperdong/rsmtool
    def test_metrics_helper_population_sds(self):
        df_new_features = pd.read_csv(join(self.test_dir, 'data', 'files', 'train.csv'))
        # compute the metrics when not specifying the population SDs
        computed_metrics1 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'])
        expected_metrics1 = pd.Series({'N': 500.0,
                                       'R2': 0.65340566606389394,
                                       'RMSE': 0.47958315233127197,
                                       'SMD': 0.03679030063229779,
                                       'adj_agr': 100.0,
                                       'corr': 0.82789026370069529,
                                       'exact_agr': 77.0,
                                       'h_max': 6.0,
                                       'h_mean': 3.4199999999999999,
                                       'h_min': 1.0,
                                       'h_sd': 0.81543231461565147,
                                       'kappa': 0.6273493195074531,
                                       'sys_max': 6.0,
                                       'sys_mean': 3.4500000000000002,
                                       'sys_min': 1.0,
                                       'sys_sd': 0.81782496620652367,
                                       'wtkappa': 0.8273273273273274})

        # and now compute them specifying the population SDs
        computed_metrics2 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'],
                                                    population_human_score_sd=0.5,
                                                    population_system_score_sd=0.4,
                                                    smd_method='williamson')
        # the only number that should change is the SMD
        expected_metrics2 = expected_metrics1.copy()
        expected_metrics2['SMD'] = 0.066259

        assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index())
        assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index())
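For context, the expected SMD of 0.066259 in the second call is consistent with Williamson's formulation of the standardized mean difference, which divides the difference between the system and human means by the square root of the average of the two variances, here computed from the population SDs passed in (0.5 and 0.4). A minimal sketch of that arithmetic (not rsmtool code, just the formula applied to the means from expected_metrics1):

import numpy as np

# Williamson-style SMD using the population SDs supplied to metrics_helper
sys_mean, h_mean = 3.45, 3.42
pop_h_sd, pop_sys_sd = 0.5, 0.4
smd = (sys_mean - h_mean) / np.sqrt((pop_h_sd ** 2 + pop_sys_sd ** 2) / 2)
print(round(smd, 6))  # -> 0.066259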
Code example #3
File: test_analyzer.py  Project: jrosen48/rsmtool
 def test_that_metrics_helper_works_for_data_with_one_row(self):
     # There should be NaNs for SMD, correlations and both sds
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         evals = Analyzer.metrics_helper(self.human_scores[0:1],
                                         self.system_scores[0:1])
         assert_equal(evals.isnull().values.sum(), 4)
Code example #4
File: test_analyzer.py  Project: copperdong/rsmtool
 def test_that_metrics_helper_works_for_data_with_the_same_label(self):
     # There should be NaNs for correlation and SMD.
     # Note that for a dataset with a single response
     # kappas will be 0 or 1
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         evals = Analyzer.metrics_helper(self.same_human_scores,
                                         self.system_scores)
         assert_equal(evals.isnull().values.sum(), 2)
Code example #5
 def test_that_metrics_helper_works_for_data_with_the_same_label(self):
     # There should be NaNs for correlation.
     # Note that for a dataset with a single response
     # kappas will be 0 or 1
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         evals = Analyzer.metrics_helper(self.same_human_scores,
                                         self.system_scores)
         assert_equal(evals.isnull().values.sum(), 1)
Code example #6
File: test_analyzer.py  Project: copperdong/rsmtool
    def test_metrics_helper_zero_system_sd(self):
        human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1]
        system_score = [2.54] * 10
        computed_metrics1 = Analyzer.metrics_helper(human_scores,
                                                    system_score)
        expected_metrics1 = pd.Series({'N': 10,
                                       'R2': -0.015806451612903283,
                                       'RMSE': 1.122319027727856,
                                       'SMD': 0.11927198519188371,
                                       'adj_agr': 50.0,
                                       'corr': None,
                                       'exact_agr': 0,
                                       'h_max': 4,
                                       'h_mean': 2.4,
                                       'h_min': 1.0,
                                       'h_sd': 1.1737877907772674,
                                       'kappa': 0,
                                       'sys_max': 2.54,
                                       'sys_mean': 2.54,
                                       'sys_min': 2.54,
                                       'sys_sd': 0,
                                       'wtkappa': 0})
        # now compute DSM
        computed_metrics2 = Analyzer.metrics_helper(human_scores,
                                                    system_score,
                                                    use_diff_std_means=True)

        # the only change should be that SMD is replaced by DSM, which is None here
        expected_metrics2 = expected_metrics1.copy()
        expected_metrics2.drop("SMD", inplace=True)
        expected_metrics2['DSM'] = None
        assert_series_equal(computed_metrics1.sort_index(),
                            expected_metrics1.sort_index(),
                            check_dtype=False)
        assert_series_equal(computed_metrics2.sort_index(),
                            expected_metrics2.sort_index(),
                            check_dtype=False)
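In this test the system assigns the same score (2.54) to every response, so the system SD is 0 and the difference of standardized means (DSM) comes out as None. A rough sketch of why, assuming DSM standardizes each set of scores by its own mean and SD before comparing them (a hypothetical illustration, not rsmtool's actual implementation):

import numpy as np

human = np.array([1, 3, 4, 2, 3, 1, 3, 4, 2, 1], dtype=float)
system = np.full(10, 2.54)

# Standardizing the constant system scores divides 0 by 0, which yields NaN
# (NumPy emits a RuntimeWarning, much like the one the test filters out),
# and the NaN then propagates into the DSM value.
z_human = (human - human.mean()) / human.std(ddof=1)
z_system = (system - system.mean()) / system.std(ddof=1)
dsm = z_system.mean() - z_human.mean()
print(np.isnan(dsm))  # -> True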
Code example #7
def compute_agreement_one_system_one_rater_pair(df_scores,
                                                system_id,
                                                rater_id1,
                                                rater_id2,
                                                include_mean=False):
    """
    Evaluate the given system against the given pair of raters.

    This function computes the agreement metrics between the scores
    assigned by the given simulated system (``system_id``) and the scores
    assigned by the two simulated raters ``rater_id1`` and ``rater_id2``.

    The agreement metrics computed are: Pearson's correlation, adjusted R^2,
    quadratically-weighted kappa, and the difference between the human-human
    Pearson correlation and the human-machine Pearson correlation (commonly
    known as "degradation"). All 4 metrics are computed against the scores
    of the first rater in the pair and, if ``include_mean`` is ``True``, also
    against the average of the scores assigned by both raters in the pair.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    system_id : str
        The ID for the simulated system to be evaluated.
        This must be a column in ``df_scores``.
    rater_id1 : str
        The ID for the first rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    rater_id2 : str
        The ID for the second rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    include_mean : bool, optional
        If set to ``True``, also include the metric values
        computed against the average of the scores assigned
        by both raters in the given pair.
        Defaults to ``False``.

    Returns
    -------
    metrics_series : list of pandas.Series
        A list containing 1 or 2 pandas series depending on the value
        of ``include_mean``. If it is ``True``, this list contains
        two series: the first containing the values of the metrics
        against the average of the two rater scores and the second
        containing the values of the metrics against the scores of
        the first rater. If ``include_mean`` is ``False``, this list
        only contains a single series: the one containing the metric
        values against the scores of the first rater. Any series
        returned will contain the following columns:
        1. "r" - the Pearson's correlation between the system score
           and the average and the first rater scores.
        2. "QWK" - the quadratically-weighted kapp between the system score
           and the average and the first rater scores.
        3. "R2" - the R^2 score between the system score
           and the average and the first rater scores.
        4. "degradation" - the difference between the human-human correlation
           score and the human-machine correlation score. Note that this column
           may not be included in the output if any of the scores for either of
           the two simulated raters are null, e.g., if some of the responses are
           single scored.
        5. "reference" - a column containing whether the metric values were
           computed against the average of the two rater scores (``h1-h2 mean``)
           or the first rater scores (``h1``).
    """
    # compute the inter-rater correlation that we need for degradation
    try:
        rater1_rater2_correlation = pearsonr(df_scores[rater_id1], df_scores[rater_id2])[0]
    except ValueError:
        rater1_rater2_correlation = None

    # we only want these 3 metrics to start with
    chosen_metrics = ['wtkappa', 'corr', 'R2']

    # compute the metrics against the first rater as a series
    h1_metric_values = Analyzer.metrics_helper(df_scores[rater_id1], df_scores[system_id])
    h1_metric_values = h1_metric_values[chosen_metrics]

    # compute the degradation values
    if rater1_rater2_correlation is not None:
        h1_metric_values['degradation'] = rater1_rater2_correlation - h1_metric_values['corr']

    # add a new column called "reference" indicating whether we used
    # the h1-h2 average score or just the h1 score
    h1_metric_values['reference'] = 'h1'

    # rename some of the metrics to have more recognizable names
    h1_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True)

    # compute the metrics against the average of the two rater scores
    # as a series if it was requested
    if include_mean:
        mean_metric_values = Analyzer.metrics_helper(df_scores[[rater_id1, rater_id2]].mean(axis=1),
                                                     df_scores[system_id])
        mean_metric_values = mean_metric_values[chosen_metrics]

        if rater1_rater2_correlation is not None:
            mean_metric_values['degradation'] = rater1_rater2_correlation - mean_metric_values['corr']
        mean_metric_values['reference'] = 'h1-h2 mean'
        mean_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True)

    # return the right number of metric series
    ans = [mean_metric_values, h1_metric_values] if include_mean else [h1_metric_values]
    return ans
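A minimal usage sketch for the function above, assuming it and its dependencies (Analyzer, pearsonr) are importable; the data frame and its column names (sys_1, h_1, h_2) are made up for illustration:

import pandas as pd

# Hypothetical simulated scores for one system and one rater pair
df_scores = pd.DataFrame(
    {'sys_1': [2.9, 3.1, 4.2, 1.8, 3.5],
     'h_1': [3, 3, 4, 2, 4],
     'h_2': [3, 4, 4, 1, 3]}
)

# Evaluate "sys_1" against the ("h_1", "h_2") pair, including the metrics
# computed against the h1-h2 average; one series is printed per reference
results = compute_agreement_one_system_one_rater_pair(
    df_scores, 'sys_1', 'h_1', 'h_2', include_mean=True
)
for metric_series in results:
    print(metric_series, end='\n\n')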