def test_metrics_helper_population_sds(self):
    df_new_features = pd.read_csv(join(self.test_dir, 'data', 'files', 'train.csv'))

    # compute the metrics when not specifying the population SDs
    computed_metrics1 = Analyzer.metrics_helper(df_new_features['score'],
                                                df_new_features['score2'])
    expected_metrics1 = pd.Series({'N': 500.0,
                                   'R2': 0.65340566606389394,
                                   'RMSE': 0.47958315233127197,
                                   'SMD': 0.03679030063229779,
                                   'adj_agr': 100.0,
                                   'corr': 0.82789026370069529,
                                   'exact_agr': 77.0,
                                   'h_max': 6.0,
                                   'h_mean': 3.4199999999999999,
                                   'h_min': 1.0,
                                   'h_sd': 0.81543231461565147,
                                   'kappa': 0.6273493195074531,
                                   'sys_max': 6.0,
                                   'sys_mean': 3.4500000000000002,
                                   'sys_min': 1.0,
                                   'sys_sd': 0.81782496620652367,
                                   'wtkappa': 0.8273273273273274})

    # and now compute them specifying the population SDs
    computed_metrics2 = Analyzer.metrics_helper(df_new_features['score'],
                                                df_new_features['score2'],
                                                population_human_score_sd=0.5,
                                                population_system_score_sd=0.4,
                                                smd_method='williamson')

    # the only number that should change is the SMD
    expected_metrics2 = expected_metrics1.copy()
    expected_metrics2['SMD'] = 0.066259

    assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index())
    assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index())
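# For context: the expected SMD of 0.066259 above is consistent with a
# 'williamson'-style standardization that divides the difference of the
# system and human means by the quadratic mean of the two population SDs.
# The exact formula used by metrics_helper is an assumption here; this is
# just a minimal arithmetic sanity check of the value asserted in the test.
import numpy as np

_sys_mean, _h_mean = 3.45, 3.42      # means from expected_metrics1
_pop_h_sd, _pop_sys_sd = 0.5, 0.4    # population SDs passed to metrics_helper
_smd = (_sys_mean - _h_mean) / np.sqrt((_pop_h_sd ** 2 + _pop_sys_sd ** 2) / 2)
print(round(_smd, 6))                # -> 0.066259, the value asserted above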
def test_that_metrics_helper_works_for_data_with_one_row(self):
    # There should be NaNs for SMD, correlations and both SDs
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=RuntimeWarning)
        evals = Analyzer.metrics_helper(self.human_scores[0:1],
                                        self.system_scores[0:1])
        assert_equal(evals.isnull().values.sum(), 4)
def test_that_metrics_helper_works_for_data_with_the_same_label(self):
    # There should be a NaN for the correlation.
    # Note that when all human scores have the same label,
    # kappas will be 0 or 1
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=RuntimeWarning)
        evals = Analyzer.metrics_helper(self.same_human_scores,
                                        self.system_scores)
        assert_equal(evals.isnull().values.sum(), 1)
def test_metrics_helper_zero_system_sd(self):
    human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1]
    system_score = [2.54] * 10
    computed_metrics1 = Analyzer.metrics_helper(human_scores,
                                                system_score)
    expected_metrics1 = pd.Series({'N': 10,
                                   'R2': -0.015806451612903283,
                                   'RMSE': 1.122319027727856,
                                   'SMD': 0.11927198519188371,
                                   'adj_agr': 50.0,
                                   'corr': None,
                                   'exact_agr': 0,
                                   'h_max': 4,
                                   'h_mean': 2.4,
                                   'h_min': 1.0,
                                   'h_sd': 1.1737877907772674,
                                   'kappa': 0,
                                   'sys_max': 2.54,
                                   'sys_mean': 2.54,
                                   'sys_min': 2.54,
                                   'sys_sd': 0,
                                   'wtkappa': 0})

    # now compute DSM instead of SMD
    computed_metrics2 = Analyzer.metrics_helper(human_scores,
                                                system_score,
                                                use_diff_std_means=True)

    # the only entry that should change: SMD is replaced by DSM
    expected_metrics2 = expected_metrics1.copy()
    expected_metrics2.drop("SMD", inplace=True)
    expected_metrics2['DSM'] = None

    assert_series_equal(computed_metrics1.sort_index(),
                        expected_metrics1.sort_index(),
                        check_dtype=False)
    assert_series_equal(computed_metrics2.sort_index(),
                        expected_metrics2.sort_index(),
                        check_dtype=False)
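# Why DSM is None here: with a constant system score, the system SD is zero,
# so the system scores cannot be standardized. This sketch assumes that the
# difference of standardized means z-scores each score vector by its own
# mean and SD before differencing (the exact formula used by metrics_helper
# may differ); it only illustrates the division-by-zero failure mode.
import numpy as np

_human = np.array([1, 3, 4, 2, 3, 1, 3, 4, 2, 1], dtype=float)
_system = np.full(10, 2.54)

_z_human = (_human - _human.mean()) / _human.std(ddof=1)
with np.errstate(invalid='ignore'):
    _z_system = (_system - _system.mean()) / _system.std(ddof=1)  # 0 / 0 -> NaN

print(np.isnan(_z_system.mean() - _z_human.mean()))  # True: DSM is undefined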
def compute_agreement_one_system_one_rater_pair(df_scores,
                                                system_id,
                                                rater_id1,
                                                rater_id2,
                                                include_mean=False):
    """
    Evaluate the given system against the given pair of raters.

    This function computes the agreement metrics between the scores assigned
    by the given simulated system (``system_id``) and the scores assigned by
    the two simulated raters ``rater_id1`` and ``rater_id2``. The agreement
    metrics computed are: Pearson's correlation, adjusted R^2,
    quadratically-weighted kappa, and the difference between the human-human
    Pearson correlation and the human-machine Pearson correlation (commonly
    known as "degradation"). All 4 metrics are computed against the scores of
    the first rater in the pair and, if ``include_mean`` is ``True``, also
    against the average of the scores assigned by both raters in the pair.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores. This is usually one
        of the data frames returned by the
        ``simulation.dataset.Dataset.to_frame()`` method.
    system_id : str
        The ID for the simulated system to be evaluated. This must be a
        column in ``df_scores``.
    rater_id1 : str
        The ID for the first rater in the rater pair being used to evaluate
        the given system. This must be a column in ``df_scores``.
    rater_id2 : str
        The ID for the second rater in the rater pair being used to evaluate
        the given system.
    include_mean : bool, optional
        If set to ``True``, also include the metric values computed against
        the average of the scores assigned by both raters in the given pair.

    Returns
    -------
    metrics_series : list of pandas.Series
        A list containing 1 or 2 pandas series depending on the value of
        ``include_mean``. If it is ``True``, this list contains two series:
        the first containing the values of the metrics against the average
        of the two rater scores and the second containing the values of the
        metrics against the scores of the first rater. If ``include_mean``
        is ``False``, this list contains a single series: the one with the
        metric values against the scores of the first rater.
        Each returned series contains the following entries:

        1. "r" - the Pearson's correlation between the system scores and
           the reference human scores.
        2. "QWK" - the quadratically-weighted kappa between the system
           scores and the reference human scores.
        3. "R2" - the R^2 score between the system scores and the reference
           human scores.
        4. "degradation" - the difference between the human-human
           correlation and the human-machine correlation. Note that this
           entry may not be included in the output if any of the scores for
           either of the two simulated raters are null, e.g., if some of
           the responses are single-scored.
        5. "reference" - indicates whether the metric values were computed
           against the average of the two rater scores (``h1-h2 mean``) or
           against the first rater's scores (``h1``).
""" # compute the inter-rater correlation that we need for degradation try: rater1_rater2_correlation = pearsonr(df_scores[rater_id1], df_scores[rater_id2])[0] except ValueError: rater1_rater2_correlation = None pass # we only want these 3 metrics to start with chosen_metrics = ['wtkappa', 'corr', 'R2'] # compute the metrics against the first rater as a series h1_metric_values = Analyzer.metrics_helper(df_scores[rater_id1], df_scores[system_id]) h1_metric_values = h1_metric_values[chosen_metrics] # compute the degradation values if rater1_rater2_correlation: h1_metric_values['degradation'] = rater1_rater2_correlation - h1_metric_values['corr'] # add a new column called "reference" indicating whether we used # the h1-h2 average score for just the h1 score h1_metric_values['reference'] = 'h1' # rename some of the metrics to have more recognizable names h1_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True) # compute the metrics against the average ot the two rater scores # as a series if it was requested if include_mean: mean_metric_values = Analyzer.metrics_helper(df_scores[[rater_id1, rater_id2]].mean(axis=1), df_scores[system_id]) mean_metric_values = mean_metric_values[chosen_metrics] if rater1_rater2_correlation: mean_metric_values['degradation'] = rater1_rater2_correlation - mean_metric_values['corr'] mean_metric_values['reference'] = 'h1-h2 mean' mean_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True) # return the right number of metric series ans = [mean_metric_values, h1_metric_values] if include_mean else [h1_metric_values] return ans