Example #1
    def test_metrics_helper_population_sds(self):
        df_new_features = pd.read_csv(join(self.test_dir, 'data', 'files', 'train.csv'))
        # compute the metrics when not specifying the population SDs
        computed_metrics1 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'])
        expected_metrics1 = pd.Series({'N': 500.0,
                                       'R2': 0.65340566606389394,
                                       'RMSE': 0.47958315233127197,
                                       'SMD': 0.03679030063229779,
                                       'adj_agr': 100.0,
                                       'corr': 0.82789026370069529,
                                       'exact_agr': 77.0,
                                       'h_max': 6.0,
                                       'h_mean': 3.4199999999999999,
                                       'h_min': 1.0,
                                       'h_sd': 0.81543231461565147,
                                       'kappa': 0.6273493195074531,
                                       'sys_max': 6.0,
                                       'sys_mean': 3.4500000000000002,
                                       'sys_min': 1.0,
                                       'sys_sd': 0.81782496620652367,
                                       'wtkappa': 0.8273273273273274})

        # and now compute them specifying the population SDs
        computed_metrics2 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'],
                                                    population_human_score_sd=0.5,
                                                    population_system_score_sd=0.4,
                                                    smd_method='williamson')
        # the only number that should change is the SMD
        expected_metrics2 = expected_metrics1.copy()
        expected_metrics2['SMD'] = 0.066259

        assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index())
        assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index())
    def test_metrics_helper_population_sds(self):
        df_new_features = pd.read_csv(join(self.test_dir, 'data', 'files', 'train.csv'))
        # compute the metrics when not specifying the population SDs
        computed_metrics1 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'])
        expected_metrics1 = pd.Series({'N': 500.0,
                                       'R2': 0.65340566606389394,
                                       'RMSE': 0.47958315233127197,
                                       'SMD': 0.036736365006090885,
                                       'adj_agr': 100.0,
                                       'corr': 0.82789026370069529,
                                       'exact_agr': 77.0,
                                       'h_max': 6.0,
                                       'h_mean': 3.4199999999999999,
                                       'h_min': 1.0,
                                       'h_sd': 0.81543231461565147,
                                       'kappa': 0.6273493195074531,
                                       'sys_max': 6.0,
                                       'sys_mean': 3.4500000000000002,
                                       'sys_min': 1.0,
                                       'sys_sd': 0.81782496620652367,
                                       'wtkappa': 0.82732732732732728})

        # and now compute them specifying the population SDs
        computed_metrics2 = Analyzer.metrics_helper(df_new_features['score'],
                                                    df_new_features['score2'],
                                                    population_human_score_sd=0.5,
                                                    population_system_score_sd=0.4)
        # the only number that should change is the SMD
        expected_metrics2 = expected_metrics1.copy()
        expected_metrics2['SMD'] = 0.066259

        assert_series_equal(computed_metrics1.sort_index(), expected_metrics1.sort_index())
        assert_series_equal(computed_metrics2.sort_index(), expected_metrics2.sort_index())
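In both variants above, the expected SMD of 0.066259 when population SDs are supplied is consistent with standardizing the mean difference by the pooled population SDs (Williamson's formulation). A quick arithmetic check, assuming that formulation:

# hypothetical check of the expected SMD above; the pooled-SD formula is
# assumed here, not taken from Analyzer.metrics_helper itself
pop_human_sd, pop_system_sd = 0.5, 0.4
mean_diff = 3.45 - 3.42  # sys_mean - h_mean from the expected metrics
smd = mean_diff / ((pop_human_sd ** 2 + pop_system_sd ** 2) / 2) ** 0.5
print(round(smd, 6))  # -> 0.066259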
Example #3
 def test_that_metrics_helper_works_for_data_with_one_row(self):
     # There should be NaNs for SMD, correlations and both sds
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         evals = Analyzer.metrics_helper(self.human_scores[0:1],
                                         self.system_scores[0:1])
         assert_equal(evals.isnull().values.sum(), 4)
Example #4
 def test_that_correlation_helper_works_for_data_with_four_rows(self):
     # this should compute marginal correlations and return a unity
     # matrix for partial correlations
     retval = Analyzer.correlation_helper(self.df_features[:4], 'sc1',
                                          'group')
     assert_equal(retval[0].isnull().values.sum(), 0)
     assert_almost_equal(abs(retval[1].values).sum(), 3)
Example #5
 def test_that_correlation_helper_works_for_data_with_two_rows(self):
     # this should return 1/-1 for marginal correlations and nans for
     # partial correlations
     retval = Analyzer.correlation_helper(self.df_features[:2], 'sc1',
                                          'group')
     assert_equal(abs(retval[0].values).sum(), 3)
     assert_equal(retval[1].isnull().values.sum(), 3)
Example #6
 def test_that_correlation_helper_works_for_data_with_three_rows(self):
     # this should compute marginal correlations but return NaNs for
     # partial correlations
     retval = Analyzer.correlation_helper(self.df_features[:3], 'sc1',
                                          'group')
     assert_equal(retval[0].isnull().values.sum(), 0)
     assert_equal(retval[1].isnull().values.sum(), 3)
Example #8
 def test_compute_disattenuated_correlations_negative_human(self):
     hm_corr = pd.Series([0.9, 0.8], index=['All data', 'GROUP1'])
     hh_corr = pd.Series([-0.03, 0.64], index=['All data', 'GROUP1'])
     df_dis_corr = Analyzer.compute_disattenuated_correlations(
         hm_corr, hh_corr)
     assert_equal(len(df_dis_corr), 2)
     assert_array_equal(df_dis_corr['corr_disattenuated'], [np.nan, 1.0])
Example #9
 def test_correlation_helper_for_group_with_one_row_and_length(self):
     # this should return data frames with nans for the group with 1 row
     retval = Analyzer.correlation_helper(self.df_features_with_groups_and_length[:6],
                                          'sc1', 'group', include_length=True)
     for df in retval:
         assert_equal(len(df), 2)
         assert_equal(len(df.columns), 3)
Example #10
 def test_compute_pca_less_components_than_features(self):
     # test PCA when we have fewer components than features
     df = pd.DataFrame({'a': range(100)})
     for i in range(100):
         df[i] = df['a'] * i
     (components, variance) = Analyzer.compute_pca(df, df.columns)
     assert_equal(len(components.columns), 100)
     assert_equal(len(variance.columns), 100)
Example #11
 def test_compute_disattenuated_correlations_mismatched_indices(self):
     hm_corr = pd.Series([0.9, 0.6], index=['All data', 'GROUP2'])
     hh_corr = pd.Series([0.81, 0.64], index=['All data', 'GROUP1'])
     df_dis_corr = Analyzer.compute_disattenuated_correlations(
         hm_corr, hh_corr)
     assert_equal(len(df_dis_corr), 3)
     assert_array_equal(df_dis_corr['corr_disattenuated'],
                        [1.0, np.nan, np.nan])
Example #12
 def test_compute_disattenuated_correlations_single_human(self):
     hm_corr = pd.Series([0.9, 0.8, 0.6],
                         index=['raw', 'raw_trim', 'raw_trim_round'])
     hh_corr = pd.Series([0.81], index=[''])
     df_dis_corr = Analyzer.compute_disattenuated_correlations(hm_corr,
                                                               hh_corr)
     assert_equal(len(df_dis_corr), 3)
     assert_equal(df_dis_corr.loc['raw', 'corr_disattenuated'], 1.0)
    def test_that_correlation_helper_works_for_data_with_the_same_label(self):

        # this should return two data frames with nans
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            retval = Analyzer.correlation_helper(self.df_features_same_score, 'sc1', 'group')
            assert_equal(retval[0].isnull().values.sum(), 3)
            assert_equal(retval[1].isnull().values.sum(), 3)
 def test_that_correlation_helper_works_for_data_with_one_row(self):
     # this should return two data frames with nans
     # we expect a runtime warning here so let's suppress it
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         retval = Analyzer.correlation_helper(self.df_features[:1], 'sc1', 'group')
         assert_equal(retval[0].isnull().values.sum(), 3)
         assert_equal(retval[1].isnull().values.sum(), 3)
Example #17
 def test_correlation_helper_for_data_with_four_rows(self):
     # this should compute marginal correlations and return a unity
     # matrix for partial correlations
     # it should also raise a UserWarning
     with warnings.catch_warnings(record=True) as warning_list:
         retval = Analyzer.correlation_helper(self.df_features[:4], 'sc1', 'group')
     assert_equal(retval[0].isnull().values.sum(), 0)
     assert_almost_equal(np.abs(retval[1].values).sum(), 0.9244288637889855)
     assert issubclass(warning_list[-1].category, UserWarning)
Example #18
 def test_that_correlation_helper_works_for_data_with_one_row(self):
     # this should return two data frames with nans
     # we expect a runtime warning here so let's suppress it
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         retval = Analyzer.correlation_helper(self.df_features[:1], 'sc1',
                                              'group')
         assert_equal(retval[0].isnull().values.sum(), 3)
         assert_equal(retval[1].isnull().values.sum(), 3)
 def test_compute_disattenuated_correlations_mismatched_indices(self):
     hm_corr = pd.Series([0.9, 0.6],
                         index=['All data', 'GROUP2'])
     hh_corr = pd.Series([0.81, 0.64],
                         index=['All data', 'GROUP1'])
     df_dis_corr = Analyzer.compute_disattenuated_correlations(hm_corr,
                                                               hh_corr)
     assert_equal(len(df_dis_corr), 3)
     assert_array_equal(df_dis_corr['corr_disattenuated'], [1.0, np.nan, np.nan])
Example #21
    def test_that_correlation_helper_works_for_data_with_the_same_label(self):

        # this should return two data frames with nans
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            retval = Analyzer.correlation_helper(self.df_features_same_score,
                                                 'sc1', 'group')
            assert_equal(retval[0].isnull().values.sum(), 3)
            assert_equal(retval[1].isnull().values.sum(), 3)
Example #22
 def test_compute_disattenuated_correlations_matching_human(self):
     hm_corr = pd.Series([0.9, 0.4, 0.6],
                         index=['All data', 'GROUP1', 'GROUP2'])
     hh_corr = pd.Series([0.81, 0.64, 0.36],
                         index=['All data', 'GROUP1', 'GROUP2'])
     df_dis_corr = Analyzer.compute_disattenuated_correlations(hm_corr,
                                                               hh_corr)
     assert_equal(len(df_dis_corr), 3)
     assert_array_equal(df_dis_corr['corr_disattenuated'], [1.0, 0.5, 1.0])
 def test_compute_disattenuated_correlations_negative_human(self):
     hm_corr = pd.Series([0.9, 0.8],
                         index=['All data', 'GROUP1'])
     hh_corr = pd.Series([-0.03, 0.64],
                         index=['All data', 'GROUP1'])
     df_dis_corr = Analyzer.compute_disattenuated_correlations(hm_corr,
                                                               hh_corr)
     assert_equal(len(df_dis_corr), 2)
     assert_array_equal(df_dis_corr['corr_disattenuated'], [np.nan, 1.0])
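The expected values in the disattenuated-correlation tests above are consistent with Spearman's correction for attenuation: the human-machine correlation is divided by the square root of the human-human correlation, and the result is undefined (NaN) when the human-human correlation is negative. A quick check of the "matching human" case, assuming that formula:

# hypothetical arithmetic check; the formula is inferred from the expected values
import numpy as np

hm_corr = np.array([0.9, 0.4, 0.6])     # human-machine correlations
hh_corr = np.array([0.81, 0.64, 0.36])  # human-human correlations
print(hm_corr / np.sqrt(hh_corr))       # -> 1.0, 0.5, 1.0, matching 'corr_disattenuated'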
 def test_that_metrics_helper_works_for_data_with_the_same_label(self):
     # There should be NaNs for correlation.
     # Note that for a dataset with a single response
     # kappas will be 0 or 1
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         evals = Analyzer.metrics_helper(self.same_human_scores,
                                         self.system_scores)
         assert_equal(evals.isnull().values.sum(), 1)
Example #25
 def test_that_metrics_helper_works_for_data_with_the_same_label(self):
     # There should be NaNs for correlation and SMD.
     # Note that for a dataset with a single response
     # kappas will be 0 or 1
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         evals = Analyzer.metrics_helper(self.same_human_scores,
                                         self.system_scores)
         assert_equal(evals.isnull().values.sum(), 2)
Example #26
 def test_compute_pca_less_samples_than_features(self):
     # test PCA when we have fewer samples than
     # features. In this case the number of components
     # equals the number of samples.
     df = pd.DataFrame({'a': range(50)})
     for i in range(100):
         df[i] = df['a'] * i
     (components, variance) = Analyzer.compute_pca(df, df.columns)
     assert_equal(len(components.columns), 50)
     assert_equal(len(variance.columns), 50)
Example #27
 def test_that_correlation_helper_works_for_data_with_the_same_human_score(self):
     # this test should raise a UserWarning because the determinant is very close to
     # zero. It also raises a RuntimeWarning because
     # the variance of the human scores is 0.
     with warnings.catch_warnings(record=True) as warning_list:
         warnings.filterwarnings('ignore', category=RuntimeWarning)
         retval = Analyzer.correlation_helper(self.df_features_same_score, 'sc1', 'group')
         assert_equal(retval[0].isnull().values.sum(), 3)
         assert_equal(retval[1].isnull().values.sum(), 3)
         assert issubclass(warning_list[-1].category, UserWarning)
Example #28
 def test_compute_pca_less_samples_than_features(self):
     # test PCA when we have fewer samples than
     # features. In this case the number of components
     # equals the number of samples.
     dfs = []
     # to avoid inserting too many columns,
     # we create a list of data frames and then
     # concatenate them together
     for i in range(1, 101):
         dfs.append(pd.DataFrame({i: pd.Series(range(50)) * i}))
     df = pd.concat(dfs, axis=1)
     (components, variance) = Analyzer.compute_pca(df, df.columns)
     assert_equal(len(components.columns), 50)
     assert_equal(len(variance.columns), 50)
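The PCA tests above rely on the fact that the number of principal components can never exceed min(n_samples, n_features). A small illustrative check with scikit-learn, used here only for illustration; Analyzer.compute_pca may differ in its details:

# with more features than samples, PCA keeps at most n_samples components
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(50, 100)  # 50 samples, 100 features
pca = PCA().fit(X)
print(pca.components_.shape[0])             # -> 50, i.e. min(n_samples, n_features)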
Example #29
    def test_metrics_helper_zero_system_sd(self):
        human_scores = [1, 3, 4, 2, 3, 1, 3, 4, 2, 1]
        system_score = [2.54] * 10
        computed_metrics1 = Analyzer.metrics_helper(human_scores,
                                                    system_score)
        expected_metrics1 = pd.Series({'N': 10,
                                       'R2': -0.015806451612903283,
                                       'RMSE': 1.122319027727856,
                                       'SMD': 0.11927198519188371,
                                       'adj_agr': 50.0,
                                       'corr': None,
                                       'exact_agr': 0,
                                       'h_max': 4,
                                       'h_mean': 2.4,
                                       'h_min': 1.0,
                                       'h_sd': 1.1737877907772674,
                                       'kappa': 0,
                                       'sys_max': 2.54,
                                       'sys_mean': 2.54,
                                       'sys_min': 2.54,
                                       'sys_sd': 0,
                                       'wtkappa': 0})
        # now compute DSM
        computed_metrics2 = Analyzer.metrics_helper(human_scores,
                                                    system_score,
                                                    use_diff_std_means=True)

        # the only difference should be that SMD is replaced by DSM
        expected_metrics2 = expected_metrics1.copy()
        expected_metrics2.drop("SMD", inplace=True)
        expected_metrics2['DSM'] = None
        assert_series_equal(computed_metrics1.sort_index(),
                            expected_metrics1.sort_index(),
                            check_dtype=False)
        assert_series_equal(computed_metrics2.sort_index(),
                            expected_metrics2.sort_index(),
                            check_dtype=False)
Example #30
def run_evaluation(config_file_or_obj, output_dir):
    """
    Run an `rsmeval` experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmeval')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Make sure prediction file can be located
    if not DataReader.locate_files(configuration['predictions_file'],
                                   configpath):
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(
                                    configuration['predictions_file']))

    scale_with = configuration.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.

    # Check whether we want to do scaling
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # The paths to files and names for data container properties
    paths = ['predictions_file']
    names = ['predictions']

    # If we want to do scaling, get the scale file
    if do_scaling:

        # Make sure scale file can be located
        scale_file_location = DataReader.locate_files(scale_with, configpath)
        if not scale_file_location:
            raise FileNotFoundError('Could not find scaling file {}.'
                                    ''.format(scale_with))

        paths.append('scale_with')
        names.append('scale')

    # Get the paths, names, and converters for the DataReader
    (file_names, file_paths) = configuration.get_names_and_paths(paths, names)

    file_paths = DataReader.locate_files(file_paths, configpath)

    converters = {'predictions': configuration.get_default_converter()}

    logger.info('Reading predictions: {}.'.format(
        configuration['predictions_file']))

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing predictions.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmeval')

    logger.info('Saving pre-processed predictions and metadata to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   new_names_dict={
                                       'pred_test': 'pred_processed',
                                       'test_excluded':
                                       'test_excluded_responses'
                                   },
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    # do the data composition stats
    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmeval(
         processed_container, processed_config)
    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    for_pred_data_container = analyzed_container + processed_container

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         for_pred_data_container, analyzed_config)

    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir, context='rsmeval')
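A minimal invocation sketch for run_evaluation, assuming an rsmeval JSON configuration file already exists on disk; the file and directory names below are illustrative only:

# hypothetical usage; the configuration file must define at least
# 'experiment_id' and 'predictions_file' (see the checks above)
run_evaluation('rsmeval_config.json', '/tmp/rsmeval_experiment')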
def compute_agreement_one_system_one_rater_pair(df_scores,
                                                system_id,
                                                rater_id1,
                                                rater_id2,
                                                include_mean=False):
    """
    Evaluate the given system against the given pair of raters.

    This function computes the agreement metrics between the scores
    assigned by the given simulated system (``system_id``) against the scores
    assigned by the two simulated raters ``rater_id1`` and ``rater_id2``.

    The agreement metrics computed are: Pearson's correlation, adjusted R^2,
    quadratically-weighted kappa, and the difference between the human-human
    Pearson correlation and the human-machine Pearson correlation (commonly
    known as "degradation"). All 4 metrics are computed against the scores
    of the first rater in the pair and, if ``include_mean`` is ``True``, also
    against the average of the scores assigned by both raters in the pair.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores.
        This is usually one of the data frames returned
        by the ``simulation.dataset.Dataset.to_frame()``
        method.
    system_id : str
        The ID for the simulated system to be evaluated.
        This must be a column in ``df_scores``.
    rater_id1 : str
        The ID for the first rater in the rater pair
        being used to evaluate the given system.
        This must be a column in ``df_scores``.
    rater_id2 : str
        The ID for the second rater in the rater pair
        being used to evaluate the given system.
    include_mean : bool, optional
        If set to ``True``, also include the metric values
        computed against the average of the scores assigned
        by both raters in the given pair.

    Returns
    -------
    metrics_series : list of pandas.Series
        A list containing 1 or 2 pandas series depending on the value
        of ``include_mean``. If it is ``True``, this list contains
        two series: the first containing the values of the metrics
        against the average of the two rater scores and the second
        containing the values of the metrics against the scores of
        the first rater. If ``include_mean`` is ``False``, this list
        only contains a single series: the one containing the metric
        values against the scores of the first rater. Any series
        returned will contain the following columns:
        1. "r" - the Pearson's correlation between the system score
           and the reference (average or first rater) scores.
        2. "QWK" - the quadratically-weighted kappa between the system score
           and the reference (average or first rater) scores.
        3. "R2" - the R^2 score between the system score
           and the reference (average or first rater) scores.
        4. "degradation" - the difference between the human-human correlation
           score and the human-machine correlation score. Note that this column
           may not be included in the output if any of the scores for either of
           the two simulated raters are null, e.g., if some of the responses are
           single scored.
        5. "reference" - a column containing whether the metric values were
           computed against the average of the two rater scores (``h1-h2 mean``)
           or the first rater scores (``h1``).
    """
    # compute the inter-rater correlation that we need for degradation
    try:
        rater1_rater2_correlation = pearsonr(df_scores[rater_id1], df_scores[rater_id2])[0]
    except ValueError:
        rater1_rater2_correlation = None

    # we only want these 3 metrics to start with
    chosen_metrics = ['wtkappa', 'corr', 'R2']

    # compute the metrics against the first rater as a series
    h1_metric_values = Analyzer.metrics_helper(df_scores[rater_id1], df_scores[system_id])
    h1_metric_values = h1_metric_values[chosen_metrics]

    # compute the degradation values
    if rater1_rater2_correlation:
        h1_metric_values['degradation'] = rater1_rater2_correlation - h1_metric_values['corr']

    # add a new column called "reference" indicating whether we used
    # the h1-h2 average score or just the h1 score
    h1_metric_values['reference'] = 'h1'

    # rename some of the metrics to have more recognizable names
    h1_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True)

    # compute the metrics against the average of the two rater scores
    # as a series if it was requested
    if include_mean:
        mean_metric_values = Analyzer.metrics_helper(df_scores[[rater_id1, rater_id2]].mean(axis=1),
                                                     df_scores[system_id])
        mean_metric_values = mean_metric_values[chosen_metrics]

        if rater1_rater2_correlation:
            mean_metric_values['degradation'] = rater1_rater2_correlation - mean_metric_values['corr']
        mean_metric_values['reference'] = 'h1-h2 mean'
        mean_metric_values.rename({'wtkappa': 'QWK', 'corr': 'r'}, inplace=True)

    # return the right number of metric series
    ans = [mean_metric_values, h1_metric_values] if include_mean else [h1_metric_values]
    return ans
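A minimal, hypothetical call sketch for the function above; the data frame and its column names are illustrative only:

# toy scores for one simulated system and two simulated raters
import pandas as pd

df_scores = pd.DataFrame({'system_1': [2.8, 3.1, 3.9, 2.5],
                          'h_1': [3, 3, 4, 2],
                          'h_2': [2, 3, 4, 3]})
metric_series = compute_agreement_one_system_one_rater_pair(df_scores,
                                                            'system_1',
                                                            'h_1',
                                                            'h_2',
                                                            include_mean=True)
# metric_series[0] holds the metrics against the h1-h2 mean,
# metric_series[1] holds the metrics against h1 alone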
Example #32
def run_experiment(config_file_or_obj, output_dir):
    """
    Run an RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names, file_paths_org) = configuration.get_names_and_paths(
        ['train_file', 'test_file', 'features', 'feature_subset_file'],
        ['train', 'test', 'feature_specs', 'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [
            file_paths_org[idx] for idx, path in enumerate(file_paths)
            if path is None
        ]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {
        'train': configuration.get_default_converter(),
        'test': configuration.get_default_converter()
    }

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {
        'train_excluded': 'train_excluded_responses',
        'test_excluded': 'test_excluded_responses',
        'train_length': 'train_response_lengths',
        'train_flagged': 'train_responses_with_excluded_flags',
        'test_flagged': 'test_responses_with_excluded_flags'
    }

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(
        csvdir,
        processed_container, [
            'train_features', 'test_features', 'train_metadata',
            'test_metadata', 'train_other_columns', 'test_other_columns',
            'train_preprocessed_features', 'test_preprocessed_features',
            'train_excluded', 'test_excluded', 'train_length',
            'test_human_scores', 'train_flagged', 'test_flagged'
        ],
        rename_dict,
        file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config, processed_container, csvdir, figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(
        selected_features)]
    selected_feature_dataset_dict = {
        'name': 'selected_feature_info',
        'frame': df_selected_feature_info
    }

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(
        csvdir,
        features_data_container,
        dataframe_names=['selected_feature_info'],
        new_names_dict={'selected_feature_info': 'feature'},
        file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration[
        'predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config, pred_data_container) = modeler.predict_train_and_test(
        train_for_prediction, test_for_prediction, processed_config)

    # Write out files
    writer.write_experiment_output(
        csvdir,
        pred_data_container,
        new_names_dict={'pred_test': 'pred_processed'},
        file_format=file_format)

    original_coef_file = join(
        csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                            file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
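As the docstring notes, run_experiment also accepts an in-memory Configuration object instead of a file path. A hedged sketch, assuming Configuration can be constructed from a dictionary holding the same fields a configuration file would contain:

# hypothetical in-memory configuration; field names and values are illustrative
config_dict = {'experiment_id': 'toy_experiment',
               'model': 'LinearRegression',
               'train_file': 'train.csv',
               'test_file': 'test.csv'}
run_experiment(Configuration(config_dict), '/tmp/rsmtool_experiment')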
Example #33
    def test_correlation_helper(self):

        # test that there are no nans for data frame with 10 values
        retval = Analyzer.correlation_helper(self.df_features, 'sc1', 'group')
        assert_equal(retval[0].isnull().values.sum(), 0)
        assert_equal(retval[1].isnull().values.sum(), 0)
Example #34
 def test_correlation_helper_for_one_group_with_one_row(self):
     # this should return data frames with nans for the group with 1 row
     retval = Analyzer.correlation_helper(self.df_features_with_groups[:6], 'sc1', 'group')
     assert_equal(len(retval[0]), 2)
     assert_equal(len(retval[1]), 2)
     assert_equal(retval[0].isnull().values.sum(), 3)
 def test_that_correlation_helper_works_for_data_with_four_rows(self):
     # this should compute marginal correlations and return a unity
     # matrix for partial correlations
     retval = Analyzer.correlation_helper(self.df_features[:4], 'sc1', 'group')
     assert_equal(retval[0].isnull().values.sum(), 0)
     assert_almost_equal(abs(retval[1].values).sum(), 3)
 def test_that_correlation_helper_works_for_data_with_three_rows(self):
     # this should compute marginal correlations but return NaNs for
     # partial correlations
     retval = Analyzer.correlation_helper(self.df_features[:3], 'sc1', 'group')
     assert_equal(retval[0].isnull().values.sum(), 0)
     assert_equal(retval[1].isnull().values.sum(), 3)
Example #37
 def test_correlation_helper_for_data_with_one_row(self):
     # this should return two data frames with nans
     retval = Analyzer.correlation_helper(self.df_features[:1], 'sc1', 'group')
     assert_equal(retval[0].isnull().values.sum(), 3)
     assert_equal(retval[1].isnull().values.sum(), 3)
Example #39
 def test_correlation_helper_for_data_with_groups(self):
     retval = Analyzer.correlation_helper(self.df_features_with_groups, 'sc1', 'group')
     assert_equal(len(retval[0]), 2)
     assert_equal(len(retval[1]), 2)
 def test_that_correlation_helper_works_for_data_with_two_rows(self):
     # this should return 1/-1 for marginal correlations and nans for
     # partial correlations
     retval = Analyzer.correlation_helper(self.df_features[:2], 'sc1', 'group')
     assert_equal(abs(retval[0].values).sum(), 3)
     assert_equal(retval[1].isnull().values.sum(), 3)
Example #41
 def test_correlation_helper_for_groups_and_length(self):
     retval = Analyzer.correlation_helper(self.df_features_with_groups_and_length,
                                          'sc1', 'group', include_length=True)
     for df in retval:
         assert_equal(len(df), 2)
         assert_equal(len(df.columns), 3)
Example #42
def run_experiment(config_file_or_obj,
                   output_dir):
    """
    Run an RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx] for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)