Example #1
def test_check_flag_column_convert_to_list_keep_numeric():
    config = {"flag_column": {"advisories": 123}}
    flag_dict = check_flag_column(config)
    eq_(flag_dict, {"advisories": [123]})
Example #2
def test_check_flag_column_wrong_format():
    config = {"flag_column": "[advisories]"}
    check_flag_column(config)
Example #3
def test_check_flag_column_no_values():
    config = {"flag_column": None}
    flag_dict = check_flag_column(config)
    eq_(flag_dict, {})
Example #4
def test_check_flag_column_convert_to_list():
    config = {"flag_column": {"advisories": "0"}}
    flag_dict = check_flag_column(config)
    eq_(flag_dict, {"advisories": ['0']})
Example #5
def test_check_flag_column():
    input_dict = {"advisory flag": ['0']}
    config = {"flag_column": input_dict}
    output_dict = check_flag_column(config)
    eq_(input_dict, output_dict)
Example #6
def test_check_flag_column_keep_numeric():
    input_dict = {"advisory flag": [1, 2, 3]}
    config = {"flag_column": input_dict}
    output_dict = check_flag_column(config)
    eq_(output_dict, {"advisory flag": [1, 2, 3]})
Example #7
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    # if this is the 'expm' directory, then go
    # up one level.
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'.format(config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'.format(config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as strings
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default.
    # We only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file,
    # we append it to the output anyway as long as
    # it is in the input file.

    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in {}. Please make sure all response IDs are unique and re-run the tool.".format(id_column))



    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: {}'.format(input_features_file, missing_features))
    extra_features = set(input_feature_columns).difference(required_features + [id_column])
    if extra_features:
        logger.warning('The following extraneous features will be ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and add
    # sc1 to preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered, feature_name, 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after "
                         "filtering out non-numeric feature values. No analysis "
                         "will be run")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(feature_values,
                                                                    feature_name,
                                                                    feature_transformation,
                                                                    train_feature_mean,
                                                                    train_feature_sd,
                                                                    exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] - train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = df_features_preprocessed[feature_name] * feature_weight

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)
        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(join(experiment_output_dir, '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] - train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min, trim_max)
    df_predictions['raw_trim_round'] = np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min, trim_max)
    df_predictions['scale_trim_round'] = np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(df_predictions,
                                                df_input[['spkitemid'] + columns_to_copy])
        assert(len(df_predictions) == len(df_predictions_with_metadata))
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
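For context, the sketch below shows how compute_and_save_predictions might be driven from a configuration file covering the fields it reads. All paths, IDs and column names are placeholders, and check_main_config() is assumed to fill in defaults for any fields that are omitted.

import json

# hypothetical rsmpredict configuration; every value here is illustrative
config = {
    "experiment_id": "my_experiment",
    "experiment_dir": "existing_rsmtool_experiment",  # must contain an 'output' subdirectory
    "input_features_file": "new_responses.csv",       # features to generate predictions for
    "id_column": "response_id",
    "candidate_column": "candidate",
    "flag_column": {"advisories": [0]},
    "subgroups": ["L1"]
}

with open('rsmpredict_config.json', 'w') as config_fh:
    json.dump(config, config_fh)

# predictions go to the output file; pre-processed feature values
# are also saved because a feats_file path is given
compute_and_save_predictions('rsmpredict_config.json',
                             'predictions.csv',
                             'preprocessed_features.csv')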
Example #8
def run_evaluation(config_file, output_dir):
    """
    Run RSMTool evaluation experiment using the given configuration
    file and generate all evaluation outputs in the given directory.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # load the information from the config file
    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmeval')

    # get the directory where the config file lives
    # if this is the 'expm' directory, then go
    # up one level.
    configpath = dirname(config_file)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column name for the labels for the training and testing data
    human_score_column = config_obj['human_score_column']
    system_score_column = config_obj['system_score_column']

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # if the human score column is the same as the
    # second human score column, raise an error
    if human_score_column == second_human_score_column:
        raise ValueError("'human_score_column' and "
                         "'second_human_score_column' "
                         "cannot have the same value.")

    # get the column name that will hold the ID for
    # both the training and the test data
    id_column = config_obj['id_column']

    # get the specified trim min and max, if any
    # and make sure they are numeric
    spec_trim_min, spec_trim_max = get_trim_min_max(config_obj)

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # get the candidate column if any and convert it to string
    candidate_column = config_obj['candidate_column']

    general_report_sections = config_obj['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = config_obj['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = config_obj['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(
            custom_report_section_paths, configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    # check all section values and the order and get the
    # ordered list of notebook files
    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=None,
                                                       context='rsmeval')
    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        logger.warning("'exclude_zero_scores' is set to True but "
                       " 'trim_min' is set to 0. This may cause "
                       " unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = check_flag_column(config_obj)

    # do we have the training set predictions and human scores CSV file
    scale_with = config_obj.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.

    # we need to scale if and only if a CSV file is specified
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # use scaled predictions for the analyses unless
    # we were told not to
    use_scaled_predictions = (scale_with is not None)

    # log an appropriate message
    if scale_with is None:
        message = ('Assuming given system predictions '
                   'are unscaled and will be used as such.')
    elif scale_with == 'asis':
        message = ('Assuming given system predictions '
                   'are already scaled and will be used as such.')
    else:
        message = ('Assuming given system predictions '
                   'are unscaled and will be scaled before use.')
    logger.info(message)

    # load the predictions from disk and make sure that the `id_column`
    # is read in as a string
    predictions_file_location = locate_file(config_obj['predictions_file'],
                                            configpath)
    if not predictions_file_location:
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(
                                    config_obj['predictions_file']))
    else:
        logger.info(
            'Reading predictions: {}'.format(predictions_file_location))
        string_columns = [id_column, candidate_column] + subgroups
        converter_dict = dict([(column, str) for column in string_columns
                               if column])

        df_pred = pd.read_csv(predictions_file_location,
                              converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    missing_columns = set([id_column, human_score_column,
                           system_score_column]).difference(df_pred.columns)
    if missing_columns:
        raise KeyError('Columns {} from the config file do not exist '
                       'in the predictions file.'.format(missing_columns))

    df_pred = rename_default_columns(df_pred, [], id_column,
                                     human_score_column,
                                     second_human_score_column, None,
                                     system_score_column, candidate_column)

    # check that the id_column contains unique values
    if df_pred['spkitemid'].size != df_pred['spkitemid'].unique().size:
        raise ValueError("The data contains duplicate response IDs "
                         "in '{}'. Please make sure all response IDs "
                         "are unique and re-run the tool.".format(id_column))

    df_pred = check_subgroups(df_pred, subgroups)

    # filter out the responses based on flag columns
    (df_responses_with_requested_flags,
     df_responses_with_excluded_flags) = filter_on_flag_columns(
         df_pred, flag_column_dict)

    # filter out rows that have non-numeric or zero human scores
    df_filtered, df_excluded = filter_on_column(
        df_responses_with_requested_flags,
        'sc1',
        'spkitemid',
        exclude_zeros=exclude_zero_scores)

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric human scores. No further analysis "
                         "can be run. ")

    # Change all non-numeric machine scores in excluded
    # data to NaNs for consistency with rsmtool.
    # NOTE: This will *not* work if *all* of the values
    # in column are non-numeric. This is a known bug in
    # pandas: https://github.com/pydata/pandas/issues/9589
    # Therefore, we need to add an additional check after this.
    df_excluded['raw'] = pd.to_numeric(df_excluded['raw'],
                                       errors='coerce').astype(float)

    # filter out the non-numeric machine scores from the rest of the data
    newdf, newdf_excluded = filter_on_column(df_filtered,
                                             'raw',
                                             'spkitemid',
                                             exclude_zeros=False)

    del df_filtered
    df_filtered_pred = newdf

    # make sure that the remaining data frame is not empty
    if len(df_filtered_pred) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric machine scores. No further analysis "
                         "can be run. ")

    df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # set default values for scaling
    scale_pred_mean = 0
    scale_pred_sd = 1
    scale_human_mean = 0
    scale_human_sd = 1

    if do_scaling:
        scale_file_location = locate_file(scale_with, configpath)
        if not scale_file_location:
            raise FileNotFoundError(
                'Error: scaling file {} not found.\n'.format(scale_with))
        else:
            logger.info('Reading scaling file: {}'.format(scale_file_location))
            df_scale_with = pd.read_csv(scale_file_location)

        if 'sc1' not in df_scale_with.columns or 'prediction' not in df_scale_with.columns:
            raise KeyError('The CSV file specified for scaling '
                           'must have the "prediction" and the "sc1" '
                           'columns.')
        else:
            scale_pred_mean, scale_pred_sd = (
                df_scale_with['prediction'].mean(),
                df_scale_with['prediction'].std())
            scale_human_mean, scale_human_sd = (df_scale_with['sc1'].mean(),
                                                df_scale_with['sc1'].std())

    logger.info('Processing predictions')
    df_pred_processed = process_predictions(df_filtered_pred, scale_pred_mean,
                                            scale_pred_sd, scale_human_mean,
                                            scale_human_sd, spec_trim_min,
                                            spec_trim_max)
    if not scale_with:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round']
    elif scale_with == 'asis':
        expected_score_types = ['scale', 'scale_trim', 'scale_trim_round']
    else:
        expected_score_types = [
            'raw', 'raw_trim', 'raw_trim_round', 'scale', 'scale_trim',
            'scale_trim_round'
        ]

    # extract separated data frames that we will write out
    # as separate files
    not_other_columns = set()

    prediction_columns = ['spkitemid', 'sc1'] + expected_score_types
    df_predictions_only = df_pred_processed[prediction_columns]
    not_other_columns.update(prediction_columns)

    metadata_columns = ['spkitemid'] + subgroups
    if candidate_column:
        metadata_columns.append('candidate')
    df_test_metadata = df_filtered_pred[metadata_columns]
    not_other_columns.update(metadata_columns)

    df_test_human_scores = pd.DataFrame()
    human_score_columns = ['spkitemid', 'sc1', 'sc2']
    if second_human_score_column and 'sc2' in df_filtered_pred:
        df_test_human_scores = df_filtered_pred[human_score_columns].copy()
        not_other_columns.update(['sc2'])
        # filter out any non-numeric values now,
        # as well as zeros, if we were asked to
        df_test_human_scores['sc2'] = pd.to_numeric(
            df_test_human_scores['sc2'], errors='coerce').astype(float)
        if exclude_zero_scores:
            df_test_human_scores['sc2'] = df_test_human_scores['sc2'].replace(
                0, np.nan)

    # remove 'spkitemid' from `not_other_columns`
    # because we want that in the other columns
    # data frame
    not_other_columns.remove('spkitemid')

    # extract all of the other columns in the predictions file
    other_columns = [
        column for column in df_filtered_pred.columns
        if column not in not_other_columns
    ]
    df_pred_other_columns = df_filtered_pred[other_columns]

    logger.info('Saving pre-processed predictions and the metadata to disk')
    write_experiment_output([
        df_predictions_only, df_test_metadata, df_pred_other_columns,
        df_test_human_scores, df_excluded, df_responses_with_excluded_flags
    ], [
        'pred_processed', 'test_metadata', 'test_other_columns',
        'test_human_scores', 'test_excluded_responses',
        'test_responses_with_excluded_flags'
    ], experiment_id, csvdir)

    # do the data composition stats
    (df_test_excluded_analysis, df_data_composition,
     data_composition_by_group_dict
     ) = run_data_composition_analyses_for_rsmeval(
         df_test_metadata,
         df_excluded,
         subgroups,
         candidate_column,
         exclude_zero_scores=exclude_zero_scores)

    write_experiment_output([df_test_excluded_analysis, df_data_composition],
                            ['test_excluded_composition', 'data_composition'],
                            experiment_id, csvdir)

    # write the results of data composition analysis by group
    if subgroups:
        for group in subgroups:
            write_experiment_output([data_composition_by_group_dict[group]],
                                    ['data_composition_by_{}'.format(group)],
                                    experiment_id, csvdir)

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions')
    (df_human_machine_eval, df_human_machine_eval_short, df_human_human_eval,
     eval_by_group_dict, df_degradation, df_confmatrix,
     df_score_dist) = run_prediction_analyses(
         df_predictions_only,
         df_test_metadata,
         df_test_human_scores,
         subgroups,
         second_human_score_column,
         exclude_zero_scores=exclude_zero_scores,
         use_scaled_predictions=use_scaled_predictions)

    write_experiment_output([
        df_human_machine_eval, df_human_machine_eval_short,
        df_human_human_eval, df_degradation, df_confmatrix, df_score_dist
    ], [
        'eval', 'eval_short', 'consistency', 'degradation', 'confMatrix',
        'score_dist'
    ],
                            experiment_id,
                            csvdir,
                            reset_index=True)

    # if we are using subgroups, then write out the subgroup
    # specific output and include the by group section
    # in the final report
    if subgroups:
        for group in subgroups:
            eval_by_group, consistency_by_group = eval_by_group_dict[group]
            write_experiment_output([eval_by_group, consistency_by_group], [
                'eval_by_{}'.format(group), 'consistency_by_{}'.format(group)
            ],
                                    experiment_id,
                                    csvdir,
                                    reset_index=True)

    # generate the report
    logger.info('Starting report generation')
    create_report(experiment_id,
                  description,
                  '',
                  '',
                  '',
                  predictions_file_location,
                  csvdir,
                  figdir,
                  subgroups,
                  None,
                  second_human_score_column,
                  chosen_notebook_files,
                  exclude_zero_scores=exclude_zero_scores,
                  use_scaled_predictions=use_scaled_predictions)
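Similarly, a hypothetical rsmeval configuration exercising the fields read by run_evaluation could look like the sketch below. The key names shown, including 'trim_min' and 'trim_max' which get_trim_min_max() is assumed to consume, and all paths are illustrative; check_main_config() is assumed to supply defaults for anything omitted, such as the report section lists.

import json

# hypothetical rsmeval configuration; every value here is illustrative
config = {
    "experiment_id": "my_evaluation",
    "description": "Evaluating pre-computed system scores",
    "predictions_file": "system_predictions.csv",
    "id_column": "response_id",
    "human_score_column": "human_score",
    "second_human_score_column": "second_human_score",
    "system_score_column": "system_score",
    "trim_min": 1,
    "trim_max": 6,
    "scale_with": "asis",   # or None for raw scores, or a CSV file with 'sc1' and 'prediction' columns
    "subgroups": ["L1"],
    "exclude_zero_scores": True
}

with open('rsmeval_config.json', 'w') as config_fh:
    json.dump(config, config_fh)

run_evaluation('rsmeval_config.json', 'my_evaluation_output')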