import glob
import logging
import os

from os.path import (abspath, basename, dirname, exists,
                     join, normpath, splitext)

import numpy as np
import pandas as pd

from nose.tools import eq_, raises
from skll import Learner

# NOTE: the rsmtool-internal helpers used below (check_flag_column,
# read_json_file, check_main_config, locate_file, filter_on_column,
# preprocess_feature, rename_default_columns, predict_with_model, trim,
# write_experiment_output, and friends) are assumed to be importable;
# their exact module paths depend on the rsmtool version.


def test_check_flag_column_convert_to_list_keep_numeric():
    config = {"flag_column": {"advisories": 123}}
    flag_dict = check_flag_column(config)
    eq_(flag_dict, {"advisories": [123]})
# this test passes a string rather than a dictionary, which is not a
# valid format for `flag_column`; given the test name and the absence
# of any assertion, it presumably expects `check_flag_column` to raise
# a ValueError, expressed here with the nose `raises` decorator
@raises(ValueError)
def test_check_flag_column_wrong_format():
    config = {"flag_column": "[advisories]"}
    check_flag_column(config)
def test_check_flag_column_no_values():
    config = {"flag_column": None}
    flag_dict = check_flag_column(config)
    eq_(flag_dict, {})
def test_check_flag_column_convert_to_list():
    config = {"flag_column": {"advisories": "0"}}
    flag_dict = check_flag_column(config)
    eq_(flag_dict, {"advisories": ['0']})
def test_check_flag_column():
    input_dict = {"advisory flag": ['0']}
    config = {"flag_column": input_dict}
    output_dict = check_flag_column(config)
    eq_(input_dict, output_dict)
def test_check_flag_column_keep_numeric():
    input_dict = {"advisory flag": [1, 2, 3]}
    config = {"flag_column": input_dict}
    output_dict = check_flag_column(config)
    eq_(output_dict, {"advisory flag": [1, 2, 3]})
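# For reference, a minimal sketch of the behavior that the tests above pin
# down, inferred only from their assertions (this is not the actual rsmtool
# implementation): `check_flag_column` reads the optional `flag_column`
# entry from the configuration and normalizes each value to a list,
# raising a ValueError if the entry is not a dictionary.
def check_flag_column_sketch(config):
    flag_column = config.get("flag_column")
    if flag_column is None:
        return {}
    if not isinstance(flag_column, dict):
        raise ValueError("'flag_column' must be a dictionary mapping "
                         "column names to acceptable values.")
    # wrap scalar values, numeric or string, in single-element lists
    return {column: value if isinstance(value, list) else [value]
            for column, value in flag_column.items()}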
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives so that any
    # relative paths in it can be resolved against that directory
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'],
                                      configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not '
                                'exist.'.format(config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for the human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for the second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column names for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name of the candidate column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the original experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not '
                                'exist.'.format(config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool '
                                    'experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any '
                                'rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the '
                                'experiment "{}". The following experiments '
                                'are contained in this directory: '
                                '{}'.format(experiment_output_dir,
                                            experiment_id,
                                            experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model '
                                    'training.'.format(experiment_output_dir,
                                                       expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as strings
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add the subgroups and the flag columns to the list of columns
    # that will be copied over to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # `human_score_column` is set to 'sc1' by default and we only raise
    # an error if it's set to something else. However, since we cannot
    # distinguish whether the column was set to 'sc1' by default or
    # specified as such in the config file, we append it to the output
    # anyway, as long as it is present in the input file.
    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns to their standardized names
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in '{}'. "
                         "Please make sure all response IDs are unique and "
                         "re-run the tool.".format(id_column))

    # now we need to pre-process these features using the parameters
    # that are already stored in the `_feature.csv` file
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: '
                       '{}'.format(input_features_file, missing_features))

    extra_features = set(input_feature_columns).difference(required_features +
                                                           [id_column])
    if extra_features:
        logger.warning('The following extraneous features will be '
                       'ignored: {}'.format(extra_features))

    # keep the required features plus the ID
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and, if so,
    # add sc1 to the pre-processed features for consistency with the other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # pre-process the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # non-numeric values, the same way we do for rsmtool
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)
    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered,
                                                 feature_name,
                                                 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after filtering out "
                         "non-numeric feature values. No analysis will be run.")

    df_features = df_filtered.copy()

    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(feature_values,
                                                                    feature_name,
                                                                    feature_transformation,
                                                                    train_feature_mean,
                                                                    train_feature_sd,
                                                                    exclude_zero_sd=False)

        # now standardize the feature values using the mean and SD
        # of the transformed training-set features
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] -
                                                  train_transformed_mean) / train_transformed_sd

        # Multiply the feature values by the weight. Within the current
        # SR timeline, the mean of the transformed train feature used to
        # standardize test features has to be computed before multiplying
        # the train feature by the weight.
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] *
                                                  feature_weight)

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)
        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model with which to generate the predictions
    model = Learner.from_file(join(experiment_output_dir,
                                   '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(join(experiment_output_dir,
                                          '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions: standardize the raw predictions using the
    # training-set prediction statistics and map them onto the human scale
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] -
                          train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min, trim_max)
    df_predictions['raw_trim_round'] = np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min, trim_max)
    df_predictions['scale_trim_round'] = np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were asked to copy, if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(df_predictions,
                                                df_input[['spkitemid'] + columns_to_copy])
        assert len(df_predictions) == len(df_predictions_with_metadata)
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save the excluded responses to disk, if there were any
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
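# For illustration, a minimal rsmpredict configuration covering the keys
# that compute_and_save_predictions() reads directly. The file and
# experiment names below are hypothetical, and the optional keys
# (human_score_column, subgroups, candidate_column, etc.) are assumed to
# be filled in with defaults by check_main_config():
#
#     {
#         "experiment_id": "toy_experiment",
#         "experiment_dir": "existing_experiment",
#         "input_features_file": "new_features.csv",
#         "id_column": "ID"
#     }
#
# With such a config saved as 'rsmpredict.json', a call might look like:
#
#     compute_and_save_predictions('rsmpredict.json',
#                                  'predictions/predictions.csv',
#                                  'predictions/preprocessed_features.csv')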
def run_evaluation(config_file, output_dir):
    """
    Run an RSMTool evaluation experiment using the given configuration
    file and generate all evaluation outputs in the given directory.
    """

    logger = logging.getLogger(__name__)

    # create the 'output', 'figure' and 'report' sub-directories
    # where all the experiment output, such as the CSV files
    # and the box plots, will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmeval')

    # get the directory where the config file lives so that any
    # relative paths in it can be resolved against that directory
    configpath = dirname(config_file)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column names for the human and system scores
    human_score_column = config_obj['human_score_column']
    system_score_column = config_obj['system_score_column']

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # if the human score column is the same as the
    # second human score column, raise an error
    if human_score_column == second_human_score_column:
        raise ValueError("'human_score_column' and "
                         "'second_human_score_column' "
                         "cannot have the same value.")

    # get the column name that will hold the ID for
    # both the training and the test data
    id_column = config_obj['id_column']

    # get the specified trim min and max, if any,
    # and make sure they are numeric
    spec_trim_min, spec_trim_max = get_trim_min_max(config_obj)

    # get the subgroups, if any
    subgroups = config_obj.get('subgroups')

    # get the candidate column, if any; its values
    # will be read in as strings
    candidate_column = config_obj['candidate_column']

    # get the general report sections
    general_report_sections = config_obj['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = config_obj['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = config_obj['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(custom_report_section_paths,
                                                        configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    # check all the section values and their order and get the
    # ordered list of notebook files
    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=None,
                                                       context='rsmeval')

    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        logger.warning("'exclude_zero_scores' is set to True but "
                       "'trim_min' is set to 0. This may cause "
                       "unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = check_flag_column(config_obj)

    # do we have a CSV file with the training-set predictions
    # and human scores to scale with?
    scale_with = config_obj.get('scale_with')

    # `scale_with` can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and are used as
    #                  is when computing the metrics; the names of the final
    #                  columns are 'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and are used
    #                  as is when computing the metrics; the names of the final
    #                  columns are 'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and are scaled
    #                  before computing the metrics; the names of the final
    #                  columns are 'scale', 'scale_trim' and 'scale_trim_round'.

    # we need to scale if and only if a CSV file is specified
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # use scaled predictions for the analyses unless
    # we were told not to
    use_scaled_predictions = (scale_with is not None)

    # log an appropriate message
    if scale_with is None:
        message = ('Assuming given system predictions '
                   'are unscaled and will be used as such.')
    elif scale_with == 'asis':
        message = ('Assuming given system predictions '
                   'are already scaled and will be used as such.')
    else:
        message = ('Assuming given system predictions '
                   'are unscaled and will be scaled before use.')
    logger.info(message)

    # load the predictions from disk and make sure that the `id_column`
    # is read in as a string
    predictions_file_location = locate_file(config_obj['predictions_file'],
                                            configpath)
    if not predictions_file_location:
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(config_obj['predictions_file']))
    else:
        logger.info('Reading predictions: {}'.format(predictions_file_location))
        string_columns = [id_column, candidate_column] + subgroups
        converter_dict = dict([(column, str) for column in string_columns if column])

        df_pred = pd.read_csv(predictions_file_location, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    missing_columns = set([id_column,
                           human_score_column,
                           system_score_column]).difference(df_pred.columns)
    if missing_columns:
        raise KeyError('Columns {} from the config file do not exist '
                       'in the predictions file.'.format(missing_columns))

    df_pred = rename_default_columns(df_pred,
                                     [],
                                     id_column,
                                     human_score_column,
                                     second_human_score_column,
                                     None,
                                     system_score_column,
                                     candidate_column)

    # check that the id_column contains unique values
    if df_pred['spkitemid'].size != df_pred['spkitemid'].unique().size:
        raise ValueError("The data contains duplicate response IDs "
                         "in '{}'. Please make sure all response IDs "
                         "are unique and re-run the tool.".format(id_column))

    df_pred = check_subgroups(df_pred, subgroups)

    # filter out responses based on the flag columns
    (df_responses_with_requested_flags,
     df_responses_with_excluded_flags) = filter_on_flag_columns(df_pred,
                                                                flag_column_dict)

    # filter out rows that have non-numeric or zero human scores
    df_filtered, df_excluded = filter_on_column(df_responses_with_requested_flags,
                                                'sc1',
                                                'spkitemid',
                                                exclude_zeros=exclude_zero_scores)

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric human scores. No further analysis "
                         "can be run.")

    # Change all non-numeric machine scores in the excluded
    # data to NaNs for consistency with rsmtool.
    # NOTE: This will *not* work if *all* of the values in the column
    # are non-numeric. This is a known bug in pandas:
    # https://github.com/pydata/pandas/issues/9589
    # Therefore, we need to add an additional check after this.
    df_excluded['raw'] = pd.to_numeric(df_excluded['raw'],
                                       errors='coerce').astype(float)

    # filter out the non-numeric machine scores from the rest of the data
    newdf, newdf_excluded = filter_on_column(df_filtered,
                                             'raw',
                                             'spkitemid',
                                             exclude_zeros=False)
    del df_filtered
    df_filtered_pred = newdf

    # make sure that the remaining data frame is not empty
    if len(df_filtered_pred) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric machine scores. No further analysis "
                         "can be run.")

    df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # set default values for scaling
    scale_pred_mean = 0
    scale_pred_sd = 1
    scale_human_mean = 0
    scale_human_sd = 1

    if do_scaling:
        scale_file_location = locate_file(scale_with, configpath)
        if not scale_file_location:
            raise FileNotFoundError('Error: scaling file {} '
                                    'not found.\n'.format(scale_with))
        else:
            logger.info('Reading scaling file: {}'.format(scale_file_location))
            df_scale_with = pd.read_csv(scale_file_location)

            # both columns are required for scaling, so raise an error
            # if either one is missing
            if ('sc1' not in df_scale_with.columns or
                    'prediction' not in df_scale_with.columns):
                raise KeyError('The CSV file specified for scaling '
                               'must have the "prediction" and the "sc1" '
                               'columns.')
            else:
                scale_pred_mean, scale_pred_sd = (df_scale_with['prediction'].mean(),
                                                  df_scale_with['prediction'].std())
                scale_human_mean, scale_human_sd = (df_scale_with['sc1'].mean(),
                                                    df_scale_with['sc1'].std())

    logger.info('Processing predictions')
    df_pred_processed = process_predictions(df_filtered_pred,
                                            scale_pred_mean,
                                            scale_pred_sd,
                                            scale_human_mean,
                                            scale_human_sd,
                                            spec_trim_min,
                                            spec_trim_max)

    if not scale_with:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round']
    elif scale_with == 'asis':
        expected_score_types = ['scale', 'scale_trim', 'scale_trim_round']
    else:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round',
                                'scale', 'scale_trim', 'scale_trim_round']

    # extract the separate data frames that we will write out
    # as separate files
    not_other_columns = set()

    prediction_columns = ['spkitemid', 'sc1'] + expected_score_types
    df_predictions_only = df_pred_processed[prediction_columns]
    not_other_columns.update(prediction_columns)

    metadata_columns = ['spkitemid'] + subgroups
    if candidate_column:
        metadata_columns.append('candidate')
    df_test_metadata = df_filtered_pred[metadata_columns]
    not_other_columns.update(metadata_columns)

    df_test_human_scores = pd.DataFrame()
    human_score_columns = ['spkitemid', 'sc1', 'sc2']
    if second_human_score_column and 'sc2' in df_filtered_pred:
        df_test_human_scores = df_filtered_pred[human_score_columns].copy()
        not_other_columns.update(['sc2'])
        # filter out any non-numeric values now,
        # as well as zeros, if we were asked to
        df_test_human_scores['sc2'] = pd.to_numeric(df_test_human_scores['sc2'],
                                                    errors='coerce').astype(float)
        if exclude_zero_scores:
            df_test_human_scores['sc2'] = df_test_human_scores['sc2'].replace(0, np.nan)

    # remove 'spkitemid' from `not_other_columns`
    # because we want it in the other-columns
    # data frame as well
    not_other_columns.remove('spkitemid')

    # extract all of the other columns in the predictions file
    other_columns = [column for column in df_filtered_pred.columns
                     if column not in not_other_columns]
    df_pred_other_columns = df_filtered_pred[other_columns]

    logger.info('Saving pre-processed predictions and the metadata to disk')
    write_experiment_output([df_predictions_only,
                             df_test_metadata,
                             df_pred_other_columns,
                             df_test_human_scores,
                             df_excluded,
                             df_responses_with_excluded_flags],
                            ['pred_processed',
                             'test_metadata',
                             'test_other_columns',
                             'test_human_scores',
                             'test_excluded_responses',
                             'test_responses_with_excluded_flags'],
                            experiment_id,
                            csvdir)

    # run the data composition analyses
    (df_test_excluded_analysis,
     df_data_composition,
     data_composition_by_group_dict) = run_data_composition_analyses_for_rsmeval(
         df_test_metadata,
         df_excluded,
         subgroups,
         candidate_column,
         exclude_zero_scores=exclude_zero_scores)

    write_experiment_output([df_test_excluded_analysis, df_data_composition],
                            ['test_excluded_composition', 'data_composition'],
                            experiment_id,
                            csvdir)

    # write out the results of the data composition analysis by group
    if subgroups:
        for group in subgroups:
            write_experiment_output([data_composition_by_group_dict[group]],
                                    ['data_composition_by_{}'.format(group)],
                                    experiment_id,
                                    csvdir)

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions')
    (df_human_machine_eval,
     df_human_machine_eval_short,
     df_human_human_eval,
     eval_by_group_dict,
     df_degradation,
     df_confmatrix,
     df_score_dist) = run_prediction_analyses(df_predictions_only,
                                              df_test_metadata,
                                              df_test_human_scores,
                                              subgroups,
                                              second_human_score_column,
                                              exclude_zero_scores=exclude_zero_scores,
                                              use_scaled_predictions=use_scaled_predictions)

    write_experiment_output([df_human_machine_eval,
                             df_human_machine_eval_short,
                             df_human_human_eval,
                             df_degradation,
                             df_confmatrix,
                             df_score_dist],
                            ['eval',
                             'eval_short',
                             'consistency',
                             'degradation',
                             'confMatrix',
                             'score_dist'],
                            experiment_id,
                            csvdir,
                            reset_index=True)

    # if we are using subgroups, then write out the subgroup-specific
    # output and include the by-group sections in the final report
    if subgroups:
        for group in subgroups:
            eval_by_group, consistency_by_group = eval_by_group_dict[group]
            write_experiment_output([eval_by_group, consistency_by_group],
                                    ['eval_by_{}'.format(group),
                                     'consistency_by_{}'.format(group)],
                                    experiment_id,
                                    csvdir,
                                    reset_index=True)

    # generate the report
    logger.info('Starting report generation')
    create_report(experiment_id, description,
                  '', '',
                  '', predictions_file_location,
                  csvdir, figdir,
                  subgroups,
                  None,
                  second_human_score_column,
                  chosen_notebook_files,
                  exclude_zero_scores=exclude_zero_scores,
                  use_scaled_predictions=use_scaled_predictions)
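# For illustration, a minimal rsmeval configuration covering the keys that
# run_evaluation() reads directly. The file and experiment names below are
# hypothetical, and the optional keys (subgroups, second_human_score_column,
# the report section lists, etc.) are assumed to be filled in with defaults
# by check_main_config(). 'scale_with' may be omitted (use raw predictions
# as is), set to "asis" (predictions are already scaled), or set to the
# path of a CSV file with 'prediction' and 'sc1' columns from which the
# scaling parameters are computed:
#
#     {
#         "experiment_id": "toy_evaluation",
#         "predictions_file": "predictions.csv",
#         "system_score_column": "score",
#         "human_score_column": "sc1",
#         "id_column": "ID",
#         "trim_min": 1,
#         "trim_max": 6,
#         "scale_with": "train_predictions.csv"
#     }
#
# With such a config saved as 'rsmeval.json', a call might look like:
#
#     run_evaluation('rsmeval.json', 'evaluation_output')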