def test_check_section_order_not_enough_sections():
    general_sections = ['evaluation', 'sysinfo']
    special_sections = ['placeholder_special_section']
    custom_sections = ['custom.ipynb']
    subgroups = ['prompt', 'gender']
    section_order = general_sections
    get_ordered_notebook_files(general_sections,
                               special_sections=special_sections,
                               custom_sections=custom_sections,
                               section_order=section_order,
                               subgroups=subgroups)
def test_check_section_order_wrong_sections():
    general_sections = ['evaluation', 'sysinfo']
    special_sections = ['placeholder_special_section']
    custom_sections = ['custom.ipynb']
    subgroups = []
    section_order = ['extra_section1', 'extra_section2']
    get_ordered_notebook_files(general_sections,
                               special_sections=special_sections,
                               custom_sections=custom_sections,
                               section_order=section_order,
                               subgroups=subgroups)
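# A minimal sketch of how the failure exercised by the two checks above could
# be asserted explicitly. It assumes that get_ordered_notebook_files raises
# ValueError when section_order does not cover all requested sections or names
# unknown sections; the exception type is an assumption, not confirmed here.
def test_check_section_order_raises_for_bad_order():
    from nose.tools import assert_raises
    assert_raises(ValueError,
                  get_ordered_notebook_files,
                  ['evaluation', 'sysinfo'],
                  section_order=['extra_section1', 'extra_section2'])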
def test_get_ordered_notebook_files_custom_rsmcompare():
    # custom and general sections, custom order and subgroups
    general_sections = ['feature_descriptives',
                        'score_distributions',
                        'features_by_group']
    custom_sections = ['/test_path/custom.ipynb']
    subgroups = ['prompt']
    section_order = ['feature_descriptives',
                     'score_distributions',
                     'custom',
                     'features_by_group']
    comparison_notebook_path = notebook_path_dict['general']['rsmcompare']
    notebook_files = get_ordered_notebook_files(general_sections,
                                                custom_sections=custom_sections,
                                                section_order=section_order,
                                                subgroups=subgroups,
                                                context='rsmcompare')
    expected_notebook_files = ([join(comparison_notebook_path, 'header.ipynb')] +
                               [join(comparison_notebook_path, s) + '.ipynb'
                                for s in ['feature_descriptives', 'score_distributions']] +
                               ['/test_path/custom.ipynb'] +
                               [join(comparison_notebook_path, 'features_by_group.ipynb')] +
                               [join(comparison_notebook_path, 'footer.ipynb')])
    eq_(notebook_files, expected_notebook_files)
def test_get_ordered_notebook_files_custom_rsmeval():
    # custom and general sections, custom order and subgroups
    general_sections = ['evaluation', 'consistency', 'evaluation_by_group']
    custom_sections = ['/test_path/custom.ipynb']
    subgroups = ['prompt']
    section_order = ['evaluation',
                     'consistency',
                     'custom',
                     'evaluation_by_group']
    notebook_path = notebook_path_dict['general']['rsmeval']
    notebook_files = get_ordered_notebook_files(general_sections,
                                                custom_sections=custom_sections,
                                                section_order=section_order,
                                                subgroups=subgroups,
                                                context='rsmeval')
    expected_notebook_files = ([join(notebook_path, 'header.ipynb')] +
                               [join(notebook_path, s) + '.ipynb'
                                for s in ['evaluation', 'consistency']] +
                               ['/test_path/custom.ipynb'] +
                               [join(notebook_path, 'evaluation_by_group.ipynb')] +
                               [join(notebook_path, 'footer.ipynb')])
    eq_(notebook_files, expected_notebook_files)
def test_get_ordered_notebook_files_custom_rsmtool():
    # custom and general sections, custom order and subgroups
    general_sections = ['data_description', 'pca', 'data_description_by_group']
    custom_sections = ['/test_path/custom.ipynb']
    special_sections = ['placeholder_special_section']
    subgroups = ['prompt']
    section_order = ['custom',
                     'data_description',
                     'pca',
                     'data_description_by_group',
                     'placeholder_special_section']
    special_notebook_path = notebook_path_dict['special']['rsmtool']
    notebook_files = get_ordered_notebook_files(general_sections,
                                                custom_sections=custom_sections,
                                                special_sections=special_sections,
                                                section_order=section_order,
                                                subgroups=subgroups,
                                                model_type='skll',
                                                context='rsmtool')
    expected_notebook_files = ([join(notebook_path, 'header.ipynb')] +
                               ['/test_path/custom.ipynb'] +
                               [join(notebook_path, s) + '.ipynb'
                                for s in ['data_description',
                                          'pca',
                                          'data_description_by_group']] +
                               [join(special_notebook_path, 'placeholder_special_section.ipynb')] +
                               [join(notebook_path, 'footer.ipynb')])
    eq_(notebook_files, expected_notebook_files)
def test_get_ordered_notebook_files_default_rsmcompare():
    general_sections = ['all']
    comparison_notebook_path = notebook_path_dict['general']['rsmcompare']
    notebook_files = get_ordered_notebook_files(general_sections,
                                                context='rsmcompare')
    no_subgroup_list = [s for s in general_section_list_rsmcompare
                        if not s.endswith('by_group')]
    section_list = ['header'] + no_subgroup_list + ['footer']
    general_section_plus_extension = [s + '.ipynb' for s in section_list]
    expected_notebook_files = [join(comparison_notebook_path, s)
                               for s in general_section_plus_extension]
    eq_(notebook_files, expected_notebook_files)
def test_get_ordered_notebook_files_default_rsmeval():
    general_sections = ['all']
    notebook_files = get_ordered_notebook_files(general_sections,
                                                context='rsmeval')
    no_subgroup_list = [s for s in general_section_list_rsmeval
                        if not s.endswith('by_group')]
    section_list = ['header'] + no_subgroup_list + ['footer']
    # replace the data_description section with data_description_eval
    updated_section_list = [sname + '_eval' if sname == 'data_description' else sname
                            for sname in section_list]
    general_section_plus_extension = ['{}.ipynb'.format(s)
                                      for s in updated_section_list]
    expected_notebook_files = [join(notebook_path_dict['general']['rsmeval'], s)
                               for s in general_section_plus_extension]
    eq_(notebook_files, expected_notebook_files)
def test_get_ordered_notebook_files_default_rsmtool():
    general_sections = ['all']
    notebook_files = get_ordered_notebook_files(general_sections,
                                                model_type='skll',
                                                context='rsmtool')
    no_subgroup_list = [s for s in general_section_list_rsmtool
                        if not s.endswith('by_group')]
    section_list = ['header'] + no_subgroup_list + ['footer']
    # replace the model section with skll_model
    updated_section_list = ['skll_' + sname if sname == 'model' else sname
                            for sname in section_list]
    general_section_plus_extension = [s + '.ipynb' for s in updated_section_list]
    expected_notebook_files = [join(notebook_path, s)
                               for s in general_section_plus_extension]
    eq_(notebook_files, expected_notebook_files)
def run_comparison(config_file, output_dir):
    """
    Run a comparison between the two RSMTool experiments specified
    in the config file and write out the comparison report to the
    output directory.
    """

    logger = logging.getLogger(__name__)

    # load the information from the config file
    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmcompare')

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # get the directory where the config file lives
    configpath = dirname(config_file)

    # get the information about the "old" experiment
    description_old = config_obj['description_old']
    experiment_id_old = config_obj['experiment_id_old']
    experiment_dir_old = locate_file(config_obj['experiment_dir_old'], configpath)
    if not experiment_dir_old:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(config_obj['experiment_dir_old']))
    else:
        csvdir_old = normpath(join(experiment_dir_old, 'output'))
        figdir_old = normpath(join(experiment_dir_old, 'figure'))
        if not exists(csvdir_old) or not exists(figdir_old):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_old))

    use_scaled_predictions_old = config_obj['use_scaled_predictions_old']

    # get the information about the "new" experiment
    description_new = config_obj['description_new']
    experiment_id_new = config_obj['experiment_id_new']
    experiment_dir_new = locate_file(config_obj['experiment_dir_new'], configpath)
    if not experiment_dir_new:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(config_obj['experiment_dir_new']))
    else:
        csvdir_new = normpath(join(experiment_dir_new, 'output'))
        figdir_new = normpath(join(experiment_dir_new, 'figure'))
        if not exists(csvdir_new) or not exists(figdir_new):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_new))

    use_scaled_predictions_new = config_obj['use_scaled_predictions_new']

    # are there specific general report sections we want to include?
    general_report_sections = config_obj['general_sections']

    # what about the special or custom sections?
    special_report_sections = config_obj['special_sections']

    custom_report_section_paths = config_obj['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(custom_report_section_paths,
                                                        configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=None,
                                                       context='rsmcompare')

    # now generate the comparison report
    logger.info('Starting report generation')
    create_comparison_report(experiment_id_old, description_old,
                             csvdir_old, figdir_old,
                             experiment_id_new, description_new,
                             csvdir_new, figdir_new,
                             output_dir, subgroups,
                             chosen_notebook_files,
                             use_scaled_predictions_old=use_scaled_predictions_old,
                             use_scaled_predictions_new=use_scaled_predictions_new)
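# A minimal sketch of the kind of configuration run_comparison() reads, based
# only on the keys accessed above. The values are illustrative placeholders,
# not a tested configuration; optional keys such as 'general_sections',
# 'special_sections', 'custom_sections' and 'section_order' are assumed to be
# filled in with defaults by check_main_config() when omitted.
EXAMPLE_RSMCOMPARE_CONFIG = {
    "experiment_id_old": "baseline_experiment",     # placeholder ID
    "description_old": "Baseline model",
    "experiment_dir_old": "experiments/baseline",   # must contain output/ and figure/
    "use_scaled_predictions_old": True,
    "experiment_id_new": "candidate_experiment",    # placeholder ID
    "description_new": "Candidate model",
    "experiment_dir_new": "experiments/candidate",  # must contain output/ and figure/
    "use_scaled_predictions_new": True,
    "subgroups": ["prompt"],
}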
def load_experiment_data(main_config_file, outdir):
    """
    Set up the experiment by loading the training and evaluation
    data sets and preprocessing them.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    logger.info('Reading configuration file: {}'.format(main_config_file))
    config_obj = read_json_file(main_config_file)
    config_obj = check_main_config(config_obj)

    # get the directory where the config file lives
    configpath = dirname(main_config_file)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column name for the labels for the training and testing data
    train_label_column = config_obj['train_label_column']
    test_label_column = config_obj['test_label_column']

    # get the column name that will hold the ID for
    # both the training and the test data
    id_column = config_obj['id_column']

    # get the specified trim min and max values
    spec_trim_min, spec_trim_max = get_trim_min_max(config_obj)

    # get the name of the optional column that
    # contains response length
    length_column = config_obj['length_column']

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # get the name of the optional column that
    # contains the candidate ID
    candidate_column = config_obj['candidate_column']

    # if the test label column is the same as the
    # second human score column, raise an error
    if test_label_column == second_human_score_column:
        raise ValueError("'test_label_column' and "
                         "'second_human_score_column' cannot have the "
                         "same value.")

    # get the name of the model that we want to train and
    # check that it's valid
    model_name = config_obj['model']
    model_type = check_model_name(model_name)

    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        logger.warning("'exclude_zero_scores' is set to True but "
                       "'trim_min' is set to 0. This may cause "
                       "unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = check_flag_column(config_obj)

    # are we generating fake labels?
    use_fake_train_labels = train_label_column == 'fake'
    use_fake_test_labels = test_label_column == 'fake'

    # are we analyzing scaled or raw prediction values?
    use_scaled_predictions = config_obj['use_scaled_predictions']

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # are there specific general report sections we want to include?
    general_report_sections = config_obj['general_sections']

    # what about the special or custom sections?
    special_report_sections = config_obj['special_sections']

    custom_report_section_paths = config_obj['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(custom_report_section_paths,
                                                        configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=model_type,
                                                       context='rsmtool')

    # Read in the feature configurations.
    # Location of the feature file
    feature_field = config_obj['features']

    # Check whether the feature subset file exists and whether we are
    # using a feature subset or a feature prefix
    feature_subset_file = config_obj['feature_subset_file']
    if feature_subset_file:
        feature_subset_file_location = locate_file(feature_subset_file, configpath)
        if not feature_subset_file_location:
            raise FileNotFoundError('Feature subset file {} not '
                                    'found.\n'.format(config_obj['feature_subset_file']))

    feature_subset = config_obj['feature_subset']
    feature_prefix = config_obj['feature_prefix']

    # if the user requested a feature subset file and a feature subset,
    # read the file and check its format
    if feature_subset_file and feature_subset:
        feature_subset_specs = pd.read_csv(feature_subset_file_location)
        check_feature_subset_file(feature_subset_specs, feature_subset)
    else:
        feature_subset_specs = None

    # Do we need to automatically find the best transformations/change sign?
    select_transformations = config_obj['select_transformations']
    feature_sign = config_obj['sign']
    requested_features = []
    feature_specs = {}
    select_features_automatically = True

    # For backward compatibility, we check whether this field can be set
    # to 'all' and, if so, set select_transformations to True as was done
    # in the previous version.
    if feature_field == 'all':
        select_transformations = True
    elif feature_field is not None:
        feature_file_location = locate_file(feature_field, configpath)
        select_features_automatically = False
        if not feature_file_location:
            raise FileNotFoundError('Feature file {} not '
                                    'found.\n'.format(config_obj['features']))
        else:
            logger.info('Reading feature file: {}'.format(feature_file_location))
            feature_json = read_json_file(feature_file_location)
            feature_specs = normalize_and_validate_feature_file(feature_json)
            requested_features = [fdict['feature']
                                  for fdict in feature_specs['features']]

    # check to make sure that `length_column` or `second_human_score_column`
    # are not also included in the requested features, if they are specified
    if length_column and length_column in requested_features:
        raise ValueError("The value of 'length_column' ('{}') cannot be "
                         "used as a model feature.".format(length_column))

    if second_human_score_column and second_human_score_column in requested_features:
        raise ValueError("The value of 'second_human_score_column' ('{}') cannot be "
                         "used as a model feature.".format(second_human_score_column))

    # Specify column names that cannot be used as features
    reserved_column_names = list(set(['spkitemid', 'spkitemlab', 'itemType',
                                      'r1', 'r2', 'score', 'sc', 'sc1', 'adj',
                                      train_label_column,
                                      test_label_column,
                                      id_column] +
                                     subgroups +
                                     list(flag_column_dict.keys())))

    # if `second_human_score_column` is specified, then we need to add
    # `sc2` to the list of reserved column names. The same applies to
    # 'length' and 'candidate' if `length_column` and `candidate_column`
    # are specified.
    if second_human_score_column:
        reserved_column_names.append('sc2')
    if length_column:
        reserved_column_names.append('length')
    if candidate_column:
        reserved_column_names.append('candidate')

    # Make sure that the training data as specified in the
    # config file actually exists on disk and if it does,
    # load it and filter out the bad rows and features with
    # zero standard deviation. Also double check that the requested
    # features exist in the data or obtain the feature names if
    # no feature file was given.
    train_file_location = locate_file(config_obj['train_file'], configpath)
    if not train_file_location:
        raise FileNotFoundError('Error: Training file {} '
                                'not found.\n'.format(config_obj['train_file']))
    else:
        logger.info('Reading training data: {}'.format(train_file_location))

    (df_train_features, df_train_metadata,
     df_train_other_columns, df_train_excluded,
     df_train_length, _,
     df_train_flagged_responses,
     used_trim_min, used_trim_max,
     feature_names) = load_and_filter_data(train_file_location,
                                           train_label_column,
                                           id_column,
                                           length_column,
                                           None,
                                           candidate_column,
                                           requested_features,
                                           reserved_column_names,
                                           spec_trim_min,
                                           spec_trim_max,
                                           flag_column_dict,
                                           subgroups,
                                           exclude_zero_scores=exclude_zero_scores,
                                           exclude_zero_sd=True,
                                           feature_subset_specs=feature_subset_specs,
                                           feature_subset=feature_subset,
                                           feature_prefix=feature_prefix,
                                           use_fake_labels=use_fake_train_labels)

    # Generate feature specifications now that we
    # know what features are selected
    if select_features_automatically:
        if select_transformations is False:
            feature_specs = generate_default_specs(feature_names)
        else:
            feature_specs = generate_specs_from_data(feature_names,
                                                     'sc1',
                                                     df_train_features,
                                                     feature_subset_specs=feature_subset_specs,
                                                     feature_sign=feature_sign)
    # Sanity check to make sure the function returned the
    # same feature names as specified in the feature json file,
    # if there was one
    elif not select_features_automatically:
        assert feature_names == requested_features

    # Do the same for the test data except we can ignore the trim min
    # and max since we already have that from the training data and
    # we have the feature_names when no feature file was specified.
    # We also allow features with 0 standard deviation in the test file.
    test_file_location = locate_file(config_obj['test_file'], configpath)
    if not test_file_location:
        raise FileNotFoundError('Error: Evaluation file '
                                '{} not found.\n'.format(config_obj['test_file']))
    elif (test_file_location == train_file_location and
          train_label_column == test_label_column):
        logging.warning('The same data file and label '
                        'column are used for both training '
                        'and evaluating the model. No second '
                        'score analysis will be performed, even '
                        'if requested.')
        df_test_features = df_train_features.copy()
        df_test_metadata = df_train_metadata.copy()
        df_test_excluded = df_train_excluded.copy()
        df_test_other_columns = df_train_other_columns.copy()
        df_test_flagged_responses = df_train_flagged_responses.copy()
        df_test_human_scores = pd.DataFrame()
    else:
        logger.info('Reading evaluation data: {}'.format(test_file_location))
        (df_test_features, df_test_metadata,
         df_test_other_columns, df_test_excluded,
         _, df_test_human_scores,
         df_test_flagged_responses,
         _, _, _) = load_and_filter_data(test_file_location,
                                         test_label_column,
                                         id_column,
                                         None,
                                         second_human_score_column,
                                         candidate_column,
                                         feature_names,
                                         reserved_column_names,
                                         used_trim_min,
                                         used_trim_max,
                                         flag_column_dict,
                                         subgroups,
                                         exclude_zero_scores=exclude_zero_scores,
                                         exclude_zero_sd=False,
                                         use_fake_labels=use_fake_test_labels)

    return (df_train_features, df_test_features,
            df_train_metadata, df_test_metadata,
            df_train_other_columns, df_test_other_columns,
            df_train_excluded, df_test_excluded,
            df_train_length, df_test_human_scores,
            df_train_flagged_responses, df_test_flagged_responses,
            experiment_id, description,
            train_file_location, test_file_location,
            feature_specs, model_name, model_type,
            train_label_column, test_label_column,
            id_column, length_column,
            second_human_score_column, candidate_column,
            subgroups, feature_subset_file,
            used_trim_min, used_trim_max,
            use_scaled_predictions, exclude_zero_scores,
            select_features_automatically,
            chosen_notebook_files)
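# A minimal sketch of the kind of configuration dictionary that
# load_experiment_data() works with after check_main_config(), based only on
# the keys accessed above. The values are illustrative placeholders; optional
# keys not shown here (e.g. 'features', 'feature_subset_file', 'length_column',
# 'custom_sections') are assumed to be filled in with defaults by
# check_main_config().
EXAMPLE_RSMTOOL_CONFIG = {
    "experiment_id": "example_experiment",   # placeholder ID
    "description": "Example rsmtool run",
    "model": "LinearRegression",
    "train_file": "data/train.csv",          # placeholder path
    "test_file": "data/test.csv",            # placeholder path
    "train_label_column": "sc1",
    "test_label_column": "sc1",
    "id_column": "spkitemid",
    "use_scaled_predictions": True,
    "exclude_zero_scores": True,
    "subgroups": ["prompt"],
    "general_sections": ["all"],
}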
def run_evaluation(config_file, output_dir):
    """
    Run an RSMTool evaluation experiment using the given configuration
    file and generate all evaluation outputs in the given directory.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # load the information from the config file
    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmeval')

    # get the directory where the config file lives;
    # if this is the 'expm' directory, then go up one level
    configpath = dirname(config_file)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column names for the human and system scores
    human_score_column = config_obj['human_score_column']
    system_score_column = config_obj['system_score_column']

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # if the human score column is the same as the
    # second human score column, raise an error
    if human_score_column == second_human_score_column:
        raise ValueError("'human_score_column' and "
                         "'second_human_score_column' "
                         "cannot have the same value.")

    # get the column name that will hold the ID for
    # both the training and the test data
    id_column = config_obj['id_column']

    # get the specified trim min and max, if any,
    # and make sure they are numeric
    spec_trim_min, spec_trim_max = get_trim_min_max(config_obj)

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # get the candidate column if any and convert it to string
    candidate_column = config_obj['candidate_column']

    general_report_sections = config_obj['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = config_obj['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = config_obj['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = locate_custom_sections(custom_report_section_paths,
                                                        configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    # check all section values and their order and get the
    # ordered list of notebook files
    chosen_notebook_files = get_ordered_notebook_files(general_report_sections,
                                                       special_report_sections,
                                                       custom_report_sections,
                                                       section_order,
                                                       subgroups,
                                                       model_type=None,
                                                       context='rsmeval')

    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        logger.warning("'exclude_zero_scores' is set to True but "
                       "'trim_min' is set to 0. This may cause "
                       "unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = check_flag_column(config_obj)

    # do we have the training set predictions and human scores CSV file?
    scale_with = config_obj.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.

    # we need to scale if and only if a CSV file is specified
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # use scaled predictions for the analyses unless
    # we were told not to
    use_scaled_predictions = (scale_with is not None)

    # log an appropriate message
    if scale_with is None:
        message = ('Assuming given system predictions '
                   'are unscaled and will be used as such.')
    elif scale_with == 'asis':
        message = ('Assuming given system predictions '
                   'are already scaled and will be used as such.')
    else:
        message = ('Assuming given system predictions '
                   'are unscaled and will be scaled before use.')
    logger.info(message)

    # load the predictions from disk and make sure that the `id_column`
    # is read in as a string
    predictions_file_location = locate_file(config_obj['predictions_file'], configpath)
    if not predictions_file_location:
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(config_obj['predictions_file']))
    else:
        logger.info('Reading predictions: {}'.format(predictions_file_location))
        string_columns = [id_column, candidate_column] + subgroups
        converter_dict = dict([(column, str) for column in string_columns if column])
        df_pred = pd.read_csv(predictions_file_location, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    missing_columns = set([id_column,
                           human_score_column,
                           system_score_column]).difference(df_pred.columns)
    if missing_columns:
        raise KeyError('Columns {} from the config file do not exist '
                       'in the predictions file.'.format(missing_columns))

    df_pred = rename_default_columns(df_pred,
                                     [],
                                     id_column,
                                     human_score_column,
                                     second_human_score_column,
                                     None,
                                     system_score_column,
                                     candidate_column)

    # check that the id_column contains unique values
    if df_pred['spkitemid'].size != df_pred['spkitemid'].unique().size:
        raise ValueError("The data contains duplicate response IDs "
                         "in '{}'. Please make sure all response IDs "
                         "are unique and re-run the tool.".format(id_column))

    df_pred = check_subgroups(df_pred, subgroups)

    # filter out the responses based on flag columns
    (df_responses_with_requested_flags,
     df_responses_with_excluded_flags) = filter_on_flag_columns(df_pred,
                                                                flag_column_dict)

    # filter out rows that have non-numeric or zero human scores
    df_filtered, df_excluded = filter_on_column(df_responses_with_requested_flags,
                                                'sc1',
                                                'spkitemid',
                                                exclude_zeros=exclude_zero_scores)

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric human scores. No further analysis "
                         "can be run.")

    # Change all non-numeric machine scores in the excluded
    # data to NaNs for consistency with rsmtool.
    # NOTE: This will *not* work if *all* of the values
    # in a column are non-numeric. This is a known bug in
    # pandas: https://github.com/pydata/pandas/issues/9589
    # Therefore, we need to add an additional check after this.
    df_excluded['raw'] = pd.to_numeric(df_excluded['raw'],
                                       errors='coerce').astype(float)

    # filter out the non-numeric machine scores from the rest of the data
    newdf, newdf_excluded = filter_on_column(df_filtered,
                                             'raw',
                                             'spkitemid',
                                             exclude_zeros=False)
    del df_filtered
    df_filtered_pred = newdf

    # make sure that the remaining data frame is not empty
    if len(df_filtered_pred) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric machine scores. No further analysis "
                         "can be run.")

    df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # set default values for scaling
    scale_pred_mean = 0
    scale_pred_sd = 1
    scale_human_mean = 0
    scale_human_sd = 1

    if do_scaling:
        scale_file_location = locate_file(scale_with, configpath)
        if not scale_file_location:
            raise FileNotFoundError('Error: scaling file {} '
                                    'not found.\n'.format(scale_with))
        else:
            logger.info('Reading scaling file: {}'.format(scale_file_location))
            df_scale_with = pd.read_csv(scale_file_location)

            if ('sc1' not in df_scale_with.columns and
                    'prediction' not in df_scale_with.columns):
                raise KeyError('The CSV file specified for scaling '
                               'must have the "prediction" and the "sc1" '
                               'columns.')
            else:
                scale_pred_mean, scale_pred_sd = (df_scale_with['prediction'].mean(),
                                                  df_scale_with['prediction'].std())
                scale_human_mean, scale_human_sd = (df_scale_with['sc1'].mean(),
                                                    df_scale_with['sc1'].std())

    logger.info('Processing predictions')
    df_pred_processed = process_predictions(df_filtered_pred,
                                            scale_pred_mean,
                                            scale_pred_sd,
                                            scale_human_mean,
                                            scale_human_sd,
                                            spec_trim_min,
                                            spec_trim_max)

    if not scale_with:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round']
    elif scale_with == 'asis':
        expected_score_types = ['scale', 'scale_trim', 'scale_trim_round']
    else:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round',
                                'scale', 'scale_trim', 'scale_trim_round']

    # extract separate data frames that we will write out
    # as separate files
    not_other_columns = set()

    prediction_columns = ['spkitemid', 'sc1'] + expected_score_types
    df_predictions_only = df_pred_processed[prediction_columns]
    not_other_columns.update(prediction_columns)

    metadata_columns = ['spkitemid'] + subgroups
    if candidate_column:
        metadata_columns.append('candidate')
    df_test_metadata = df_filtered_pred[metadata_columns]
    not_other_columns.update(metadata_columns)

    df_test_human_scores = pd.DataFrame()
    human_score_columns = ['spkitemid', 'sc1', 'sc2']
    if second_human_score_column and 'sc2' in df_filtered_pred:
        df_test_human_scores = df_filtered_pred[human_score_columns].copy()
        not_other_columns.update(['sc2'])
        # filter out any non-numeric values now,
        # as well as zeros, if we were asked to
        df_test_human_scores['sc2'] = pd.to_numeric(df_test_human_scores['sc2'],
                                                    errors='coerce').astype(float)
        if exclude_zero_scores:
            df_test_human_scores['sc2'] = df_test_human_scores['sc2'].replace(0, np.nan)

    # remove 'spkitemid' from `not_other_columns`
    # because we want that in the other columns
    # data frame
    not_other_columns.remove('spkitemid')

    # extract all of the other columns in the predictions file
    other_columns = [column for column in df_filtered_pred.columns
                     if column not in not_other_columns]
    df_pred_other_columns = df_filtered_pred[other_columns]

    logger.info('Saving pre-processed predictions and the metadata to disk')
    write_experiment_output([df_predictions_only,
                             df_test_metadata,
                             df_pred_other_columns,
                             df_test_human_scores,
                             df_excluded,
                             df_responses_with_excluded_flags],
                            ['pred_processed',
                             'test_metadata',
                             'test_other_columns',
                             'test_human_scores',
                             'test_excluded_responses',
                             'test_responses_with_excluded_flags'],
                            experiment_id,
                            csvdir)

    # do the data composition stats
    (df_test_excluded_analysis,
     df_data_composition,
     data_composition_by_group_dict) = run_data_composition_analyses_for_rsmeval(
        df_test_metadata,
        df_excluded,
        subgroups,
        candidate_column,
        exclude_zero_scores=exclude_zero_scores)

    write_experiment_output([df_test_excluded_analysis, df_data_composition],
                            ['test_excluded_composition', 'data_composition'],
                            experiment_id,
                            csvdir)

    # write the results of the data composition analysis by group
    if subgroups:
        for group in subgroups:
            write_experiment_output([data_composition_by_group_dict[group]],
                                    ['data_composition_by_{}'.format(group)],
                                    experiment_id,
                                    csvdir)

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions')
    (df_human_machine_eval,
     df_human_machine_eval_short,
     df_human_human_eval,
     eval_by_group_dict,
     df_degradation,
     df_confmatrix,
     df_score_dist) = run_prediction_analyses(df_predictions_only,
                                              df_test_metadata,
                                              df_test_human_scores,
                                              subgroups,
                                              second_human_score_column,
                                              exclude_zero_scores=exclude_zero_scores,
                                              use_scaled_predictions=use_scaled_predictions)

    write_experiment_output([df_human_machine_eval,
                             df_human_machine_eval_short,
                             df_human_human_eval,
                             df_degradation,
                             df_confmatrix,
                             df_score_dist],
                            ['eval',
                             'eval_short',
                             'consistency',
                             'degradation',
                             'confMatrix',
                             'score_dist'],
                            experiment_id,
                            csvdir,
                            reset_index=True)

    # if we are using subgroups, then write out the subgroup-specific
    # output and include the by-group section in the final report
    if subgroups:
        for group in subgroups:
            eval_by_group, consistency_by_group = eval_by_group_dict[group]
            write_experiment_output([eval_by_group, consistency_by_group],
                                    ['eval_by_{}'.format(group),
                                     'consistency_by_{}'.format(group)],
                                    experiment_id,
                                    csvdir,
                                    reset_index=True)

    # generate the report
    logger.info('Starting report generation')
    create_report(experiment_id, description,
                  '', '',
                  '', predictions_file_location,
                  csvdir, figdir,
                  subgroups,
                  None,
                  second_human_score_column,
                  chosen_notebook_files,
                  exclude_zero_scores=exclude_zero_scores,
                  use_scaled_predictions=use_scaled_predictions)
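# A small illustration of how the three 'scale_with' modes documented inside
# run_evaluation() above map onto the scaling flags and the expected score
# columns. This helper mirrors the logic in run_evaluation() and is only a
# reference sketch; it is not called anywhere in the tool itself.
def describe_scale_with(scale_with):
    """Return (do_scaling, use_scaled_predictions, expected_score_types)."""
    # scale only when an actual CSV file is given (i.e., not None and not 'asis')
    do_scaling = scale_with is not None and scale_with != 'asis'
    # use scaled columns for analyses unless predictions are taken as raw
    use_scaled_predictions = scale_with is not None
    if scale_with is None:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round']
    elif scale_with == 'asis':
        expected_score_types = ['scale', 'scale_trim', 'scale_trim_round']
    else:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round',
                                'scale', 'scale_trim', 'scale_trim_round']
    return do_scaling, use_scaled_predictions, expected_score_types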