def check_scaled_coefficients(source, experiment_id, file_format='csv'):
    """
    Verify that the scaled coefficients reproduce the scaled predictions.

    The scaled coefficients saved by an experiment are loaded into a fake
    SKLL learner, new predictions are generated for the preprocessed test
    features, and the result is compared against the ``scale`` column of
    the processed predictions file.

    Parameters
    ----------
    source : str
        Name of the source directory; the files are read from
        ``test_outputs/<source>/output``.
    experiment_id : str
        The experiment ID used as the prefix of every output file.
    file_format : str, optional
        The format (extension) of the output files.
        Defaults to 'csv'.

    Raises
    ------
    AssertionError
        If the regenerated predictions do not match the saved ones.
    """
    def _read_output(template):
        # every output file is named `<experiment_id>_<suffix>.<file_format>`
        file_name = template.format(experiment_id, file_format)
        return DataReader.read_from_file(join('test_outputs',
                                              source,
                                              'output',
                                              file_name))

    # the trimming parameters are stored in the first row of their file
    postproc_params = _read_output('{}_postprocessing_params.{}').loc[0]
    df_test_features = _read_output('{}_test_preprocessed_features.{}')

    # only the ID, the human score and the scaled prediction are compared
    df_expected = _read_output('{}_pred_processed.{}')
    df_expected = df_expected[['spkitemid', 'sc1', 'scale']]

    # wrap the scaled coefficients in a fake SKLL learner
    df_scaled_coefficients = _read_output('{}_coefficients_scaled.{}')
    fake_learner = Modeler.create_fake_skll_learner(df_scaled_coefficients)
    modeler = Modeler.load_from_learner(fake_learner)

    # predict with the scaled coefficients; the `raw` column is relabeled
    # as `scale` so that it lines up with the saved predictions
    df_actual = modeler.predict(df_test_features,
                                postproc_params['trim_min'],
                                postproc_params['trim_max'])
    df_actual.rename(columns={'raw': 'scale'}, inplace=True)

    # columns are sorted so that column order does not affect the comparison
    assert_frame_equal(df_actual.sort_index(axis=1),
                       df_expected.sort_index(axis=1),
                       check_exact=False,
                       check_less_precise=True)
def check_subgroup_outputs(output_dir, experiment_id, subgroups, file_format='csv'):
    """
    Sanity-check the subgroup-related outputs of an experiment.

    Two things are verified: (1) every subgroup column is present in both
    the train and the test metadata files, and (2) for each subgroup, the
    per-category counts in the by-group data composition file are
    consistent with the overall data composition file.

    Parameters
    ----------
    output_dir : str
        Path to the `output` experiment output directory for a test.
    experiment_id : str
        The experiment ID.
    subgroups : list of str
        List of column names that contain grouping information.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.

    Raises
    ------
    AssertionError
        If any of the checks fails.
    """
    train_metadata_path = join(output_dir,
                               '{}_train_metadata.{}'.format(experiment_id,
                                                             file_format))
    train_metadata = DataReader.read_from_file(train_metadata_path, index_col=0)

    test_metadata_path = join(output_dir,
                              '{}_test_metadata.{}'.format(experiment_id,
                                                           file_format))
    test_metadata = DataReader.read_from_file(test_metadata_path, index_col=0)

    # each subgroup must be a column in both metadata frames
    for subgroup in subgroups:
        for metadata in (train_metadata, test_metadata):
            ok_(subgroup in metadata.columns)

    overall_path = join(output_dir,
                        '{}_data_composition.{}'.format(experiment_id,
                                                        file_format))
    overall_composition = DataReader.read_from_file(overall_path)

    for subgroup in subgroups:
        by_group_name = '{}_data_composition_by_{}.{}'.format(experiment_id,
                                                              subgroup,
                                                              file_format)
        by_group = DataReader.read_from_file(join(output_dir, by_group_name))

        for partition in ('Training', 'Evaluation'):
            is_partition = overall_composition['partition'] == partition
            overall_rows = overall_composition.loc[is_partition]
            counts = by_group['{} set'.format(partition)]

            # the per-category Ns must add up to the partition's total N
            total = sum(counts)
            ok_(total == overall_rows.iloc[0]['responses'])

            # the number of categories with a non-zero N must match the
            # category count recorded in the overall composition
            n_categories = len(by_group.loc[counts != 0])
            ok_(n_categories == overall_rows.iloc[0][subgroup])
def check_scaled_coefficients(source, experiment_id, file_format='csv'):
    """
    Check that predictions made with the scaled coefficients match the
    scaled scores saved by the experiment.

    Parameters
    ----------
    source : str
        Name of the source directory; outputs are looked up under
        ``test_outputs/<source>/output``.
    experiment_id : str
        The experiment ID.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.

    Raises
    ------
    AssertionError
        If the two sets of predictions do not match.
    """
    def _locate(template):
        # fill in the experiment ID and extension and prepend the output dir
        return join('test_outputs',
                    source,
                    'output',
                    template.format(experiment_id, file_format))

    features_path = _locate('{}_test_preprocessed_features.{}')
    coefficients_path = _locate('{}_coefficients_scaled.{}')
    predictions_path = _locate('{}_pred_processed.{}')
    params_path = _locate('{}_postprocessing_params.{}')

    # first row of the post-processing parameters holds the trim values
    trim_params = DataReader.read_from_file(params_path).loc[0]
    df_features = DataReader.read_from_file(features_path)
    df_saved = DataReader.read_from_file(predictions_path)
    df_saved = df_saved[['spkitemid', 'sc1', 'scale']]

    # build a modeler backed by a fake SKLL learner that carries
    # the scaled coefficients
    df_coef = DataReader.read_from_file(coefficients_path)
    modeler = Modeler.load_from_learner(Modeler.create_fake_skll_learner(df_coef))

    # re-predict and relabel the prediction column from 'raw' to 'scale'
    df_regenerated = modeler.predict(df_features,
                                     trim_params['trim_min'],
                                     trim_params['trim_max'])
    df_regenerated.rename(columns={'raw': 'scale'}, inplace=True)

    # compare with columns in a canonical (sorted) order
    regenerated_sorted = df_regenerated.sort_index(axis=1)
    saved_sorted = df_saved.sort_index(axis=1)
    assert_frame_equal(regenerated_sorted,
                       saved_sorted,
                       check_exact=False,
                       check_less_precise=True)
def check_read_from_file(self, extension):
    """
    Round-trip ``self.df_train`` through a file with the given extension
    and check that ``DataReader.read_from_file()`` returns an equal frame.

    Parameters
    ----------
    extension : str
        Extension passed to ``TestDataReader.make_file_from_ext()`` to
        choose the format of the file that is written.

    Raises
    ------
    AssertionError
        If the frame read back differs from ``self.df_train``.
    """
    written_path = TestDataReader.make_file_from_ext(self.df_train, extension)

    # read the file back with `read_from_file()`, forcing the
    # ID-like columns to be parsed as strings
    id_converters = dict.fromkeys(('id', 'candidate'), str)
    df_roundtrip = DataReader.read_from_file(written_path,
                                             converters=id_converters)

    # register the file so that it is cleaned up later; this only
    # happens if no error was raised up to this point
    self.filepaths.append(written_path)

    assert_frame_equal(self.df_train, df_roundtrip)
def check_read_from_file(self, extension):
    """
    Test whether the ``read_from_file()`` method works as expected.

    ``self.df_train`` is written to disk in the format implied by
    ``extension``, read back via ``DataReader.read_from_file()``, and
    compared against the original frame.

    Parameters
    ----------
    extension : str
        Extension of the temporary file to create.
    """
    tmp_name = TestDataReader.make_file_from_ext(self.df_train, extension)

    # the 'id' and 'candidate' columns are read as strings
    string_columns = {'id': str, 'candidate': str}
    df_from_disk = DataReader.read_from_file(tmp_name, converters=string_columns)

    # remember the file name so the file gets removed at the end
    # (only reached when nothing above raised)
    self.filepaths.append(tmp_name)

    assert_frame_equal(self.df_train, df_from_disk)
def check_subgroup_outputs(output_dir, experiment_id, subgroups, file_format='csv'):
    """
    Check to make sure that the subgroup outputs look okay.

    The subgroup columns must exist in the train and test metadata, and
    the by-group data composition files must agree with the overall data
    composition file on both the total N and the number of categories.

    Parameters
    ----------
    output_dir : str
        Path to the `output` experiment output directory for a test.
    experiment_id : str
        The experiment ID.
    subgroups : list of str
        List of column names that contain grouping information.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.

    Raises
    ------
    AssertionError
        If the outputs are inconsistent.
    """
    def _in_output_dir(file_name):
        return join(output_dir, file_name)

    df_train = DataReader.read_from_file(
        _in_output_dir('{}_train_metadata.{}'.format(experiment_id, file_format)),
        index_col=0)
    df_test = DataReader.read_from_file(
        _in_output_dir('{}_test_metadata.{}'.format(experiment_id, file_format)),
        index_col=0)

    # subgroup columns must survive into both metadata files
    for name in subgroups:
        ok_(name in df_train.columns)
        ok_(name in df_test.columns)

    # overall composition: one row per partition with the total number of
    # responses and the number of categories for each subgroup
    df_overall = DataReader.read_from_file(
        _in_output_dir('{}_data_composition.{}'.format(experiment_id, file_format)))

    for name in subgroups:
        df_by_group = DataReader.read_from_file(
            _in_output_dir('{}_data_composition_by_{}.{}'.format(experiment_id,
                                                                 name,
                                                                 file_format)))

        for partition in ['Training', 'Evaluation']:
            column = '{} set'.format(partition)
            df_partition = df_overall.loc[df_overall['partition'] == partition]

            # sum of N over all categories vs. total N for the partition
            n_responses = sum(df_by_group[column])
            ok_(n_responses == df_partition.iloc[0]['responses'])

            # number of categories actually present vs. recorded count
            df_present = df_by_group.loc[df_by_group[column] != 0]
            ok_(len(df_present) == df_partition.iloc[0][name])
def check_file_output(file1, file2, file_format='csv'):
    """
    Compare two experiment output files for approximate equality.

    Both files are read into data frames, normalized (string index,
    sorted rows, integer columns cast to float, sorted columns), and then
    compared with ``assert_frame_equal`` using an inexact comparison.

    Parameters
    ----------
    file1 : str
        Path to the first file.
    file2 : str
        Path to the second file.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.

    Raises
    ------
    AssertionError
        If the two files differ; the message is prefixed with the
        base name of ``file1``.
    """
    # the main ID columns are always read as strings since numeric
    # parsing may affect merging in custom notebooks
    id_converters = dict.fromkeys(['spkitemid', 'candidate'], str)
    df1 = DataReader.read_from_file(file1, converters=id_converters)
    df2 = DataReader.read_from_file(file2, converters=id_converters)
    frames = (df1, df2)

    # a non-numeric first column becomes the index; in either case the
    # index is forced to string so that string indices are preserved
    for frame in frames:
        first_column = frame[frame.columns[0]]
        if not np.issubdtype(first_column.dtype, np.number):
            frame.index = first_column
        frame.index = frame.index.map(str)

    # order the rows alphabetically by index
    for frame in frames:
        frame.sort_index(inplace=True)

    # integer columns are promoted to float in both frames
    for frame in frames:
        for column in frame.columns:
            if frame[column].dtype == np.int64:
                frame[column] = frame[column].astype(np.float64)

    # ... and the same promotion is applied to an integer index
    for frame in frames:
        if frame.index.dtype == np.int64:
            frame.index = frame.index.astype(np.float64)

    # the sign of PCA components and factor correlations is not
    # guaranteed to be stable, so compare absolute values for those files
    sign_agnostic_suffixes = ('pca.{}'.format(file_format),
                              'factor_correlations.{}'.format(file_format))
    if file1.endswith(sign_agnostic_suffixes):
        for frame in frames:
            is_float = frame.dtypes == np.float64
            frame.loc[:, is_float] = frame.loc[:, is_float].abs()

    try:
        assert_frame_equal(df1.sort_index(axis=1),
                           df2.sort_index(axis=1),
                           check_exact=False,
                           check_less_precise=True)
    except AssertionError as error:
        # prepend the file name so that the failing file is identifiable
        error.args = ('File {} - \n{}'.format(basename(file1), error.args[0]), )
        raise
def load_rsmtool_output(self, filedir, figdir, experiment_id, prefix, groups_eval):
    """
    Load all of the outputs of an rsmtool experiment.

    For each type of output, we first check whether the file exists
    to allow comparing experiments with different sets of outputs.
    The exceptions are the files that accompany another output: the
    PCA variance file is read whenever the PCA file exists, and the
    feature outliers and percentiles files are read whenever the
    feature descriptives file exists.

    Parameters
    ----------
    filedir : str
        Path to the directory containing output files.
    figdir : str
        Path to the directory containing output figures.
    experiment_id : str
        Original ``experiment_id`` used to generate the output files.
    prefix: str
        Must be set to ``scale`` or ``raw``. Indicates whether the score
        is scaled or not.
    groups_eval: list
        List of subgroup names used for subgroup evaluation.

    Returns
    -------
    files : dict
        A ``defaultdict`` with outputs converted to pandas data frames.
        If a particular type of output did not exist for the experiment,
        looking up its key yields an empty data frame.
    figs : dict
        A dictionary mapping figure names to the paths of the
        experiment figures that were found.
    file_format : str
        The value returned by ``get_output_directory_extension()``;
        used as the extension of every output file that is read.
    """
    # NOTE(review): presumably detects the extension of the files
    # in `filedir` -- confirm against the helper's implementation
    file_format = get_output_directory_extension(filedir, experiment_id)

    # missing keys default to an empty data frame
    files = defaultdict(pd.DataFrame)
    figs = {}

    # feature distributions and the inter-feature correlations
    feature_train_file = join(filedir, '{}_train_features.{}'.format(experiment_id,
                                                                     file_format))
    if exists(feature_train_file):
        files['df_train_features'] = DataReader.read_from_file(feature_train_file)

    feature_distplots_file = join(figdir, '{}_distrib.svg'.format(experiment_id))
    if exists(feature_distplots_file):
        figs['feature_distplots'] = feature_distplots_file

    feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                    file_format))
    if exists(feature_cors_file):
        files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file,
                                                             index_col=0)

    # df_scores: keep the ID, the human score and the column named by `prefix`
    scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                              file_format))
    if exists(scores_file):
        df_scores = DataReader.read_from_file(scores_file,
                                              converters={'spkitemid': str})
        files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

    # model coefficients if present
    betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                    file_format))
    if exists(betas_file):
        files['df_coef'] = DataReader.read_from_file(betas_file,
                                                     index_col=0)
        files['df_coef'].index.name = None

    # read in the model fit files if present
    model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                            file_format))
    if exists(model_fit_file):
        files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

    # human human agreement
    consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                file_format))

    # load if consistency file is present
    if exists(consistency_file):
        df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
        files['df_consistency'] = df_consistency

    # degradation
    degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                file_format))

    # load if degradation file is present
    if exists(degradation_file):
        df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
        files['df_degradation'] = df_degradation

    # disattenuated correlations
    dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                            file_format))

    # load if disattenuated correlations is present
    if exists(dis_corr_file):
        df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
        # we only use the row for raw_trim or scale_trim score
        files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

    # read in disattenuated correlations by group, along with
    # a summary data frame for each group
    for group in groups_eval:
        group_dis_corr_file = join(filedir,
                                   '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                   group,
                                                                                   file_format))
        if exists(group_dis_corr_file):
            df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
            files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
            files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

    # true score evaluations
    true_score_eval_file = join(filedir, "{}_true_score_eval.{}".format(experiment_id,
                                                                        file_format))

    # load true score evaluations if present
    if exists(true_score_eval_file):
        df_true_score_eval = DataReader.read_from_file(true_score_eval_file, index_col=0)
        # we only use the row for raw_trim or scale_trim score
        files['df_true_score_eval'] = df_true_score_eval.loc[['{}_trim'.format(prefix)]]

    # use the raw columns or the scale columns depending on the prefix
    existing_eval_cols = (_df_eval_columns_existing_raw if prefix == 'raw'
                          else _df_eval_columns_existing_scale)
    rename_dict = raw_rename_dict if prefix == 'raw' else scale_rename_dict

    # read in the short version of the evaluation metrics for all data
    short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                          "Pearson(b)", "QWK(b)", "R2(b)", "RMSE(b)"]
    eval_file_short = join(filedir, '{}_eval_short.{}'.format(experiment_id, file_format))

    if exists(eval_file_short):
        df_eval = DataReader.read_from_file(eval_file_short, index_col=0)

        # NOTE(review): judging by its name, the helper adapts the column
        # lists to the columns actually present in `df_eval` so that
        # outputs of older versions can still be loaded -- confirm
        (rename_dict_new,
         existing_eval_cols_new,
         short_metrics_list_new,
         _) = self._modify_eval_columns_to_ensure_version_compatibilty(df_eval,
                                                                       rename_dict,
                                                                       existing_eval_cols,
                                                                       short_metrics_list)

        df_eval = df_eval[existing_eval_cols_new]
        df_eval = df_eval.rename(columns=rename_dict_new)
        files['df_eval'] = df_eval[short_metrics_list_new]
        files['df_eval'].index.name = None

    # the full evaluation file is loaded as is
    eval_file = join(filedir, '{}_eval.{}'.format(experiment_id, file_format))
    if exists(eval_file):
        files['df_eval_for_degradation'] = DataReader.read_from_file(eval_file,
                                                                     index_col=0)

    # read in the evaluation metrics by subgroup, if we are asked to
    for group in groups_eval:
        group_eval_file = join(filedir, '{}_eval_by_{}.{}'.format(experiment_id,
                                                                  group,
                                                                  file_format))
        if exists(group_eval_file):
            df_eval = DataReader.read_from_file(group_eval_file, index_col=0)

            (rename_dict_new,
             existing_eval_cols_new,
             short_metrics_list_new,
             smd_name
             ) = self._modify_eval_columns_to_ensure_version_compatibilty(df_eval,
                                                                          rename_dict,
                                                                          existing_eval_cols,
                                                                          short_metrics_list,
                                                                          raise_warnings=False)

            # if `SMD` is being used, rather than `DSM`, we let the user know
            # by emitting a `DeprecationWarning` that names the affected file
            if smd_name == 'SMD':
                warnings.warn("The subgroup evaluations in `{}` use 'SMD'. Please note "
                              "that newer versions of RSMTool (7.0 or greater) use 'DSM' with subgroup "
                              "evaluations. For additional details on how these metrics "
                              "differ, see the RSMTool documentation. Comparisons with experiments "
                              "using SMD for subgroup calculations will be deprecated in the next major "
                              "release.".format(group_eval_file), category=DeprecationWarning)

            df_eval = df_eval[existing_eval_cols_new]
            df_eval = df_eval.rename(columns=rename_dict_new)
            files['df_eval_by_{}'.format(group)] = df_eval[short_metrics_list_new]
            files['df_eval_by_{}'.format(group)].index.name = None

            series = files['df_eval_by_{}'.format(group)]
            files['df_eval_by_{}_overview'.format(group)] = self.make_summary_stat_df(series)

            # set the ordering of mean/SD/SMD statistics; the last two
            # columns are named after whichever of SMD/DSM the file uses
            files['df_eval_by_{}_m_sd'.format(group)] = df_eval[['N', 'H1 mean',
                                                                 'H1 SD',
                                                                 'score mean(br)',
                                                                 'score SD(br)',
                                                                 'score mean(b)',
                                                                 'score SD(b)',
                                                                 '{}(br)'.format(smd_name),
                                                                 '{}(b)'.format(smd_name)]]
            files['df_eval_by_{}_m_sd'.format(group)].index.name = None

    # read in the partial correlations vs. score for all data
    pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                       file_format))
    if exists(pcor_score_file):
        files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
        files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

    # read in the partial correlations by subgroups, if we are asked to
    for group in groups_eval:
        group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                        group,
                                                                        file_format))
        if exists(group_pcor_file):
            files['df_pcor_sc1_by_{}'
                  ''.format(group)] = DataReader.read_from_file(group_pcor_file,
                                                                index_col=0)

            series = files['df_pcor_sc1_by_{}'.format(group)]
            files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

    # read in the marginal correlations vs. score for all data
    mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                          file_format))
    if exists(mcor_score_file):
        files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
        files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

    # read in the marginal correlations by subgroups, if we are asked to
    for group in groups_eval:
        group_mcor_file = join(filedir,
                               '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                  group,
                                                                  file_format))
        if exists(group_mcor_file):
            files['df_mcor_sc1_by_{}'
                  ''.format(group)] = DataReader.read_from_file(group_mcor_file,
                                                                index_col=0)

            series = files['df_mcor_sc1_by_{}'.format(group)]
            files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

    # the PCA variance file is read without an existence check
    # whenever the PCA file itself is present
    pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
    if exists(pca_file):
        files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
        files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                            '{}_pcavar.{}'.format(experiment_id,
                                                                                  file_format)),
                                                       index_col=0)

    descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                          file_format))
    if exists(descriptives_file):
        # we read all files pertaining to the descriptive analysis together
        # since we merge the outputs; the outliers and percentiles files
        # are therefore read without their own existence checks
        files['df_descriptives'] = DataReader.read_from_file(descriptives_file,
                                                             index_col=0)

        # keep the N, min and max columns for each feature; these are
        # merged into the outliers and percentiles tables below
        df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

        files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                             'skewness', 'kurtosis']]

        outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                      file_format))
        df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
        df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                  'lower': 'Lower',
                                                  'both': 'Both',
                                                  'upperperc': 'Upper %',
                                                  'lowerperc': 'Lower %',
                                                  'bothperc': 'Both %'})
        df_outliers_columns = df_outliers.columns.tolist()
        files['df_outliers'] = df_outliers

        # join with df_features_n_values to get the value of N and
        # put N in front of the outlier columns
        files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                        left_index=True,
                                        right_index=True)[['N'] + df_outliers_columns]

        # read the percentiles and join them with df_features_n_values
        # to add the N, min and max columns
        percentiles_file = join(filedir, '{}_feature_descriptives'
                                         'Extra.{}'.format(experiment_id,
                                                           file_format))

        files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                            index_col=0)
        files['df_percentiles'] = pd.merge(files['df_percentiles'],
                                           df_features_n_values,
                                           left_index=True,
                                           right_index=True)

        # express the outlier counts as a percentage of N
        mild_outliers = (files['df_percentiles']["Mild outliers"] /
                         files['df_percentiles']["N"].astype(float) * 100)

        files['df_percentiles']["Mild outliers (%)"] = mild_outliers

        extreme_outliers = (files['df_percentiles']["Extreme outliers"] /
                            files['df_percentiles']["N"].astype(float) * 100)

        files['df_percentiles']["Extreme outliers (%)"] = extreme_outliers

        # fix the column order of the percentiles table
        files['df_percentiles'] = files['df_percentiles'][['N', 'min', 'max',
                                                           '1%', '5%', '25%',
                                                           '50%', '75%', '95%',
                                                           '99%', 'IQR', 'Mild outliers',
                                                           'Mild outliers (%)',
                                                           'Extreme outliers',
                                                           'Extreme outliers (%)']]

    # confusion matrix, post-processed by `process_confusion_matrix()`
    confmatrix_file = join(filedir, '{}_confMatrix.{}'.format(experiment_id, file_format))
    if exists(confmatrix_file):
        conf_matrix = DataReader.read_from_file(confmatrix_file, index_col=0)
        files['df_confmatrix'] = self.process_confusion_matrix(conf_matrix)

    # score distribution; the second column of the file is used as the
    # index and the `sys_<prefix>` column is renamed to `sys`
    score_dist_file = join(filedir, '{}_score_dist.{}'.format(experiment_id, file_format))
    if exists(score_dist_file):
        df_score_dist = DataReader.read_from_file(score_dist_file, index_col=1)
        df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'}, inplace=True)
        files['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

    # read in the feature boxplots by subgroup, if we were asked to;
    # the SVG version takes precedence over the PNG version
    for group in groups_eval:
        feature_boxplot_prefix = join(figdir,
                                      '{}_feature_boxplot_by_{}'.format(experiment_id,
                                                                        group))
        svg_file = join(feature_boxplot_prefix + '.svg')
        png_file = join(feature_boxplot_prefix + '.png')
        if exists(svg_file):
            figs['feature_boxplots_by_{}_svg'.format(group)] = svg_file
        elif exists(png_file):
            figs['feature_boxplots_by_{}_png'.format(group)] = png_file

    # read in the betas image if exists
    betas_svg = join(figdir, '{}_betas.svg'.format(experiment_id))
    if exists(betas_svg):
        figs['betas'] = betas_svg

    # read in the evaluation barplots by subgroup, if we were asked to
    for group in groups_eval:
        eval_barplot_svg_file = join(figdir, '{}_eval_by_{}.svg'.format(experiment_id,
                                                                        group))
        if exists(eval_barplot_svg_file):
            figs['eval_barplot_by_{}'.format(group)] = eval_barplot_svg_file

    # PCA scree plot
    pca_svg_file = join(figdir, '{}_pca.svg'.format(experiment_id))
    if exists(pca_svg_file):
        figs['pca_scree_plot'] = pca_svg_file

    return (files, figs, file_format)
def load_rsmtool_output(self, filedir, figdir, experiment_id, prefix, groups_eval):
    """
    Load all of the outputs of an rsmtool experiment.

    For each type of output, we first check whether the file exists
    to allow comparing experiments with different sets of outputs.
    The exceptions are the files that accompany another output: the
    PCA variance file is read whenever the PCA file exists, and the
    feature outliers and percentiles files are read whenever the
    feature descriptives file exists.

    Parameters
    ----------
    filedir : str
        Path to the directory containing output files.
    figdir : str
        Path to the directory containing output figures.
    experiment_id : str
        Original ``experiment_id`` used to generate the output files.
    prefix: str
        Must be set to ``scale`` or ``raw``. Indicates whether the score
        is scaled or not.
    groups_eval: list
        List of subgroup names used for subgroup evaluation.

    Returns
    -------
    files : dict
        A ``defaultdict`` with outputs converted to pandas data frames.
        If a particular type of output did not exist for the experiment,
        looking up its key yields an empty data frame.
    figs : dict
        A dictionary mapping figure names to the paths of the
        experiment figures that were found.
    file_format : str
        The value returned by ``get_output_directory_extension()``;
        used as the extension of every output file that is read.
    """
    # NOTE(review): presumably detects the extension of the files
    # in `filedir` -- confirm against the helper's implementation
    file_format = get_output_directory_extension(filedir, experiment_id)

    # missing keys default to an empty data frame
    files = defaultdict(pd.DataFrame)
    figs = {}

    # feature distributions and the inter-feature correlations
    feature_train_file = join(filedir, '{}_train_features.{}'.format(experiment_id,
                                                                     file_format))
    if exists(feature_train_file):
        files['df_train_features'] = DataReader.read_from_file(feature_train_file)

    # figures are stored as paths, not as file contents
    feature_distplots_file = join(figdir, '{}_distrib.svg'.format(experiment_id))
    if exists(feature_distplots_file):
        figs['feature_distplots'] = feature_distplots_file

    feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                    file_format))
    if exists(feature_cors_file):
        files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file,
                                                             index_col=0)

    # df_scores: keep the ID, the human score and the column named by `prefix`
    scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                              file_format))
    if exists(scores_file):
        df_scores = DataReader.read_from_file(scores_file,
                                              converters={'spkitemid': str})
        files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

    # model coefficients if present
    betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                    file_format))
    if exists(betas_file):
        files['df_coef'] = DataReader.read_from_file(betas_file,
                                                     index_col=0)
        files['df_coef'].index.name = None

    # read in the model fit files if present
    model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                            file_format))
    if exists(model_fit_file):
        files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

    # human human agreement
    consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                file_format))

    # load if consistency file is present
    if exists(consistency_file):
        df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
        files['df_consistency'] = df_consistency

    # degradation
    degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                file_format))

    # load if degradation file is present
    if exists(degradation_file):
        df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
        files['df_degradation'] = df_degradation

    # disattenuated correlations
    dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                            file_format))

    # load if disattenuated correlations is present
    if exists(dis_corr_file):
        df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
        # we only use the row for raw_trim or scale_trim score
        files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

    # read in disattenuated correlations by group, along with
    # a summary data frame for each group
    for group in groups_eval:
        group_dis_corr_file = join(filedir,
                                   '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                   group,
                                                                                   file_format))
        if exists(group_dis_corr_file):
            df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
            files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
            files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

    # use the raw columns or the scale columns depending on the prefix
    existing_eval_cols = (_df_eval_columns_existing_raw if prefix == 'raw'
                          else _df_eval_columns_existing_scale)
    rename_dict = raw_rename_dict if prefix == 'raw' else scale_rename_dict

    # read in the short version of the evaluation metrics for all data
    short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                          "Pearson(b)", "QWK(br)", "R2(b)", "RMSE(b)"]
    eval_file_short = join(filedir, '{}_eval_short.{}'.format(experiment_id, file_format))

    if exists(eval_file_short):
        df_eval = DataReader.read_from_file(eval_file_short, index_col=0)
        # select the prefix-specific columns, rename them and then keep
        # only the short list of metrics
        df_eval = df_eval[existing_eval_cols]
        df_eval = df_eval.rename(columns=rename_dict)
        files['df_eval'] = df_eval[short_metrics_list]
        files['df_eval'].index.name = None

    # the full evaluation file is loaded as is
    eval_file = join(filedir, '{}_eval.{}'.format(experiment_id, file_format))
    if exists(eval_file):
        files['df_eval_for_degradation'] = DataReader.read_from_file(eval_file,
                                                                     index_col=0)

    # read in the evaluation metrics by subgroup, if we are asked to
    for group in groups_eval:
        group_eval_file = join(filedir, '{}_eval_by_{}.{}'.format(experiment_id,
                                                                  group,
                                                                  file_format))
        if exists(group_eval_file):
            df_eval = DataReader.read_from_file(group_eval_file, index_col=0)
            df_eval = df_eval[existing_eval_cols]
            df_eval = df_eval.rename(columns=rename_dict)
            files['df_eval_by_{}'.format(group)] = df_eval[short_metrics_list]
            files['df_eval_by_{}'.format(group)].index.name = None

            series = files['df_eval_by_{}'.format(group)]
            files['df_eval_by_{}_overview'.format(group)] = self.make_summary_stat_df(series)

            # set the ordering of mean/SD/SMD statistics
            files['df_eval_by_{}_m_sd'.format(group)] = df_eval[['N', 'H1 mean',
                                                                 'H1 SD',
                                                                 'score mean(br)',
                                                                 'score SD(br)',
                                                                 'score mean(b)',
                                                                 'score SD(b)',
                                                                 'SMD(br)',
                                                                 'SMD(b)']]
            files['df_eval_by_{}_m_sd'.format(group)].index.name = None

    # read in the partial correlations vs. score for all data
    pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                       file_format))
    if exists(pcor_score_file):
        files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
        files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

    # read in the partial correlations by subgroups, if we are asked to
    for group in groups_eval:
        group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                        group,
                                                                        file_format))
        if exists(group_pcor_file):
            files['df_pcor_sc1_by_{}'
                  ''.format(group)] = DataReader.read_from_file(group_pcor_file,
                                                                index_col=0)

            series = files['df_pcor_sc1_by_{}'.format(group)]
            files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

    # read in the marginal correlations vs. score for all data
    mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                          file_format))
    if exists(mcor_score_file):
        files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
        files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

    # read in the marginal correlations by subgroups, if we are asked to
    for group in groups_eval:
        group_mcor_file = join(filedir,
                               '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                  group,
                                                                  file_format))
        if exists(group_mcor_file):
            files['df_mcor_sc1_by_{}'
                  ''.format(group)] = DataReader.read_from_file(group_mcor_file,
                                                                index_col=0)

            series = files['df_mcor_sc1_by_{}'.format(group)]
            files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

    # the PCA variance file is read without an existence check
    # whenever the PCA file itself is present
    pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
    if exists(pca_file):
        files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
        files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                            '{}_pcavar.{}'.format(experiment_id,
                                                                                  file_format)),
                                                       index_col=0)

    descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                          file_format))
    if exists(descriptives_file):
        # we read all files pertaining to the descriptive analysis together
        # since we merge the outputs; the outliers and percentiles files
        # are therefore read without their own existence checks
        files['df_descriptives'] = DataReader.read_from_file(descriptives_file,
                                                             index_col=0)

        # keep the N, min and max columns for each feature; these are
        # merged into the outliers and percentiles tables below
        df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

        files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                             'skewness', 'kurtosis']]

        outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                      file_format))
        df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
        df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                  'lower': 'Lower',
                                                  'both': 'Both',
                                                  'upperperc': 'Upper %',
                                                  'lowerperc': 'Lower %',
                                                  'bothperc': 'Both %'})
        df_outliers_columns = df_outliers.columns.tolist()
        files['df_outliers'] = df_outliers

        # join with df_features_n_values to get the value of N and
        # put N in front of the outlier columns
        files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                        left_index=True,
                                        right_index=True)[['N'] + df_outliers_columns]

        # read the percentiles and join them with df_features_n_values
        # to add the N, min and max columns
        percentiles_file = join(filedir, '{}_feature_descriptives'
                                         'Extra.{}'.format(experiment_id,
                                                           file_format))

        files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                            index_col=0)
        files['df_percentiles'] = pd.merge(files['df_percentiles'],
                                           df_features_n_values,
                                           left_index=True,
                                           right_index=True)

        # express the outlier counts as a percentage of N
        mild_outliers = (files['df_percentiles']["Mild outliers"] /
                         files['df_percentiles']["N"].astype(float) * 100)

        files['df_percentiles']["Mild outliers (%)"] = mild_outliers

        extreme_outliers = (files['df_percentiles']["Extreme outliers"] /
                            files['df_percentiles']["N"].astype(float) * 100)

        files['df_percentiles']["Extreme outliers (%)"] = extreme_outliers

        # fix the column order of the percentiles table
        files['df_percentiles'] = files['df_percentiles'][['N', 'min', 'max',
                                                           '1%', '5%', '25%',
                                                           '50%', '75%', '95%',
                                                           '99%', 'IQR', 'Mild outliers',
                                                           'Mild outliers (%)',
                                                           'Extreme outliers',
                                                           'Extreme outliers (%)']]

    # confusion matrix, post-processed by `process_confusion_matrix()`
    confmatrix_file = join(filedir, '{}_confMatrix.{}'.format(experiment_id, file_format))
    if exists(confmatrix_file):
        conf_matrix = DataReader.read_from_file(confmatrix_file, index_col=0)
        files['df_confmatrix'] = self.process_confusion_matrix(conf_matrix)

    # score distribution; the second column of the file is used as the
    # index and the `sys_<prefix>` column is renamed to `sys`
    score_dist_file = join(filedir, '{}_score_dist.{}'.format(experiment_id, file_format))
    if exists(score_dist_file):
        df_score_dist = DataReader.read_from_file(score_dist_file, index_col=1)
        df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'}, inplace=True)
        files['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

    # read in the feature boxplots by subgroup, if we were asked to;
    # the SVG version takes precedence over the PNG version
    for group in groups_eval:
        feature_boxplot_prefix = join(figdir,
                                      '{}_feature_boxplot_by_{}'.format(experiment_id,
                                                                        group))
        svg_file = join(feature_boxplot_prefix + '.svg')
        png_file = join(feature_boxplot_prefix + '.png')
        if exists(svg_file):
            figs['feature_boxplots_by_{}_svg'.format(group)] = svg_file
        elif exists(png_file):
            figs['feature_boxplots_by_{}_png'.format(group)] = png_file

    # read in the betas image if exists
    betas_svg = join(figdir, '{}_betas.svg'.format(experiment_id))
    if exists(betas_svg):
        figs['betas'] = betas_svg

    # read in the evaluation barplots by subgroup, if we were asked to
    for group in groups_eval:
        eval_barplot_svg_file = join(figdir, '{}_eval_by_{}.svg'.format(experiment_id,
                                                                        group))
        if exists(eval_barplot_svg_file):
            figs['eval_barplot_by_{}'.format(group)] = eval_barplot_svg_file

    # PCA scree plot
    pca_svg_file = join(figdir, '{}_pca.svg'.format(experiment_id))
    if exists(pca_svg_file):
        figs['pca_scree_plot'] = pca_svg_file

    return (files, figs, file_format)