Example no. 1
0
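The examples below use several names without their imports. A minimal sketch of the imports they appear to assume (the rsmtool module paths are guesses based on the package layout and may differ across versions; ``test_dir`` and module-level constants such as ``raw_rename_dict`` are assumed to be defined elsewhere):

import warnings
from collections import defaultdict
from os.path import exists, join

import pandas as pd

from nose.tools import eq_, raises        # nose-style test helpers
from rsmtool.reader import DataReader
from rsmtool.utils import get_output_directory_extension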
    def load_rsmtool_output(self, filedir, figdir, experiment_id, prefix, groups_eval):
        """
        Function to load all of the outputs of an rsmtool experiment.
        For each type of output, we first check whether the file exists
        to allow comparing experiments with different sets of outputs.

        Parameters
        ----------
        filedir : str
            Path to the directory containing output files.
        figdir : str
            Path to the directory containing output figures.
        experiment_id : str
            Original ``experiment_id`` used to generate the output files.
        prefix: str
            Must be set to ``scale`` or ``raw``. Indicates whether the score
            is scaled or not.
        groups_eval: list
            List of subgroup names used for subgroup evaluation.

        Returns
        -------
        files : dict
            A dictionary with outputs converted to pandas data
            frames. If a particular type of output did not exist for the
            experiment, its value will be an empty data frame.
        figs: dict
            A dictionary with experiment figures.
        """

        file_format = get_output_directory_extension(filedir, experiment_id)

        files = defaultdict(pd.DataFrame)
        figs = {}

        # feature distributions and the inter-feature correlations
        feature_train_file = join(filedir, '{}_train_features.{}'.format(experiment_id,
                                                                         file_format))
        if exists(feature_train_file):
            files['df_train_features'] = DataReader.read_from_file(feature_train_file)

        feature_distplots_file = join(figdir, '{}_distrib.svg'.format(experiment_id))
        if exists(feature_distplots_file):
            figs['feature_distplots'] = feature_distplots_file

        feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                        file_format))
        if exists(feature_cors_file):
            files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file, index_col=0)

        # df_scores
        scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                                  file_format))
        if exists(scores_file):
            df_scores = DataReader.read_from_file(scores_file, converters={'spkitemid': str})
            files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

        # model coefficients if present
        betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                        file_format))
        if exists(betas_file):
            files['df_coef'] = DataReader.read_from_file(betas_file, index_col=0)
            files['df_coef'].index.name = None

        # read in the model fit files if present
        model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                                file_format))
        if exists(model_fit_file):
            files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

        # human-human agreement
        consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                    file_format))

        # load if consistency file is present
        if exists(consistency_file):
            df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
            files['df_consistency'] = df_consistency

        # degradation
        degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                    file_format))

        # load if degradation file is present
        if exists(degradation_file):
            df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
            files['df_degradation'] = df_degradation

        # disattenuated correlations
        dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                                file_format))

        # load if the disattenuated correlations file is present
        if exists(dis_corr_file):
            df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
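            # (double brackets keep the result a one-row data frame, not a series)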
            files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

        # read in disattenuated correlations by group
        for group in groups_eval:
            group_dis_corr_file = join(filedir,
                                       '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                       group,
                                                                                       file_format))
            if exists(group_dis_corr_file):
                df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
                files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
                files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

        # true score evaluations
        true_score_eval_file = join(filedir, "{}_true_score_eval.{}".format(experiment_id,
                                                                            file_format))

        # load true score evaluations if present
        if exists(true_score_eval_file):
            df_true_score_eval = DataReader.read_from_file(true_score_eval_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
            files['df_true_score_eval'] = df_true_score_eval.loc[['{}_trim'.format(prefix)]]

        # use the raw columns or the scale columns depending on the prefix
        existing_eval_cols = (_df_eval_columns_existing_raw if prefix == 'raw'
                              else _df_eval_columns_existing_scale)
        rename_dict = raw_rename_dict if prefix == 'raw' else scale_rename_dict

        # read in the short version of the evaluation metrics for all data
        short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                              "Pearson(b)", "QWK(b)", "R2(b)", "RMSE(b)"]
        eval_file_short = join(filedir, '{}_eval_short.{}'.format(experiment_id, file_format))

        if exists(eval_file_short):
            df_eval = DataReader.read_from_file(eval_file_short, index_col=0)

            (rename_dict_new,
             existing_eval_cols_new,
             short_metrics_list_new,
             _) = self._modify_eval_columns_to_ensure_version_compatibilty(df_eval,
                                                                           rename_dict,
                                                                           existing_eval_cols,
                                                                           short_metrics_list)

            df_eval = df_eval[existing_eval_cols_new]
            df_eval = df_eval.rename(columns=rename_dict_new)
            files['df_eval'] = df_eval[short_metrics_list_new]
            files['df_eval'].index.name = None

        eval_file = join(filedir, '{}_eval.{}'.format(experiment_id, file_format))
        if exists(eval_file):
            files['df_eval_for_degradation'] = DataReader.read_from_file(eval_file, index_col=0)

        # read in the evaluation metrics by subgroup, if we are asked to
        for group in groups_eval:
            group_eval_file = join(filedir, '{}_eval_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_eval_file):
                df_eval = DataReader.read_from_file(group_eval_file, index_col=0)

                (rename_dict_new,
                 existing_eval_cols_new,
                 short_metrics_list_new,
                 smd_name
                 ) = self._modify_eval_columns_to_ensure_version_compatibilty(df_eval,
                                                                              rename_dict,
                                                                              existing_eval_cols,
                                                                              short_metrics_list,
                                                                              raise_warnings=False)

                # if `SMD` is being used rather than `DSM`, the output comes from
                # an older RSMTool version, so we raise a deprecation warning and
                # point the user to the documentation
                if smd_name == 'SMD':
                    warnings.warn("The subgroup evaluations in `{}` use 'SMD'. Please note "
                                  "that newer versions of RSMTool (7.0 or greater) use 'DSM' with subgroup "
                                  "evaluations. For additional details on how these metrics "
                                  "differ, see the RSMTool documentation. Comparisons with experiments "
                                  "using SMD for subgroup calculations will be deprecated in the next major "
                                  "release.".format(group_eval_file), category=DeprecationWarning)

                df_eval = df_eval[existing_eval_cols_new]
                df_eval = df_eval.rename(columns=rename_dict_new)
                files['df_eval_by_{}'.format(group)] = df_eval[short_metrics_list_new]
                files['df_eval_by_{}'.format(group)].index.name = None

                series = files['df_eval_by_{}'.format(group)]
                files['df_eval_by_{}_overview'.format(group)] = self.make_summary_stat_df(series)

                # set the ordering of the mean/SD/SMD (or DSM) statistics
                files['df_eval_by_{}_m_sd'.format(group)] = df_eval[['N', 'H1 mean',
                                                                     'H1 SD', 'score mean(br)',
                                                                     'score SD(br)',
                                                                     'score mean(b)',
                                                                     'score SD(b)',
                                                                     '{}(br)'.format(smd_name),
                                                                     '{}(b)'.format(smd_name)]]
                files['df_eval_by_{}_m_sd'.format(group)].index.name = None

        # read in the partial correlations vs. score for all data
        pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                           file_format))
        if exists(pcor_score_file):
            files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
            files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

        # read in the partial correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                            group,
                                                                            file_format))
            if exists(group_pcor_file):
                files['df_pcor_sc1_by_{}'.format(group)] = \
                    DataReader.read_from_file(group_pcor_file, index_col=0)

                series = files['df_pcor_sc1_by_{}'.format(group)]
                files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        # read in the marginal correlations vs. score for all data
        mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                              file_format))
        if exists(mcor_score_file):
            files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
            files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

        # read in the marginal correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_mcor_file = join(filedir,
                                   '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_mcor_file):
                files['df_mcor_sc1_by_{}'.format(group)] = \
                    DataReader.read_from_file(group_mcor_file, index_col=0)

                series = files['df_mcor_sc1_by_{}'.format(group)]
                files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
        if exists(pca_file):
            files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
            files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                                '{}_pcavar.{}'.format(experiment_id,
                                                                                      file_format)),
                                                           index_col=0)

        descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                              file_format))
        if exists(descriptives_file):
            # we read all files pertaining to the descriptive analysis together
            # since we merge the outputs
            files['df_descriptives'] = DataReader.read_from_file(descriptives_file, index_col=0)

            # this df contains the feature count (N) along with min and max
            # values; N is merged into two later tables below
            df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

            files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                                 'skewness', 'kurtosis']]

            outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                          file_format))
            df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
            df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                      'lower': 'Lower',
                                                      'both': 'Both',
                                                      'upperperc': 'Upper %',
                                                      'lowerperc': 'Lower %',
                                                      'bothperc': 'Both %'})
            df_outliers_columns = df_outliers.columns.tolist()
            files['df_outliers'] = df_outliers

            # join with df_features_n_values to get the value of N
            files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                            left_index=True,
                                            right_index=True)[['N'] + df_outliers_columns]

            # read in the extra descriptives (percentiles) file
            percentiles_file = join(filedir, '{}_feature_descriptives'
                                             'Extra.{}'.format(experiment_id,
                                                               file_format))

            files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                                index_col=0)
            files['df_percentiles'] = pd.merge(files['df_percentiles'],
                                               df_features_n_values,
                                               left_index=True,
                                               right_index=True)

            mild_outliers = (files['df_percentiles']["Mild outliers"] /
                             files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Mild outliers (%)"] = mild_outliers

            extreme_outliers = (files['df_percentiles']["Extreme outliers"] /
                                files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Extreme outliers (%)"] = extreme_outliers

            files['df_percentiles'] = files['df_percentiles'][['N', 'min', 'max',
                                                               '1%', '5%', '25%',
                                                               '50%', '75%', '95%',
                                                               '99%', 'IQR', 'Mild outliers',
                                                               'Mild outliers (%)',
                                                               'Extreme outliers',
                                                               'Extreme outliers (%)']]

        confmatrix_file = join(filedir, '{}_confMatrix.{}'.format(experiment_id, file_format))
        if exists(confmatrix_file):
            conf_matrix = DataReader.read_from_file(confmatrix_file, index_col=0)
            files['df_confmatrix'] = self.process_confusion_matrix(conf_matrix)

        score_dist_file = join(filedir, '{}_score_dist.{}'.format(experiment_id, file_format))
        if exists(score_dist_file):
            df_score_dist = DataReader.read_from_file(score_dist_file, index_col=1)
            df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'}, inplace=True)
            files['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

        # read in the feature boxplots by subgroup, if we were asked to
        for group in groups_eval:
            feature_boxplot_prefix = join(figdir,
                                          '{}_feature_boxplot_by_{}'.format(experiment_id, group))
            svg_file = feature_boxplot_prefix + '.svg'
            png_file = feature_boxplot_prefix + '.png'
            if exists(svg_file):
                figs['feature_boxplots_by_{}_svg'.format(group)] = svg_file

            elif exists(png_file):
                figs['feature_boxplots_by_{}_png'.format(group)] = png_file

        # read in the betas image if it exists
        betas_svg = join(figdir, '{}_betas.svg'.format(experiment_id))
        if exists(betas_svg):
            figs['betas'] = betas_svg

        # read in the evaluation barplots by subgroup, if we were asked to
        for group in groups_eval:
            eval_barplot_svg_file = join(figdir, '{}_eval_by_{}.svg'.format(experiment_id, group))
            if exists(eval_barplot_svg_file):
                figs['eval_barplot_by_{}'.format(group)] = eval_barplot_svg_file

        pca_svg_file = join(figdir, '{}_pca.svg'.format(experiment_id))
        if exists(pca_svg_file):
            figs['pca_scree_plot'] = pca_svg_file

        return (files, figs, file_format)
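
A minimal usage sketch, assuming the method above belongs to rsmtool's Comparer class and that an experiment with id ``id_1`` has already produced its outputs; the paths and subgroup name below are illustrative, not taken from the example:

from rsmtool.comparer import Comparer   # assumed location of the class above

comparer = Comparer()
files, figs, file_format = comparer.load_rsmtool_output(
    filedir='experiment/output',    # tabular outputs (csv/tsv/xlsx)
    figdir='experiment/figure',     # saved figures
    experiment_id='id_1',
    prefix='scale',                 # compare the scaled scores
    groups_eval=['L1'])             # subgroups to load, if any

print(files['df_eval'])             # an empty data frame if the file was absent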
    def load_rsmtool_output(self, filedir, figdir, experiment_id, prefix, groups_eval):
        """
        Function to load all of the outputs of an rsmtool experiment.
        For each type of output, we first check whether the file exists
        to allow comparing experiments with different sets of outputs.

        Parameters
        ----------
        filedir : str
            Path to the directory containing output files.
        figdir : str
            Path to the directory containing output figures.
        experiment_id : str
            Original ``experiment_id`` used to generate the output files.
        prefix: str
            Must be set to ``scale`` or ``raw``. Indicates whether the score
            is scaled or not.
        groups_eval: list
            List of subgroup names used for subgroup evaluation.

        Returns
        -------
        files : dict
            A dictionary with outputs converted to pandas data
            frames. If a particular type of output did not exist for the
            experiment, its value will be an empty data frame.
        figs: dict
            A dictionary with experiment figures.
        """

        file_format = get_output_directory_extension(filedir, experiment_id)

        files = defaultdict(pd.DataFrame)
        figs = {}

        # feature distributions and the inter-feature correlations
        feature_train_file = join(filedir, '{}_train_features.{}'.format(experiment_id,
                                                                         file_format))
        if exists(feature_train_file):
            files['df_train_features'] = DataReader.read_from_file(feature_train_file)

        feature_distplots_file = join(figdir, '{}_distrib.svg'.format(experiment_id))
        if exists(feature_distplots_file):
            figs['feature_distplots'] = feature_distplots_file

        feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                        file_format))
        if exists(feature_cors_file):
            files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file, index_col=0)

        # df_scores
        scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                                  file_format))
        if exists(scores_file):
            df_scores = DataReader.read_from_file(scores_file, converters={'spkitemid': str})
            files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

        # model coefficients if present
        betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                        file_format))
        if exists(betas_file):
            files['df_coef'] = DataReader.read_from_file(betas_file, index_col=0)
            files['df_coef'].index.name = None

        # read in the model fit files if present
        model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                                file_format))
        if exists(model_fit_file):
            files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

        # human-human agreement
        consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                    file_format))

        # load if consistency file is present
        if exists(consistency_file):
            df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
            files['df_consistency'] = df_consistency

        # degradation
        degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                    file_format))

        # load if degradation file is present
        if exists(degradation_file):
            df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
            files['df_degradation'] = df_degradation

        # disattenuated correlations
        dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                                file_format))

        # load if the disattenuated correlations file is present
        if exists(dis_corr_file):
            df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
            files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

        # read in disattenuated correlations by group
        for group in groups_eval:
            group_dis_corr_file = join(filedir,
                                       '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                       group,
                                                                                       file_format))
            if exists(group_dis_corr_file):
                df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
                files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
                files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

        # use the raw columns or the scale columns depending on the prefix
        existing_eval_cols = (_df_eval_columns_existing_raw if prefix == 'raw'
                              else _df_eval_columns_existing_scale)
        rename_dict = raw_rename_dict if prefix == 'raw' else scale_rename_dict

        # read in the short version of the evaluation metrics for all data
        short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                              "Pearson(b)", "QWK(br)", "R2(b)", "RMSE(b)"]
        eval_file_short = join(filedir, '{}_eval_short.{}'.format(experiment_id, file_format))

        if exists(eval_file_short):
            df_eval = DataReader.read_from_file(eval_file_short, index_col=0)
            df_eval = df_eval[existing_eval_cols]
            df_eval = df_eval.rename(columns=rename_dict)
            files['df_eval'] = df_eval[short_metrics_list]
            files['df_eval'].index.name = None

        eval_file = join(filedir, '{}_eval.{}'.format(experiment_id, file_format))
        if exists(eval_file):
            files['df_eval_for_degradation'] = DataReader.read_from_file(eval_file, index_col=0)

        # read in the evaluation metrics by subgroup, if we are asked to
        for group in groups_eval:
            group_eval_file = join(filedir, '{}_eval_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_eval_file):
                df_eval = DataReader.read_from_file(group_eval_file, index_col=0)
                df_eval = df_eval[existing_eval_cols]
                df_eval = df_eval.rename(columns=rename_dict)
                files['df_eval_by_{}'.format(group)] = df_eval[short_metrics_list]
                files['df_eval_by_{}'.format(group)].index.name = None

                series = files['df_eval_by_{}'.format(group)]
                files['df_eval_by_{}_overview'.format(group)] = self.make_summary_stat_df(series)

                # set the ordering of mean/SD/SMD statistics
                files['df_eval_by_{}_m_sd'.format(group)] = df_eval[['N', 'H1 mean',
                                                                     'H1 SD', 'score mean(br)',
                                                                     'score SD(br)',
                                                                     'score mean(b)',
                                                                     'score SD(b)',
                                                                     'SMD(br)', 'SMD(b)']]
                files['df_eval_by_{}_m_sd'.format(group)].index.name = None

        # read in the partial correlations vs. score for all data
        pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                           file_format))
        if exists(pcor_score_file):
            files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
            files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

        # read in the partial correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                            group,
                                                                            file_format))
            if exists(group_pcor_file):
                files['df_pcor_sc1_by_{}'.format(group)] = \
                    DataReader.read_from_file(group_pcor_file, index_col=0)

                series = files['df_pcor_sc1_by_{}'.format(group)]
                files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        # read in the marginal correlations vs. score for all data
        mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                              file_format))
        if exists(mcor_score_file):
            files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
            files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

        # read in the marginal correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_mcor_file = join(filedir,
                                   '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_mcor_file):
                files['df_mcor_sc1_by_{}'.format(group)] = \
                    DataReader.read_from_file(group_mcor_file, index_col=0)

                series = files['df_mcor_sc1_by_{}'.format(group)]
                files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
        if exists(pca_file):
            files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
            files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                                '{}_pcavar.{}'.format(experiment_id,
                                                                                      file_format)),
                                                           index_col=0)

        descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                              file_format))
        if exists(descriptives_file):
            # we read all files pertaining to the descriptive analysis together
            # since we merge the outputs
            files['df_descriptives'] = DataReader.read_from_file(descriptives_file, index_col=0)

            # this df contains the feature count (N) along with min and max
            # values; N is merged into two later tables below
            df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

            files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                                 'skewness', 'kurtosis']]

            outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                          file_format))
            df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
            df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                      'lower': 'Lower',
                                                      'both': 'Both',
                                                      'upperperc': 'Upper %',
                                                      'lowerperc': 'Lower %',
                                                      'bothperc': 'Both %'})
            df_outliers_columns = df_outliers.columns.tolist()
            files['df_outliers'] = df_outliers

            # join with df_features_n_values to get the value of N
            files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                            left_index=True,
                                            right_index=True)[['N'] + df_outliers_columns]

            # read in the extra descriptives (percentiles) file
            percentiles_file = join(filedir, '{}_feature_descriptives'
                                             'Extra.{}'.format(experiment_id,
                                                               file_format))

            files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                                index_col=0)
            files['df_percentiles'] = pd.merge(files['df_percentiles'],
                                               df_features_n_values,
                                               left_index=True,
                                               right_index=True)

            mild_outliers = (files['df_percentiles']["Mild outliers"] /
                             files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Mild outliers (%)"] = mild_outliers

            extreme_outliers = (files['df_percentiles']["Extreme outliers"] /
                                files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Extreme outliers (%)"] = extreme_outliers

            files['df_percentiles'] = files['df_percentiles'][['N', 'min', 'max',
                                                               '1%', '5%', '25%',
                                                               '50%', '75%', '95%',
                                                               '99%', 'IQR', 'Mild outliers',
                                                               'Mild outliers (%)',
                                                               'Extreme outliers',
                                                               'Extreme outliers (%)']]

        confmatrix_file = join(filedir, '{}_confMatrix.{}'.format(experiment_id, file_format))
        if exists(confmatrix_file):
            conf_matrix = DataReader.read_from_file(confmatrix_file, index_col=0)
            files['df_confmatrix'] = self.process_confusion_matrix(conf_matrix)

        score_dist_file = join(filedir, '{}_score_dist.{}'.format(experiment_id, file_format))
        if exists(score_dist_file):
            df_score_dist = DataReader.read_from_file(score_dist_file, index_col=1)
            df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'}, inplace=True)
            files['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

        # read in the feature boxplots by subgroup, if we were asked to
        for group in groups_eval:
            feature_boxplot_prefix = join(figdir,
                                          '{}_feature_boxplot_by_{}'.format(experiment_id, group))
            svg_file = feature_boxplot_prefix + '.svg'
            png_file = feature_boxplot_prefix + '.png'
            if exists(svg_file):
                figs['feature_boxplots_by_{}_svg'.format(group)] = svg_file

            elif exists(png_file):
                figs['feature_boxplots_by_{}_png'.format(group)] = png_file

        # read in the betas image if it exists
        betas_svg = join(figdir, '{}_betas.svg'.format(experiment_id))
        if exists(betas_svg):
            figs['betas'] = betas_svg

        # read in the evaluation barplots by subgroup, if we were asked to
        for group in groups_eval:
            eval_barplot_svg_file = join(figdir, '{}_eval_by_{}.svg'.format(experiment_id, group))
            if exists(eval_barplot_svg_file):
                figs['eval_barplot_by_{}'.format(group)] = eval_barplot_svg_file

        pca_svg_file = join(figdir, '{}_pca.svg'.format(experiment_id))
        if exists(pca_svg_file):
            figs['pca_scree_plot'] = pca_svg_file

        return (files, figs, file_format)
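
Both versions return ``files`` as a ``defaultdict(pd.DataFrame)``, which is what lets downstream report code index outputs that were never produced without raising a KeyError. A small self-contained illustration:

from collections import defaultdict

import pandas as pd

files = defaultdict(pd.DataFrame)
df = files['df_consistency']   # key never set: a fresh empty DataFrame is created
print(df.empty)                # True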
Example no. 5
0
# assumes nose-style tests (matching the eq_ helper used in the next example)
# and that get_output_directory_extension raises a ValueError when the
# directory does not contain exactly one tabular file format
@raises(ValueError)
def test_get_output_directory_extension_error():
    directory = join(test_dir, 'data', 'files')
    get_output_directory_extension(directory, 'id_1')
Example no. 6
0
def test_get_output_directory_extension():
    directory = join(test_dir, 'data', 'experiments', 'lr', 'output')
    result = get_output_directory_extension(directory, 'id_1')
    eq_(result, 'csv')
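
For context, a minimal sketch of what ``get_output_directory_extension`` is expected to do, inferred from the two tests above; the real rsmtool implementation may differ in details such as the set of recognized extensions and the exact error message:

from os import listdir
from os.path import splitext


def get_output_directory_extension(filedir, experiment_id):
    # collect the tabular extensions used by this experiment's output files
    possible_extensions = {'csv', 'tsv', 'xlsx'}
    extensions = {splitext(name)[1].lstrip('.') for name in listdir(filedir)
                  if name.startswith(experiment_id) and
                  splitext(name)[1].lstrip('.') in possible_extensions}
    if len(extensions) != 1:
        raise ValueError('Expected exactly one tabular file format in {}, '
                         'found: {}'.format(filedir, sorted(extensions)))
    return extensions.pop()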