def get_container(self, name_ext_tuples, converters=None):
        """
        Get a DataContainer object from a list of tuples with (`name`, `ext`)
        """
        names_ = []
        paths_ = []
        for name, ext in name_ext_tuples:
            if name == 'train':
                df = self.df_train
            elif name == 'test':
                df = self.df_test
            elif name == 'feature_specs':
                df = self.df_specs
            else:
                df = self.df_other

            path = TestDataReader.make_file_from_ext(df, ext)

            names_.append(name)
            paths_.append(path)

        reader = DataReader(paths_, names_, converters)
        container = reader.read()

        self.filepaths.extend(paths_)
        return container
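A minimal usage sketch for the helper above, assuming it is called from a test method of the same class and that the returned DataContainer can be indexed by frame name; the extensions and the assertion are illustrative:

    def check_container_round_trip(self):
        # hypothetical test method: write three frames out in different
        # formats, read them back through DataReader, and check that the
        # training frame survives the round trip
        container = self.get_container([('train', 'csv'),
                                        ('test', 'tsv'),
                                        ('feature_specs', 'xlsx')])
        assert_frame_equal(self.df_train, container['train'])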
def check_scaled_coefficients(source, experiment_id, file_format='csv'):
    """
    Check that the predictions generated using scaled
    coefficients match the scaled scores. Raises an
    AssertionError if they do not.

    Parameters
    ----------
    source : str
        Path to the source directory on disk.
    experiment_id : str
        The experiment ID.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """
    preprocessed_test_file = join('test_outputs',
                                  source,
                                  'output',
                                  '{}_test_preprocessed_features.{}'.format(experiment_id,
                                                                            file_format))
    scaled_coefficients_file = join('test_outputs',
                                    source,
                                    'output',
                                    '{}_coefficients_scaled.{}'.format(experiment_id,
                                                                       file_format))
    predictions_file = join('test_outputs',
                            source,
                            'output',
                            '{}_pred_processed.{}'.format(experiment_id,
                                                          file_format))

    postprocessing_params_file = join('test_outputs',
                                      source,
                                      'output',
                                      '{}_postprocessing_params.{}'.format(experiment_id,
                                                                           file_format))

    postproc_params = DataReader.read_from_file(postprocessing_params_file).loc[0]
    df_preprocessed_test_data = DataReader.read_from_file(preprocessed_test_file)
    df_old_predictions = DataReader.read_from_file(predictions_file)
    df_old_predictions = df_old_predictions[['spkitemid', 'sc1', 'scale']]

    # create fake skll objects with new coefficients
    df_coef = DataReader.read_from_file(scaled_coefficients_file)
    learner = Modeler.create_fake_skll_learner(df_coef)
    modeler = Modeler.load_from_learner(learner)

    # generate new predictions and rename the prediction column to 'scale'
    df_new_predictions = modeler.predict(df_preprocessed_test_data,
                                         postproc_params['trim_min'],
                                         postproc_params['trim_max'])
    df_new_predictions.rename(columns={'raw': 'scale'}, inplace=True)

    # check that new predictions match the scaled old predictions
    assert_frame_equal(df_new_predictions.sort_index(axis=1),
                       df_old_predictions.sort_index(axis=1),
                       check_exact=False,
                       check_less_precise=True)
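A hedged example of how this check might be invoked from a test; the source directory and experiment ID are hypothetical and would normally come from the test's parameterization:

# verify that predictions rebuilt from the scaled coefficients of a
# hypothetical 'lr' experiment match its stored scaled scores
check_scaled_coefficients('lr-experiment', 'lr')

# the same check works for an experiment that wrote xlsx output
check_scaled_coefficients('lr-experiment-xlsx', 'lr-xlsx', file_format='xlsx')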
def check_subgroup_outputs(output_dir, experiment_id, subgroups, file_format='csv'):
    """
    Check that the subgroup output files contain the expected grouping
    columns and that the per-group composition counts are consistent with
    the overall data composition. Raise an AssertionError if they do not.

    Parameters
    ----------
    output_dir : str
        Path to the `output` experiment output directory for a test.
    experiment_id : str
        The experiment ID.
    subgroups : list of str
        List of column names that contain grouping
        information.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """
    train_preprocessed_file = join(output_dir,
                                   '{}_train_metadata.{}'.format(experiment_id,
                                                                 file_format))
    train_preprocessed = DataReader.read_from_file(train_preprocessed_file, index_col=0)

    test_preprocessed_file = join(output_dir,
                                  '{}_test_metadata.{}'.format(experiment_id,
                                                               file_format))
    test_preprocessed = DataReader.read_from_file(test_preprocessed_file,
                                                  index_col=0)
    for group in subgroups:
        ok_(group in train_preprocessed.columns)
        ok_(group in test_preprocessed.columns)

    # check that the per-category Ns sum to the total N in the overall
    # data composition and that the number of categories matches the
    # count reported in the overall data composition
    file_data_composition_all = join(output_dir,
                                     '{}_data_composition.{}'.format(experiment_id,
                                                                     file_format))
    df_data_composition_all = DataReader.read_from_file(file_data_composition_all)
    for group in subgroups:
        file_composition_by_group = join(output_dir,
                                         '{}_data_composition_by_{}.{}'.format(experiment_id,
                                                                               group,
                                                                               file_format))
        composition_by_group = DataReader.read_from_file(file_composition_by_group)
        for partition in ['Training', 'Evaluation']:
            partition_info = df_data_composition_all.loc[df_data_composition_all['partition'] ==
                                                         partition]

            partition_column = '{} set'.format(partition)

            summation = sum(composition_by_group[partition_column])
            ok_(summation == partition_info.iloc[0]['responses'])

            length = len(composition_by_group.loc[composition_by_group[partition_column] != 0])
            ok_(length == partition_info.iloc[0][group])
    def test_locate_files_list(self):

        paths = ['file1.csv', 'file2.xlsx']
        config_dir = 'output'
        result = DataReader.locate_files(paths, config_dir)
        assert isinstance(result, list)
        eq_(result, [None, None])
Example #7
    def locate_custom_sections(custom_report_section_paths, config_dir):
        """
        Get the absolute paths for custom report sections and check that
        the files exist. If a file does not exist, raise an exception.

        Parameters
        ----------
        custom_report_section_paths : list of str
            List of paths to IPython notebook
            files representing the custom sections.
        config_dir : str
            Path to the directory containing the experiment configuration file.

        Returns
        -------
        custom_report_sections : list of str
            List of absolute paths to the custom section
            notebooks.

        Raises
        ------
        FileNotFoundError
            If any of the files cannot be found.
        """

        custom_report_sections = []
        for cs_path in custom_report_section_paths:
            cs_location = DataReader.locate_files(cs_path, config_dir)
            if not cs_location:
                raise FileNotFoundError("Error: custom section not found at "
                                        "{}.".format(cs_path))
            else:
                custom_report_sections.append(cs_location)
        return custom_report_sections
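A short usage sketch, shown as a plain call for illustration; the notebook paths and configuration directory are hypothetical, and a missing notebook raises FileNotFoundError:

# resolve two hypothetical custom-section notebooks relative to the
# directory that holds the experiment configuration file
custom_sections = locate_custom_sections(['notebooks/slopes.ipynb',
                                          'notebooks/residuals.ipynb'],
                                         '/path/to/config_dir')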
Example #9
def check_experiment_dir(experiment_dir,
                         experiment_name,
                         configpath):
    """
    Check that the supplied experiment directory exists and contains
    the output of an rsmtool experiment.

    Parameters
    ----------
    experiment_dir : str
        Supplied path to the experiment_dir.
    experiment_name : str
        Custom name for the experiment, if any. If specified, the
        directory must contain the output of only a single experiment.
    configpath : str
        Path to the directory containing the configuration file.

    Returns
    -------
    jsons : list of tuples
        A list of (json_path, experiment_name) tuples, one for each
        configuration JSON file contained in the output directory.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or does not contain the output
        of an rsmtool experiment.
    ValueError
        If a custom experiment name was specified but the directory
        contains the output of multiple experiments.
    """
    full_path_experiment_dir = DataReader.locate_files(experiment_dir, configpath)
    if not full_path_experiment_dir:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(experiment_dir))
    else:
        # check that there is an output directory
        csvdir = normpath(join(full_path_experiment_dir, 'output'))
        if not exists(csvdir):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(full_path_experiment_dir))

        # find the json configuration files for all experiments stored in this directory
        jsons = glob.glob(join(csvdir, '*.json'))
        if len(jsons) == 0:
            raise FileNotFoundError("The directory {} does not contain "
                                    "the .json configuration files for rsmtool "
                                    "experiments.".format(full_path_experiment_dir))

        # Raise an error if the user specified a list of experiment names
        # but we found several .jsons in the same directory
        if experiment_name and len(jsons) > 1:
            raise ValueError("{} seems to contain the output of multiple experiments. "
                             "In order to use custom experiment names, you must have "
                             "a separate directory "
                             "for each experiment".format(full_path_experiment_dir))

        # return [(json, experiment_name)] when we have experiment name or
        # [(json, None)] if no experiment name has been specified.
        # If the folder contains the output of multiple experiments, return
        # [(json1, None), (json2, None) .... ]
        return list(zip(jsons, [experiment_name] * len(jsons)))
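A brief sketch of the two ways this helper might be used; the directory paths and the experiment name are hypothetical:

# several experiments in one directory: each JSON is paired with None
jsons = check_experiment_dir('experiments/batch1', None, '/path/to/config_dir')
# -> [('.../output/exp1.json', None), ('.../output/exp2.json', None), ...]

# a single experiment with a custom name: its one JSON is paired with that name
jsons = check_experiment_dir('experiments/single', 'my-experiment',
                             '/path/to/config_dir')
# -> [('.../output/single.json', 'my-experiment')]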
Example #10
    def test_locate_files_works(self):

        config_dir = 'temp_output'
        os.makedirs(config_dir, exist_ok=True)

        paths = 'file1.csv'
        full_path = os.path.abspath(os.path.join(config_dir, paths))
        open(full_path, 'a').close()

        result = DataReader.locate_files(paths, config_dir)
        rmtree(config_dir)
        eq_(result, full_path)
Example #12
    def check_read_from_file(self, extension):
        """Test whether ``read_from_file()`` works as expected."""
        name = TestDataReader.make_file_from_ext(self.df_train, extension)

        # now read in the file using `read_data_file()`
        df_read = DataReader.read_from_file(name,
                                            converters={'id': str, 'candidate': str})

        # Make sure we get rid of the file at the end,
        # at least if we get to this point (i.e. no errors raised)
        self.filepaths.append(name)

        assert_frame_equal(self.df_train, df_read)
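For context, a self-contained sketch of why the ``converters`` argument matters in this check: an id column whose values look numeric (for example zero-padded ids) would otherwise be read back as integers and the frame comparison would fail. The frame and file name below are illustrative, and ``read_from_file()`` is assumed to forward ``converters`` to the underlying pandas reader.

import pandas as pd

# hypothetical frame with string ids that look numeric
df = pd.DataFrame({'id': ['00123', '00456'], 'feature1': [1.5, 2.5]})
df.to_csv('example.csv', index=False)

# without converters, pandas would parse the ids as integers (123, 456);
# forcing them to str preserves the zero padding and the original dtype
df_read = pd.read_csv('example.csv', converters={'id': str})
assert df_read['id'].tolist() == ['00123', '00456']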
Example #14
def check_experiment_dir(experiment_dir, configpath):
    """
    Check that the supplied experiment directory exists and contains
    the output of an rsmtool experiment.

    Parameters
    ----------
    experiment_dir : str
        Supplied path to the experiment_dir.
    configpath : str
        Path to the directory containing the configuration file.

    Returns
    -------
    jsons : list
        A list of paths to all configuration JSON files contained in the
        output directory.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or does not contain the output
        of an rsmtool experiment.
    """
    full_path_experiment_dir = DataReader.locate_files(experiment_dir,
                                                       configpath)
    if not full_path_experiment_dir:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(experiment_dir))
    else:
        # check that there is an output directory
        csvdir = normpath(join(full_path_experiment_dir, 'output'))
        if not exists(csvdir):
            raise FileNotFoundError(
                "The directory {} does not contain "
                "the output of an rsmtool "
                "experiment.".format(full_path_experiment_dir))

        # find the json configuration files for all experiments stored in this directory
        jsons = glob.glob(join(csvdir, '*.json'))
        if len(jsons) == 0:
            raise FileNotFoundError(
                "The directory {} does not contain "
                "the .json configuration files for rsmtool "
                "experiments.".format(full_path_experiment_dir))

        return jsons
Example #16
def check_file_output(file1, file2, file_format='csv'):
    """
    Check if two experiment files have values that are
    the same to within three decimal places. Raises an
    AssertionError if they are not.

    Parameters
    ----------
    file1 : str
        Path to the first file.
    file2 : str
        Path to the second file.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """

    # make sure that the main id columns are read as strings since
    # this may affect merging in custom notebooks
    string_columns = ['spkitemid', 'candidate']

    converter_dict = {column: str for column in string_columns}

    df1 = DataReader.read_from_file(file1, converters=converter_dict)
    df2 = DataReader.read_from_file(file2, converters=converter_dict)

    # if the first column is numeric, just force the index to string;
    # however, if it is non-numeric, set it as the index and then
    # force it to string. We do this to ensure string indices are
    # preserved as such
    for df in [df1, df2]:
        if np.issubdtype(df[df.columns[0]].dtype, np.number):
            df.index = df.index.map(str)
        else:
            df.index = df[df.columns[0]]
            df.index = df.index.map(str)

    # sort all the indices alphabetically
    df1.sort_index(inplace=True)
    df2.sort_index(inplace=True)

    # convert any integer columns to floats in either data frame
    for df in [df1, df2]:
        for c in df.columns:
            if df[c].dtype == np.int64:
                df[c] = df[c].astype(np.float64)

    # do the same for indices
    for df in [df1, df2]:
        if df.index.dtype == np.int64:
            df.index = df.index.astype(np.float64)

    # for pca and factor correlations convert all values to absolutes
    # because the sign may not always be the same
    if (file1.endswith('pca.{}'.format(file_format))
            or file1.endswith('factor_correlations.{}'.format(file_format))):
        for df in [df1, df2]:
            msk = df.dtypes == np.float64
            df.loc[:, msk] = df.loc[:, msk].abs()

    try:
        assert_frame_equal(df1.sort_index(axis=1),
                           df2.sort_index(axis=1),
                           check_exact=False,
                           check_less_precise=True)
    except AssertionError as e:
        message = e.args[0]
        new_message = 'File {} - {}'.format(basename(file1), message)
        e.args = (new_message, )
        raise
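A hedged usage sketch comparing a freshly generated output file against a stored expected file; both paths are hypothetical:

# compare a newly written evaluation table against the expected one,
# tolerating small numeric differences
check_file_output('test_outputs/lr/output/lr_eval.csv',
                  'test_expected_output/lr/output/lr_eval.csv')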
Example #17
    def test_setup_none_in_path(self):
        paths = ['path1.csv', None, 'path2.csv']
        framenames = ['train', 'test', 'features']
        DataReader(paths, framenames)
    def load_rsmtool_output(self, filedir, figdir, experiment_id, prefix, groups_eval):
        """
        Load all of the outputs of an rsmtool experiment.
        For each type of output, we first check whether the file exists
        so that experiments with different sets of outputs can still be compared.

        Parameters
        ----------
        filedir : str
            Path to the directory containing output files.
        figdir : str
            Path to the directory containing output figures.
        experiment_id : str
            Original ``experiment_id`` used to generate the output files.
        prefix : str
            Must be set to ``scale`` or ``raw``. Indicates whether the score
            is scaled or not.
        groups_eval : list
            List of subgroup names used for subgroup evaluation.

        Returns
        -------
        files : dict
            A dictionary with outputs converted to pandas data
            frames. If a particular type of output did not exist for the
            experiment, its value will be an empty data frame.
        figs : dict
            A dictionary with paths to the experiment figures.
        file_format : str
            The file format (extension) of the experiment output files.
        """

        file_format = get_output_directory_extension(filedir, experiment_id)

        files = defaultdict(pd.DataFrame)
        figs = {}

        # feature distributions and the inter-feature correlations
        feature_train_file = join(filedir, '{}_train_features.{}'.format(experiment_id,
                                                                         file_format))
        if exists(feature_train_file):
            files['df_train_features'] = DataReader.read_from_file(feature_train_file)

        feature_distplots_file = join(figdir, '{}_distrib.svg'.format(experiment_id))
        if exists(feature_distplots_file):
            figs['feature_distplots'] = feature_distplots_file
            # with open(feature_distplots_file, 'rb') as f:
            #     figs['feature_distplots'] = base64.b64encode(f.read()).decode('utf-8')

        feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                        file_format))
        if exists(feature_cors_file):
            files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file, index_col=0)

        # df_scores
        scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                                  file_format))
        if exists(scores_file):
            df_scores = DataReader.read_from_file(scores_file, converters={'spkitemid': str})
            files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

        # model coefficients if present
        betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                        file_format))
        if exists(betas_file):
            files['df_coef'] = DataReader.read_from_file(betas_file, index_col=0)
            files['df_coef'].index.name = None

        # read in the model fit files if present
        model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                                file_format))
        if exists(model_fit_file):
            files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

        # human human agreement
        consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                    file_format))

        # load if consistency file is present
        if exists(consistency_file):
            df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
            files['df_consistency'] = df_consistency

        # degradation
        degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                    file_format))

        # load if degradation file is present
        if exists(degradation_file):
            df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
            files['df_degradation'] = df_degradation

        # disattenuated correlations
        dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                                file_format))

        # load if disattenuated correlations is present
        if exists(dis_corr_file):
            df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
            files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

        # read in disattenuated correlations by group
        for group in groups_eval:
            group_dis_corr_file = join(filedir,
                                       '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                       group,
                                                                                       file_format))
            if exists(group_dis_corr_file):
                df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
                files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
                files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

        # use the raw columns or the scale columns depending on the prefix
        existing_eval_cols = (_df_eval_columns_existing_raw if prefix == 'raw'
                              else _df_eval_columns_existing_scale)
        rename_dict = raw_rename_dict if prefix == 'raw' else scale_rename_dict

        # read in the short version of the evaluation metrics for all data
        short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                              "Pearson(b)", "QWK(br)", "R2(b)", "RMSE(b)"]
        eval_file_short = join(filedir, '{}_eval_short.{}'.format(experiment_id, file_format))

        if exists(eval_file_short):
            df_eval = DataReader.read_from_file(eval_file_short, index_col=0)
            df_eval = df_eval[existing_eval_cols]
            df_eval = df_eval.rename(columns=rename_dict)
            files['df_eval'] = df_eval[short_metrics_list]
            files['df_eval'].index.name = None

        eval_file = join(filedir, '{}_eval.{}'.format(experiment_id, file_format))
        if exists(eval_file):
            files['df_eval_for_degradation'] = DataReader.read_from_file(eval_file, index_col=0)

        # read in the evaluation metrics by subgroup, if we are asked to
        for group in groups_eval:
            group_eval_file = join(filedir, '{}_eval_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_eval_file):
                df_eval = DataReader.read_from_file(group_eval_file, index_col=0)
                df_eval = df_eval[existing_eval_cols]
                df_eval = df_eval.rename(columns=rename_dict)
                files['df_eval_by_{}'.format(group)] = df_eval[short_metrics_list]
                files['df_eval_by_{}'.format(group)].index.name = None

                series = files['df_eval_by_{}'.format(group)]
                files['df_eval_by_{}_overview'.format(group)] = self.make_summary_stat_df(series)

                # set the ordering of mean/SD/SMD statistics
                files['df_eval_by_{}_m_sd'.format(group)] = df_eval[['N', 'H1 mean',
                                                                     'H1 SD', 'score mean(br)',
                                                                     'score SD(br)',
                                                                     'score mean(b)',
                                                                     'score SD(b)',
                                                                     'SMD(br)', 'SMD(b)']]
                files['df_eval_by_{}_m_sd'.format(group)].index.name = None

        # read in the partial correlations vs. score for all data
        pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                           file_format))
        if exists(pcor_score_file):
            files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
            files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

        # read in the partial correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                            group,
                                                                            file_format))
            if exists(group_pcor_file):
                df_pcor_group = DataReader.read_from_file(group_pcor_file, index_col=0)
                files['df_pcor_sc1_by_{}'.format(group)] = df_pcor_group

                series = files['df_pcor_sc1_by_{}'.format(group)]
                files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        # read in the marginal correlations vs. score for all data
        mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                              file_format))
        if exists(mcor_score_file):
            files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
            files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

        # read in the partial correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_mcor_file = join(filedir,
                                   '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_mcor_file):
                df_mcor_group = DataReader.read_from_file(group_mcor_file, index_col=0)
                files['df_mcor_sc1_by_{}'.format(group)] = df_mcor_group

                series = files['df_mcor_sc1_by_{}'.format(group)]
                files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
        if exists(pca_file):
            files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
            files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                                '{}_pcavar.{}'.format(experiment_id,
                                                                                      file_format)),
                                                           index_col=0)

        descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                              file_format))
        if exists(descriptives_file):
            # we read all files pertaining to the descriptive analysis together
            # since we merge the outputs
            files['df_descriptives'] = DataReader.read_from_file(descriptives_file, index_col=0)

            # keep only the N, min, and max values per feature; these are
            # merged into two other tables below to report the number of features
            df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

            files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                                 'skewness', 'kurtosis']]

            outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                          file_format))
            df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
            df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                      'lower': 'Lower',
                                                      'both': 'Both',
                                                      'upperperc': 'Upper %',
                                                      'lowerperc': 'Lower %',
                                                      'bothperc': 'Both %'})
            df_outliers_columns = df_outliers.columns.tolist()
            files['df_outliers'] = df_outliers

            # join with df_features_n_values to get the value of N
            files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                            left_index=True,
                                            right_index=True)[['N'] + df_outliers_columns]

            # read the extended descriptives (percentiles) and, below, join
            # them with df_features_n_values to get the value of N
            percentiles_file = join(filedir,
                                    '{}_feature_descriptivesExtra.{}'.format(experiment_id,
                                                                             file_format))

            files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                                index_col=0)
            files['df_percentiles'] = pd.merge(files['df_percentiles'],
                                               df_features_n_values,
                                               left_index=True,
                                               right_index=True)

            mild_outliers = (files['df_percentiles']["Mild outliers"] /
                             files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Mild outliers (%)"] = mild_outliers

            extreme_outliers = (files['df_percentiles']["Extreme outliers"] /
                                files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Extreme outliers (%)"] = extreme_outliers

            files['df_percentiles'] = files['df_percentiles'][['N', 'min', 'max',
                                                               '1%', '5%', '25%',
                                                               '50%', '75%', '95%',
                                                               '99%', 'IQR', 'Mild outliers',
                                                               'Mild outliers (%)',
                                                               'Extreme outliers',
                                                               'Extreme outliers (%)']]

        confmatrix_file = join(filedir, '{}_confMatrix.{}'.format(experiment_id, file_format))
        if exists(confmatrix_file):
            conf_matrix = DataReader.read_from_file(confmatrix_file, index_col=0)
            files['df_confmatrix'] = self.process_confusion_matrix(conf_matrix)

        score_dist_file = join(filedir, '{}_score_dist.{}'.format(experiment_id, file_format))
        if exists(score_dist_file):
            df_score_dist = DataReader.read_from_file(score_dist_file, index_col=1)
            df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'}, inplace=True)
            files['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

        # read in the feature boxplots by subgroup, if we were asked to
        for group in groups_eval:
            feature_boxplot_prefix = join(figdir,
                                          '{}_feature_boxplot_by_{}'.format(experiment_id, group))
            svg_file = feature_boxplot_prefix + '.svg'
            png_file = feature_boxplot_prefix + '.png'
            if exists(svg_file):
                figs['feature_boxplots_by_{}_svg'.format(group)] = svg_file

            elif exists(png_file):
                figs['feature_boxplots_by_{}_png'.format(group)] = png_file

        # read in the betas image if exists
        betas_svg = join(figdir, '{}_betas.svg'.format(experiment_id))
        if exists(betas_svg):
            figs['betas'] = betas_svg

        # read in the evaluation barplots by subgroup, if we were asked to
        for group in groups_eval:
            eval_barplot_svg_file = join(figdir, '{}_eval_by_{}.svg'.format(experiment_id, group))
            if exists(eval_barplot_svg_file):
                figs['eval_barplot_by_{}'.format(group)] = eval_barplot_svg_file

        pca_svg_file = join(figdir, '{}_pca.svg'.format(experiment_id))
        if exists(pca_svg_file):
            figs['pca_scree_plot'] = pca_svg_file

        return (files, figs, file_format)
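A short sketch of how the loader above might be called; the directories, experiment ID, and subgroup list are hypothetical, and ``comparer`` stands in for an instance of the surrounding class:

# load all stored outputs of a hypothetical 'lr' experiment for the
# scaled scores, with subgroup evaluations by 'L1'
files, figs, file_format = comparer.load_rsmtool_output('experiment/output',
                                                        'experiment/figure',
                                                        'lr',
                                                        'scale',
                                                        ['L1'])

# outputs that were not generated come back as empty data frames, so
# callers can test, e.g., files['df_eval'].empty before using a table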
def run_experiment(config_file_or_obj,
                   output_dir):
    """
    Run an RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output', 'figure', and 'report' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx] for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)
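A minimal sketch of invoking the driver above, either with a path to a configuration file or with an in-memory ``Configuration`` object; the file names, output directory, and configuration fields are illustrative:

# run from a JSON configuration file on disk
run_experiment('lr.json', 'lr_output')

# or build a Configuration object in memory and pass it directly
config = Configuration({'experiment_id': 'lr',
                        'model': 'LinearRegression',
                        'train_file': 'train.csv',
                        'test_file': 'test.csv'})
run_experiment(config, 'lr_output')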
Example #20
    def test_locate_files_wrong_type(self):

        paths = {'file1.csv', 'file2.xlsx'}
        config_dir = 'output'
        DataReader.locate_files(paths, config_dir)
Example #22
    def test_locate_files_str(self):

        paths = 'file1.csv'
        config_dir = 'output'
        result = DataReader.locate_files(paths, config_dir)
        eq_(result, None)

    # Write out files
    writer.write_experiment_output(
        csvdir,
        pred_data_container,
        new_names_dict={'pred_test': 'pred_processed'},
        file_format=file_format)

    original_coef_file = join(
        csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                            file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
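
# A minimal usage sketch (not part of the original snippet): assuming the
# function defined above is rsmtool's `run_experiment`, it could be invoked
# with a hypothetical configuration file and output directory like this.
if __name__ == '__main__':
    run_experiment('example_rsmtool_config.json',
                   'example_experiment_output')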
Example #24
def run_comparison(config_file_or_obj, output_dir):
    """
    Run an ``rsmcompare`` experiment using the given configuration
    file and generate the report in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmcompare')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the information about the "old" experiment
    experiment_id_old = configuration['experiment_id_old']
    experiment_dir_old = DataReader.locate_files(
        configuration['experiment_dir_old'], configpath)
    if not experiment_dir_old:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(
                                    configuration['experiment_dir_old']))
    else:
        csvdir_old = normpath(join(experiment_dir_old, 'output'))
        figdir_old = normpath(join(experiment_dir_old, 'figure'))
        if not exists(csvdir_old) or not exists(figdir_old):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_old))

    check_experiment_id(experiment_dir_old, experiment_id_old)

    # get the information about the "new" experiment
    experiment_id_new = configuration['experiment_id_new']
    experiment_dir_new = DataReader.locate_files(
        configuration['experiment_dir_new'], configpath)
    if not experiment_dir_new:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(
                                    configuration['experiment_dir_new']))
    else:
        csvdir_new = normpath(join(experiment_dir_new, 'output'))
        figdir_new = normpath(join(experiment_dir_new, 'figure'))
        if not exists(csvdir_new) or not exists(figdir_new):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_new))

    check_experiment_id(experiment_dir_new, experiment_id_new)

    # are there specific general report sections we want to include?
    general_report_sections = configuration['general_sections']

    # what about the special or custom sections?
    special_report_sections = configuration['special_sections']

    custom_report_section_paths = configuration['custom_sections']

    # if custom report sections exist, locate sections; otherwise, create empty list
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(
            custom_report_section_paths, configpath)
    else:
        custom_report_sections = []

    # get the section order
    section_order = configuration['section_order']

    # get the subgroups if any
    subgroups = configuration.get('subgroups')

    # Initialize reporter
    reporter = Reporter()

    chosen_notebook_files = reporter.get_ordered_notebook_files(
        general_report_sections,
        special_report_sections,
        custom_report_sections,
        section_order,
        subgroups,
        model_type=None,
        context='rsmcompare')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation.')
    reporter.create_comparison_report(configuration, csvdir_old, figdir_old,
                                      csvdir_new, figdir_new, output_dir)
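
# A minimal usage sketch (not part of the original snippet); the configuration
# file and output directory below are hypothetical.
if __name__ == '__main__':
    run_comparison('example_rsmcompare_config.json',
                   'example_comparison_output')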
Example #25
def run_evaluation(config_file_or_obj, output_dir):
    """
    Run an ``rsmeval`` experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output', 'figure', and 'report' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmeval')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Make sure prediction file can be located
    if not DataReader.locate_files(configuration['predictions_file'],
                                   configpath):
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(
                                    configuration['predictions_file']))

    scale_with = configuration.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
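    #
    # Illustrative configuration values (hypothetical, not taken from a real
    # configuration file):
    #   "scale_with": null          -> case (a): metrics computed on raw predictions
    #   "scale_with": "asis"        -> case (b): predictions treated as already scaled
    #   "scale_with": "scale.csv"   -> case (c): predictions rescaled using 'scale.csv'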

    # Check whether we want to do scaling
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # The paths to files and names for data container properties
    paths = ['predictions_file']
    names = ['predictions']

    # If we want to do scaling, get the scale file
    if do_scaling:

        # Make sure scale file can be located
        scale_file_location = DataReader.locate_files(scale_with, configpath)
        if not scale_file_location:
            raise FileNotFoundError('Could not find scaling file {}.'
                                    ''.format(scale_file_location))

        paths.append('scale_with')
        names.append('scale')

    # Get the paths, names, and converters for the DataReader
    (file_names, file_paths) = configuration.get_names_and_paths(paths, names)

    file_paths = DataReader.locate_files(file_paths, configpath)

    converters = {'predictions': configuration.get_default_converter()}

    logger.info('Reading predictions: {}.'.format(
        configuration['predictions_file']))

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing predictions.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmeval')

    logger.info('Saving pre-processed predictions and metadata to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   new_names_dict={
                                       'pred_test': 'pred_processed',
                                       'test_excluded':
                                       'test_excluded_responses'
                                   },
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    # do the data composition stats
    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmeval(
         processed_container, processed_config)
    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    for_pred_data_container = analyzed_container + processed_container

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         for_pred_data_container, analyzed_config)

    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir, context='rsmeval')
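
# A minimal usage sketch (not part of the original snippet); the configuration
# file and output directory below are hypothetical.
if __name__ == '__main__':
    run_evaluation('example_rsmeval_config.json',
                   'example_evaluation_output')
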
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with the given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Path to the output file for saving the predictions.
    feats_file : str, optional
        Path to the output file for saving pre-processed feature values.
        Defaults to None.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                   context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'
                                ''.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(experiment_output_dir,
                           '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features',
                  'feature_info',
                  'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
        feats_filename, _ = splitext(feats_filename)

        # Write out files
        writer.write_experiment_output(feats_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['features_processed'],
                                       new_names_dict={'features_processed':
                                                       feats_filename},
                                       file_format=file_format)

    if (output_file.lower().endswith('.csv') or
            output_file.lower().endswith('.xlsx')):

        output_dir = dirname(output_file)
        _, filename = split(output_file)
        filename, _ = splitext(filename)

    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(output_dir,
                                   processed_container,
                                   include_experiment_id=False,
                                   dataframe_names=['predictions_with_metadata'],
                                   new_names_dict={'predictions_with_metadata':
                                                   filename},
                                   file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        # save the excluded responses to disk
        logger.info('Saving excluded responses to {}'.format(join(output_dir,
                                                                  '{}_excluded_responses.csv'
                                                                  ''.format(filename))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={'excluded':
                                                       '{}_excluded_responses'
                                                       ''.format(filename)},
                                       file_format=file_format)
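
# A minimal usage sketch (not part of the original snippet); the file names
# below are hypothetical.
if __name__ == '__main__':
    compute_and_save_predictions('example_rsmpredict_config.json',
                                 'predictions.csv',
                                 feats_file='preprocessed_features.csv')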
Example #27
    def load_rsmtool_output(self, filedir, figdir, experiment_id, prefix, groups_eval):
        """
        Load all of the outputs of an rsmtool experiment.
        For each type of output, we first check whether the file exists
        to allow comparing experiments with different sets of outputs.

        Parameters
        ----------
        filedir : str
            Path to the directory containing output files.
        figdir : str
            Path to the directory containing output figures.
        experiment_id : str
            Original ``experiment_id`` used to generate the output files.
        prefix : str
            Must be set to ``scale`` or ``raw``. Indicates whether the scores
            are scaled or not.
        groups_eval : list of str
            List of subgroup names used for subgroup evaluation.

        Returns
        -------
        files : dict
            A dictionary with outputs converted to pandas data
            frames. If a particular type of output did not exist for the
            experiment, its value will be an empty data frame.
        figs : dict
            A dictionary with experiment figures.
        file_format : str
            The format (extension) of the output files, e.g. ``csv``.
        """

        file_format = get_output_directory_extension(filedir, experiment_id)

        files = defaultdict(pd.DataFrame)
        figs = {}

        # feature distributions and the inter-feature correlations
        feature_train_file = join(filedir, '{}_train_features.{}'.format(experiment_id,
                                                                         file_format))
        if exists(feature_train_file):
            files['df_train_features'] = DataReader.read_from_file(feature_train_file)

        feature_distplots_file = join(figdir, '{}_distrib.svg'.format(experiment_id))
        if exists(feature_distplots_file):
            figs['feature_distplots'] = feature_distplots_file

        feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                        file_format))
        if exists(feature_cors_file):
            files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file, index_col=0)

        # df_scores
        scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                                  file_format))
        if exists(scores_file):
            df_scores = DataReader.read_from_file(scores_file, converters={'spkitemid': str})
            files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

        # model coefficients if present
        betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                        file_format))
        if exists(betas_file):
            files['df_coef'] = DataReader.read_from_file(betas_file, index_col=0)
            files['df_coef'].index.name = None

        # read in the model fit files if present
        model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                                file_format))
        if exists(model_fit_file):
            files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

        # human human agreement
        consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                    file_format))

        # load if consistency file is present
        if exists(consistency_file):
            df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
            files['df_consistency'] = df_consistency

        # degradation
        degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                    file_format))

        # load if degradation file is present
        if exists(degradation_file):
            df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
            files['df_degradation'] = df_degradation

        # disattenuated correlations
        dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                                file_format))

        # load if disattenuated correlations is present
        if exists(dis_corr_file):
            df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
            files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

        # read in disattenuated correlations by group
        for group in groups_eval:
            group_dis_corr_file = join(filedir,
                                       '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                       group,
                                                                                       file_format))
            if exists(group_dis_corr_file):
                df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
                files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
                files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

        # true score evaluations
        true_score_eval_file = join(filedir, "{}_true_score_eval.{}".format(experiment_id,
                                                                            file_format))

        # load true score evaluations if present
        if exists(true_score_eval_file):
            df_true_score_eval = DataReader.read_from_file(true_score_eval_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
            files['df_true_score_eval'] = df_true_score_eval.loc[['{}_trim'.format(prefix)]]

        # use the raw columns or the scale columns depending on the prefix
        existing_eval_cols = (_df_eval_columns_existing_raw if prefix == 'raw'
                              else _df_eval_columns_existing_scale)
        rename_dict = raw_rename_dict if prefix == 'raw' else scale_rename_dict

        # read in the short version of the evaluation metrics for all data
        short_metrics_list = ["N", "Adj. Agmt.(br)", "Agmt.(br)", "K(br)",
                              "Pearson(b)", "QWK(b)", "R2(b)", "RMSE(b)"]
        eval_file_short = join(filedir, '{}_eval_short.{}'.format(experiment_id, file_format))

        if exists(eval_file_short):
            df_eval = DataReader.read_from_file(eval_file_short, index_col=0)

            (rename_dict_new,
             existing_eval_cols_new,
             short_metrics_list_new,
             _) = self._modify_eval_columns_to_ensure_version_compatibilty(df_eval,
                                                                           rename_dict,
                                                                           existing_eval_cols,
                                                                           short_metrics_list)

            df_eval = df_eval[existing_eval_cols_new]
            df_eval = df_eval.rename(columns=rename_dict_new)
            files['df_eval'] = df_eval[short_metrics_list_new]
            files['df_eval'].index.name = None

        eval_file = join(filedir, '{}_eval.{}'.format(experiment_id, file_format))
        if exists(eval_file):
            files['df_eval_for_degradation'] = DataReader.read_from_file(eval_file, index_col=0)

        # read in the evaluation metrics by subgroup, if we are asked to
        for group in groups_eval:
            group_eval_file = join(filedir, '{}_eval_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_eval_file):
                df_eval = DataReader.read_from_file(group_eval_file, index_col=0)

                (rename_dict_new,
                 existing_eval_cols_new,
                 short_metrics_list_new,
                 smd_name
                 ) = self._modify_eval_columns_to_ensure_version_compatibilty(df_eval,
                                                                              rename_dict,
                                                                              existing_eval_cols,
                                                                              short_metrics_list,
                                                                              raise_warnings=False)

                # if `SMD` is being used, rather than `DSM`, we issue a
                # deprecation warning so that the user knows that these subgroup
                # evaluations come from an older version of RSMTool
                if smd_name == 'SMD':
                    warnings.warn("The subgroup evaluations in `{}` use 'SMD'. Please note "
                                  "that newer versions of RSMTool (7.0 or greater) use 'DSM' with subgroup "
                                  "evaluations. For additional details on how these metrics "
                                  "differ, see the RSMTool documentation. Comparisons with experiments "
                                  "using SMD for subgroup calculations will be deprecated in the next major "
                                  "release.".format(group_eval_file), category=DeprecationWarning)

                df_eval = df_eval[existing_eval_cols_new]
                df_eval = df_eval.rename(columns=rename_dict_new)
                files['df_eval_by_{}'.format(group)] = df_eval[short_metrics_list_new]
                files['df_eval_by_{}'.format(group)].index.name = None

                series = files['df_eval_by_{}'.format(group)]
                files['df_eval_by_{}_overview'.format(group)] = self.make_summary_stat_df(series)

                # set the ordering of mean/SD/SMD statistics
                files['df_eval_by_{}_m_sd'.format(group)] = df_eval[['N', 'H1 mean',
                                                                     'H1 SD', 'score mean(br)',
                                                                     'score SD(br)',
                                                                     'score mean(b)',
                                                                     'score SD(b)',
                                                                     '{}(br)'.format(smd_name),
                                                                     '{}(b)'.format(smd_name)]]
                files['df_eval_by_{}_m_sd'.format(group)].index.name = None

        # read in the partial correlations vs. score for all data
        pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                           file_format))
        if exists(pcor_score_file):
            files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
            files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

        # read in the partial correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                            group,
                                                                            file_format))
            if exists(group_pcor_file):
                files['df_pcor_sc1_by_{}'
                      ''.format(group)] = DataReader.read_from_file(group_pcor_file,
                                                                    index_col=0)

                series = files['df_pcor_sc1_by_{}'.format(group)]
                files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        # read in the marginal correlations vs. score for all data
        mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                              file_format))
        if exists(mcor_score_file):
            files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
            files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

        # read in the marginal correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_mcor_file = join(filedir,
                                   '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_mcor_file):
                files['df_mcor_sc1_by_{}'
                      ''.format(group)] = DataReader.read_from_file(group_mcor_file,
                                                                    index_col=0)

                series = files['df_mcor_sc1_by_{}'.format(group)]
                files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
        if exists(pca_file):
            files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
            files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                                '{}_pcavar.{}'.format(experiment_id,
                                                                                      file_format)),
                                                           index_col=0)

        descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                              file_format))
        if exists(descriptives_file):
            # we read all files pertaining to the descriptive analysis together
            # since we merge the outputs
            files['df_descriptives'] = DataReader.read_from_file(descriptives_file, index_col=0)

            # this df contains the number of responses (N) and the min/max
            # values for each feature; it is used later to add N to two other tables
            df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

            files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                                 'skewness', 'kurtosis']]

            outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                          file_format))
            df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
            df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                      'lower': 'Lower',
                                                      'both': 'Both',
                                                      'upperperc': 'Upper %',
                                                      'lowerperc': 'Lower %',
                                                      'bothperc': 'Both %'})
            df_outliers_columns = df_outliers.columns.tolist()
            files['df_outliers'] = df_outliers

            # join with df_features_n_values to get the value of N
            files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                            left_index=True,
                                            right_index=True)[['N'] + df_outliers_columns]

            # read in the percentiles and other extra descriptives and
            # join with df_features_n_values to get the value of N
            percentiles_file = join(filedir, '{}_feature_descriptives'
                                             'Extra.{}'.format(experiment_id,
                                                               file_format))

            files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                                index_col=0)
            files['df_percentiles'] = pd.merge(files['df_percentiles'],
                                               df_features_n_values,
                                               left_index=True,
                                               right_index=True)

            mild_outliers = (files['df_percentiles']["Mild outliers"] /
                             files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Mild outliers (%)"] = mild_outliers

            extreme_outliers = (files['df_percentiles']["Extreme outliers"] /
                                files['df_percentiles']["N"].astype(float) * 100)

            files['df_percentiles']["Extreme outliers (%)"] = extreme_outliers

            files['df_percentiles'] = files['df_percentiles'][['N', 'min', 'max',
                                                               '1%', '5%', '25%',
                                                               '50%', '75%', '95%',
                                                               '99%', 'IQR', 'Mild outliers',
                                                               'Mild outliers (%)',
                                                               'Extreme outliers',
                                                               'Extreme outliers (%)']]

        confmatrix_file = join(filedir, '{}_confMatrix.{}'.format(experiment_id, file_format))
        if exists(confmatrix_file):
            conf_matrix = DataReader.read_from_file(confmatrix_file, index_col=0)
            files['df_confmatrix'] = self.process_confusion_matrix(conf_matrix)

        score_dist_file = join(filedir, '{}_score_dist.{}'.format(experiment_id, file_format))
        if exists(score_dist_file):
            df_score_dist = DataReader.read_from_file(score_dist_file, index_col=1)
            df_score_dist.rename(columns={'sys_{}'.format(prefix): 'sys'}, inplace=True)
            files['df_score_dist'] = df_score_dist[['human', 'sys', 'difference']]

        # read in the feature boxplots by subgroup, if we were asked to
        for group in groups_eval:
            feature_boxplot_prefix = join(figdir,
                                          '{}_feature_boxplot_by_{}'.format(experiment_id, group))
            svg_file = join(feature_boxplot_prefix + '.svg')
            png_file = join(feature_boxplot_prefix + '.png')
            if exists(svg_file):
                figs['feature_boxplots_by_{}_svg'.format(group)] = svg_file

            elif exists(png_file):
                figs['feature_boxplots_by_{}_png'.format(group)] = png_file

        # read in the betas image if exists
        betas_svg = join(figdir, '{}_betas.svg'.format(experiment_id))
        if exists(betas_svg):
            figs['betas'] = betas_svg

        # read in the evaluation barplots by subgroup, if we were asked to
        for group in groups_eval:
            eval_barplot_svg_file = join(figdir, '{}_eval_by_{}.svg'.format(experiment_id, group))
            if exists(eval_barplot_svg_file):
                figs['eval_barplot_by_{}'.format(group)] = eval_barplot_svg_file

        pca_svg_file = join(figdir, '{}_pca.svg'.format(experiment_id))
        if exists(pca_svg_file):
            figs['pca_scree_plot'] = pca_svg_file

        return (files, figs, file_format)
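
    # A minimal usage sketch (not part of the original snippet), assuming this
    # method lives on rsmtool's Comparer class; the directories, experiment ID,
    # and subgroup list below are hypothetical:
    #
    #     comparer = Comparer()
    #     files, figs, file_format = comparer.load_rsmtool_output(
    #         'old_experiment/output', 'old_experiment/figure',
    #         'old_experiment_id', prefix='scale', groups_eval=['L1'])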
    def test_locate_files_str(self):

        paths = 'file1.csv'
        config_dir = 'output'
        result = DataReader.locate_files(paths, config_dir)
        eq_(result, None)
Example #29
def compute_and_save_predictions(config_file_or_obj,
                                 output_file,
                                 feats_file=None):
    """
    Run ``rsmpredict`` with the given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Path to the output file for saving the predictions.
    feats_file : str, optional
        Path to the output file for saving pre-processed feature values.
        Defaults to None.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(
        config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'],
                                             configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError(
                'The directory {} does not contain '
                'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError(
            'The directory {} does not contain any rsmtool models.'
            ''.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError(
            '{} does not contain a model for the experiment "{}". '
            'The following experiments are contained in this '
            'directory: {}'.format(experiment_output_dir, experiment_id,
                                   experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(
                                        experiment_output_dir,
                                        expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(
        experiment_output_dir,
        '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features', 'feature_info', 'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(
        kwargs_dict={'feature_info': {
            'index_col': 0
        }})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(
        join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info(
            'Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
        feats_filename, _ = splitext(feats_filename)

        # Write out files
        writer.write_experiment_output(
            feats_dir,
            processed_container,
            include_experiment_id=False,
            dataframe_names=['features_processed'],
            new_names_dict={'features_processed': feats_filename},
            file_format=file_format)

    if (output_file.lower().endswith('.csv')
            or output_file.lower().endswith('.xlsx')):

        output_dir = dirname(output_file)
        _, filename = split(output_file)
        filename, _ = splitext(filename)

    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(
        output_dir,
        processed_container,
        include_experiment_id=False,
        dataframe_names=['predictions_with_metadata'],
        new_names_dict={'predictions_with_metadata': filename},
        file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        # save the excluded responses to disk
        logger.info('Saving excluded responses to {}'.format(
            join(output_dir, '{}_excluded_responses.csv'
                 ''.format(filename))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={
                                           'excluded':
                                           '{}_excluded_responses'
                                           ''.format(filename)
                                       },
                                       file_format=file_format)
Example #30
def check_subgroup_outputs(output_dir,
                           experiment_id,
                           subgroups,
                           file_format='csv'):
    """
    Check to make sure that the subgroup outputs
    look okay. Raise an AssertionError if they do not.

    Parameters
    ----------
    output_dir : str
        Path to the ``output`` sub-directory of a test's experiment output directory.
    experiment_id : str
        The experiment ID.
    subgroups : list of str
        List of column names that contain grouping
        information.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """
    train_preprocessed_file = join(
        output_dir, '{}_train_metadata.{}'.format(experiment_id, file_format))
    train_preprocessed = DataReader.read_from_file(train_preprocessed_file,
                                                   index_col=0)

    test_preprocessed_file = join(
        output_dir, '{}_test_metadata.{}'.format(experiment_id, file_format))
    test_preprocessed = DataReader.read_from_file(test_preprocessed_file,
                                                  index_col=0)
    for group in subgroups:
        ok_(group in train_preprocessed.columns)
        ok_(group in test_preprocessed.columns)

    # check that the total sum of N per category matches the total N
    # in data composition and the total N categories matches what is
    # in overall data composition
    file_data_composition_all = join(
        output_dir, '{}_data_composition.{}'.format(experiment_id,
                                                    file_format))
    df_data_composition_all = DataReader.read_from_file(
        file_data_composition_all)
    for group in subgroups:
        file_composition_by_group = join(
            output_dir,
            '{}_data_composition_by_{}.{}'.format(experiment_id, group,
                                                  file_format))
        composition_by_group = DataReader.read_from_file(
            file_composition_by_group)
        for partition in ['Training', 'Evaluation']:
            partition_info = df_data_composition_all.loc[
                df_data_composition_all['partition'] == partition]

            summation = sum(composition_by_group['{} set'
                                                 ''.format(partition)])
            ok_(summation == partition_info.iloc[0]['responses'])

            length = len(composition_by_group.loc[
                composition_by_group['{} set'
                                     ''.format(partition)] != 0])
            ok_(length == partition_info.iloc[0][group])
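
# A minimal usage sketch (not part of the original snippet); the path,
# experiment ID, and subgroup names below are hypothetical:
#
#     check_subgroup_outputs('test_outputs/my_experiment/output',
#                            'my_experiment',
#                            ['L1', 'gender'])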
    def test_locate_files_wrong_type(self):

        paths = {'file1.csv', 'file2.xlsx'}
        config_dir = 'output'
        DataReader.locate_files(paths, config_dir)