def check_scaled_coefficients(source, experiment_id, file_format='csv'):
    """
    Check that the predictions generated using scaled
    coefficients match the scaled scores. Raises an
    AssertionError if they do not.

    Parameters
    ----------
    source : str
        Path to the source directory on disk.
    experiment_id : str
        The experiment ID.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """
    preprocessed_test_file = join('test_outputs',
                                  source,
                                  'output',
                                  '{}_test_preprocessed_features.{}'.format(experiment_id,
                                                                            file_format))
    scaled_coefficients_file = join('test_outputs',
                                    source,
                                    'output',
                                    '{}_coefficients_scaled.{}'.format(experiment_id,
                                                                       file_format))
    predictions_file = join('test_outputs',
                            source,
                            'output',
                            '{}_pred_processed.{}'.format(experiment_id,
                                                          file_format))

    postprocessing_params_file = join('test_outputs',
                                      source,
                                      'output',
                                      '{}_postprocessing_params.{}'.format(experiment_id,
                                                                           file_format))

    postproc_params = DataReader.read_from_file(postprocessing_params_file).loc[0]
    df_preprocessed_test_data = DataReader.read_from_file(preprocessed_test_file)
    df_old_predictions = DataReader.read_from_file(predictions_file)
    df_old_predictions = df_old_predictions[['spkitemid', 'sc1', 'scale']]

    # create fake skll objects with new coefficients
    df_coef = DataReader.read_from_file(scaled_coefficients_file)
    learner = Modeler.create_fake_skll_learner(df_coef)
    modeler = Modeler.load_from_learner(learner)

    # generate new predictions and rename the prediction column to 'scale'
    df_new_predictions = modeler.predict(df_preprocessed_test_data,
                                         postproc_params['trim_min'],
                                         postproc_params['trim_max'])
    df_new_predictions.rename(columns={'raw': 'scale'}, inplace=True)

    # check that new predictions match the scaled old predictions
    assert_frame_equal(df_new_predictions.sort_index(axis=1),
                       df_old_predictions.sort_index(axis=1),
                       check_exact=False,
                       check_less_precise=True)
Ejemplo n.º 2
0
def check_scaled_coefficients(source, experiment_id, file_format='csv'):
    """
    Check that the predictions generated using scaled
    coefficients match the scaled scores. Raises an
    AssertionError if they do not.

    Parameters
    ----------
    source : str
        Path to the source directory on disk.
    experiment_id : str
        The experiment ID.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """
    preprocessed_test_file = join(
        'test_outputs', source, 'output',
        '{}_test_preprocessed_features.{}'.format(experiment_id, file_format))
    scaled_coefficients_file = join(
        'test_outputs', source, 'output',
        '{}_coefficients_scaled.{}'.format(experiment_id, file_format))
    predictions_file = join(
        'test_outputs', source, 'output',
        '{}_pred_processed.{}'.format(experiment_id, file_format))

    postprocessing_params_file = join(
        'test_outputs', source, 'output',
        '{}_postprocessing_params.{}'.format(experiment_id, file_format))

    postproc_params = DataReader.read_from_file(
        postprocessing_params_file).loc[0]
    df_preprocessed_test_data = DataReader.read_from_file(
        preprocessed_test_file)
    df_old_predictions = DataReader.read_from_file(predictions_file)
    df_old_predictions = df_old_predictions[['spkitemid', 'sc1', 'scale']]

    # create fake skll objects with new coefficients
    df_coef = DataReader.read_from_file(scaled_coefficients_file)
    learner = Modeler.create_fake_skll_learner(df_coef)
    modeler = Modeler.load_from_learner(learner)

    # generate new predictions and rename the prediction column to 'scale'
    df_new_predictions = modeler.predict(df_preprocessed_test_data,
                                         postproc_params['trim_min'],
                                         postproc_params['trim_max'])
    df_new_predictions.rename(columns={'raw': 'scale'}, inplace=True)

    # check that new predictions match the scaled old predictions
    assert_frame_equal(df_new_predictions.sort_index(axis=1),
                       df_old_predictions.sort_index(axis=1),
                       check_exact=False,
                       check_less_precise=True)
Ejemplo n.º 3
0
def run_experiment(config_file_or_obj, output_dir):
    """
    Run RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names, file_paths_org) = configuration.get_names_and_paths(
        ['train_file', 'test_file', 'features', 'feature_subset_file'],
        ['train', 'test', 'feature_specs', 'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [
            file_paths_org[idx] for idx, path in enumerate(file_paths)
            if path is None
        ]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {
        'train': configuration.get_default_converter(),
        'test': configuration.get_default_converter()
    }

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {
        'train_excluded': 'train_excluded_responses',
        'test_excluded': 'test_excluded_responses',
        'train_length': 'train_response_lengths',
        'train_flagged': 'train_responses_with_excluded_flags',
        'test_flagged': 'test_responses_with_excluded_flags'
    }

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(
        csvdir,
        processed_container, [
            'train_features', 'test_features', 'train_metadata',
            'test_metadata', 'train_other_columns', 'test_other_columns',
            'train_preprocessed_features', 'test_preprocessed_features',
            'train_excluded', 'test_excluded', 'train_length',
            'test_human_scores', 'train_flagged', 'test_flagged'
        ],
        rename_dict,
        file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config, processed_container, csvdir, figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(
        selected_features)]
    selected_feature_dataset_dict = {
        'name': 'selected_feature_info',
        'frame': df_selected_feature_info
    }

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(
        csvdir,
        features_data_container,
        dataframe_names=['selected_feature_info'],
        new_names_dict={'selected_feature_info': 'feature'},
        file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration[
        'predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config, pred_data_container) = modeler.predict_train_and_test(
        train_for_prediction, test_for_prediction, processed_config)

    # Write out files
    writer.write_experiment_output(
        csvdir,
        pred_data_container,
        new_names_dict={'pred_test': 'pred_processed'},
        file_format=file_format)

    original_coef_file = join(
        csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                            file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the output directory for saving files.
    feats_file (optional): str
        Path to the output file for saving preprocessed feature values.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                   context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'
                                ''.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains outher required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    # model_files = glob.glob(join(experiment_output_dir, '*.model'))
    # if not model_files:
    #     raise FileNotFoundError('The directory {} does not contain any rsmtool models. '
    #                             ''.format(experiment_output_dir))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(experiment_output_dir,
                           '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features',
                  'feature_info',
                  'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
        feats_filename, _ = splitext(feats_filename)

        # Write out files
        writer.write_experiment_output(feats_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['features_processed'],
                                       new_names_dict={'features_processed':
                                                       feats_filename},
                                       file_format=file_format)

    if (output_file.lower().endswith('.csv') or
            output_file.lower().endswith('.xlsx')):

        output_dir = dirname(output_file)
        _, filename = split(output_file)
        filename, _ = splitext(filename)

    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(output_dir,
                                   processed_container,
                                   include_experiment_id=False,
                                   dataframe_names=['predictions_with_metadata'],
                                   new_names_dict={'predictions_with_metadata':
                                                   filename},
                                   file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        # save the predictions to disk
        logger.info('Saving excluded responses to {}'.format(join(output_dir,
                                                                  '{}_excluded_responses.csv'
                                                                  ''.format(filename))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={'excluded':
                                                       '{}_excluded_responses'
                                                       ''.format(filename)},
                                       file_format=file_format)
Ejemplo n.º 5
0
 def test_get_feature_names_is_none(self):
     modeler = Modeler()
     eq_(modeler.get_feature_names(), None)
Ejemplo n.º 6
0
 def test_get_intercept_is_none(self):
     modeler = Modeler()
     eq_(modeler.get_intercept(), None)
Ejemplo n.º 7
0
 def test_get_coefficients_is_none(self):
     modeler = Modeler()
     eq_(modeler.get_coefficients(), None)
Ejemplo n.º 8
0
    def setUp(self):

        series = pd.Series([34, .34, 1.2], index=['const', 'A', 'B'])
        coef = Modeler.ols_coefficients_to_dataframe(series)
        learner = Modeler.create_fake_skll_learner(coef)
        self.modeler = Modeler.load_from_learner(learner)
 def test_get_feature_names_is_none(self):
     modeler = Modeler()
     eq_(modeler.get_feature_names(), None)
 def test_get_intercept_is_none(self):
     modeler = Modeler()
     eq_(modeler.get_intercept(), None)
 def test_get_coefficients_is_none(self):
     modeler = Modeler()
     eq_(modeler.get_coefficients(), None)
    def setUp(self):

        series = pd.Series([34, .34, 1.2], index=['const', 'A', 'B'])
        coef = Modeler.ols_coefficients_to_dataframe(series)
        learner = Modeler.create_fake_skll_learner(coef)
        self.modeler = Modeler.load_from_learner(learner)
Ejemplo n.º 13
0
def run_experiment(config_file_or_obj,
                   output_dir):
    """
    Run RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx] for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)
Ejemplo n.º 14
0
def compute_and_save_predictions(config_file_or_obj,
                                 output_file,
                                 feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the output directory for saving files.
    feats_file (optional): str
        Path to the output file for saving preprocessed feature values.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(
        config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'],
                                             configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError(
                'The directory {} does not contain '
                'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError(
            'The directory {} does not contain any rsmtool models.'
            ''.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError(
            '{} does not contain a model for the experiment "{}". '
            'The following experiments are contained in this '
            'directory: {}'.format(experiment_output_dir, experiment_id,
                                   experiment_ids))

    # check that the directory contains outher required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(
                                        experiment_output_dir,
                                        expected_file_name))

    # model_files = glob.glob(join(experiment_output_dir, '*.model'))
    # if not model_files:
    #     raise FileNotFoundError('The directory {} does not contain any rsmtool models. '
    #                             ''.format(experiment_output_dir))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(
        experiment_output_dir,
        '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features', 'feature_info', 'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(
        kwargs_dict={'feature_info': {
            'index_col': 0
        }})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(
        join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info(
            'Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
        feats_filename, _ = splitext(feats_filename)

        # Write out files
        writer.write_experiment_output(
            feats_dir,
            processed_container,
            include_experiment_id=False,
            dataframe_names=['features_processed'],
            new_names_dict={'features_processed': feats_filename},
            file_format=file_format)

    if (output_file.lower().endswith('.csv')
            or output_file.lower().endswith('.xlsx')):

        output_dir = dirname(output_file)
        _, filename = split(output_file)
        filename, _ = splitext(filename)

    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(
        output_dir,
        processed_container,
        include_experiment_id=False,
        dataframe_names=['predictions_with_metadata'],
        new_names_dict={'predictions_with_metadata': filename},
        file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        # save the predictions to disk
        logger.info('Saving excluded responses to {}'.format(
            join(output_dir, '{}_excluded_responses.csv'
                 ''.format(filename))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={
                                           'excluded':
                                           '{}_excluded_responses'
                                           ''.format(filename)
                                       },
                                       file_format=file_format)