Example #1
0
def test_run_experiment_lr_with_object():
    # basic experiment with a LinearRegression model

    source = 'lr-object'
    experiment_id = 'lr_object'

    config_file = join(rsmtool_test_dir, 'data', 'experiments', source,
                       '{}.json'.format(experiment_id))

    config_dict = {
        "train_file": "../../files/train.csv",
        "id_column": "ID",
        "use_scaled_predictions": True,
        "test_label_column": "score",
        "train_label_column": "score",
        "test_file": "../../files/test.csv",
        "trim_max": 6,
        "features": "features.csv",
        "trim_min": 1,
        "model": "LinearRegression",
        "experiment_id": "lr_object",
        "description": "Using all features with an LinearRegression model."
    }

    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict(config_dict)
    config_obj = config_parser.normalize_validate_and_process_config()
    config_obj = config_file

    do_run_experiment(source, experiment_id, config_obj)
    output_dir = join('test_outputs', source, 'output')
    expected_output_dir = join(rsmtool_test_dir, 'data', 'experiments', source,
                               'output')
    html_report = join('test_outputs', source, 'report',
                       '{}_report.html'.format(experiment_id))

    csv_files = glob(join(output_dir, '*.csv'))
    for csv_file in csv_files:
        csv_filename = basename(csv_file)
        expected_csv_file = join(expected_output_dir, csv_filename)

        if exists(expected_csv_file):
            yield check_file_output, csv_file, expected_csv_file

    yield check_generated_output, csv_files, experiment_id, 'rsmtool'
    yield check_scaled_coefficients, source, experiment_id
    yield check_report, html_report
    def test_validate_config_too_few_experiment_names(self):
        data = {
            'summary_id': 'summary',
            'experiment_dirs': ["dir1", "dir2", "dir3"],
            'experiment_names': ['exp1', 'exp2']
        }

        _ = ConfigurationParser.validate_config(data, context='rsmsummarize')
def test_run_experiment_lr_with_object():
    # basic experiment with a LinearRegression model

    source = 'lr-object'
    experiment_id = 'lr_object'

    config_file = join(rsmtool_test_dir,
                       'data',
                       'experiments',
                       source,
                       '{}.json'.format(experiment_id))

    config_dict = {"train_file": "../../files/train.csv",
                   "id_column": "ID",
                   "use_scaled_predictions": True,
                   "test_label_column": "score",
                   "train_label_column": "score",
                   "test_file": "../../files/test.csv",
                   "trim_max": 6,
                   "features": "features.csv",
                   "trim_min": 1,
                   "model": "LinearRegression",
                   "experiment_id": "lr_object",
                   "description": "Using all features with an LinearRegression model."}

    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict(config_dict)
    config_obj = config_parser.normalize_validate_and_process_config()
    config_obj = config_file

    do_run_experiment(source, experiment_id, config_obj)
    output_dir = join('test_outputs', source, 'output')
    expected_output_dir = join(rsmtool_test_dir, 'data', 'experiments', source, 'output')
    html_report = join('test_outputs', source, 'report', '{}_report.html'.format(experiment_id))

    csv_files = glob(join(output_dir, '*.csv'))
    for csv_file in csv_files:
        csv_filename = basename(csv_file)
        expected_csv_file = join(expected_output_dir, csv_filename)

        if exists(expected_csv_file):
            yield check_file_output, csv_file, expected_csv_file

    yield check_generated_output, csv_files, experiment_id, 'rsmtool'
    yield check_scaled_coefficients, source, experiment_id
    yield check_report, html_report
    def test_validate_config_experiment_id_1(self):
        data = {
            'experiment_id': 'test experiment',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression'
        }

        _ = ConfigurationParser.validate_config(data)
    def test_validate_config_experiment_id_2(self):
        data = {
            'experiment_id': 'test experiment',
            'predictions_file': 'data/foo',
            'system_score_column': 'h1',
            'trim_min': 1,
            'trim_max': 5
        }

        _ = ConfigurationParser.validate_config(data, context='rsmeval')
    def test_validate_config_experiment_id_4(self):
        data = {
            'comparison_id': 'old vs new',
            'experiment_id_old': 'old experiment',
            'experiment_dir_old': 'data/old',
            'experiment_id_new': 'new_experiment',
            'experiment_dir_new': 'data/new'
        }

        _ = ConfigurationParser.validate_config(data, context='rsmcompare')
    def test_validate_config_min_responses_but_no_candidate(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'min_responses_per_candidate': 5
        }

        _ = ConfigurationParser.validate_config(data)
    def test_invalid_skll_objective(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'LinearSVR',
            'skll_objective': 'squared_error'
        }

        _ = ConfigurationParser.validate_config(data)
    def test_builtin_model_for_expected_scores(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'NNLR',
            'predict_expected_scores': 'true'
        }

        _ = ConfigurationParser.validate_config(data)
    def test_validate_config_unknown_fields(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'LinearRegression',
            'output': 'foobar'
        }

        _ = ConfigurationParser.validate_config(data)
    def test_validate_config_experiment_id_9(self):
        data = {
            'summary_id':
            'this_is_a_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_long_id',
            'experiment_dirs': []
        }

        _ = ConfigurationParser.validate_config(data, context='rsmsummarize')
    def test_validate_config_too_few_subgroup_keys(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'subgroups': ['L1', 'L2'],
            'min_n_per_group': {
                "L1": 100
            }
        }

        _ = ConfigurationParser.validate_config(data)
    def test_process_fields_rsmsummarize(self):
        data = {
            'summary_id': 'summary',
            'experiment_dirs': 'home/dir1, home/dir2, home/dir3',
            'experiment_names': 'exp1, exp2, exp3'
        }

        newdata = ConfigurationParser.process_config(data)

        assert_array_equal(newdata['experiment_dirs'],
                           ['home/dir1', 'home/dir2', 'home/dir3'])
        assert_array_equal(newdata['experiment_names'],
                           ['exp1', 'exp2', 'exp3'])
    def test_validate_config_min_n_without_subgroups(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'min_n_per_group': {
                "L1": 100,
                "L2": 50
            }
        }

        _ = ConfigurationParser.validate_config(data)
Example #15
0
def test_run_experiment_lr_summary_with_object():

    # basic rsmsummarize experiment comparing several rsmtool experiments
    source = 'lr-self-summary-object'

    config_file = join(rsmtool_test_dir, 'data', 'experiments', source,
                       'rsmsummarize.json')

    config_dict = {
        "summary_id": "model_comparison",
        "experiment_dirs": ["lr-subgroups", "lr-subgroups", "lr-subgroups"],
        "description": "Comparison of rsmtool experiment with itself."
    }

    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict(config_dict)
    config_obj = config_parser.normalize_validate_and_process_config(
        context='rsmsummarize')
    config_obj = config_file

    do_run_summary(source, config_obj)

    html_report = join('test_outputs', source, 'report',
                       'model_comparison_report.html')

    output_dir = join('test_outputs', source, 'output')
    expected_output_dir = join(rsmtool_test_dir, 'data', 'experiments', source,
                               'output')

    csv_files = glob(join(output_dir, '*.csv'))
    for csv_file in csv_files:
        csv_filename = basename(csv_file)
        expected_csv_file = join(expected_output_dir, csv_filename)

        if exists(expected_csv_file):
            yield check_file_output, csv_file, expected_csv_file

    yield check_report, html_report
    def test_process_fields_with_integer(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'empWt',
            'use_scaled_predictions': 'True',
            'feature_prefix': '1gram, 2gram',
            'subgroups': 'native language, GPA_range',
            'exclude_zero_scores': 1
        }

        _ = ConfigurationParser.process_config(data)
    def test_validate_config_unspecified_fields(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression'
        }

        newdata = ConfigurationParser.validate_config(data)
        assert_equal(newdata['id_column'], 'spkitemid')
        assert_equal(newdata['use_scaled_predictions'], False)
        assert_equal(newdata['select_transformations'], False)
        assert_array_equal(newdata['general_sections'], ['all'])
        assert_equal(newdata['description'], '')
    def test_validate_config_warning_feature_file_and_transformations(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'select_transformations': True,
            'features': 'some_file.csv'
        }

        with warnings.catch_warnings(record=True) as warning_list:
            _ = ConfigurationParser.validate_config(data)
            eq_(len(warning_list), 1)
            ok_(issubclass(warning_list[0].category, UserWarning))
    def test_validate_config_numeric_subgroup_threshold(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'subgroups': ['L2', 'L1'],
            'min_n_per_group': 100
        }

        newdata = ConfigurationParser.validate_config(data)
        eq_(type(newdata['min_n_per_group']), dict)
        assert_equal(newdata['min_n_per_group']['L1'], 100)
        assert_equal(newdata['min_n_per_group']['L2'], 100)
    def test_validate_config_warning_feature_list_and_transformations(self):
        # this should no show warnings
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'model': 'LinearRegression',
            'select_transformations': True,
            'features': ['feature1', 'feature2']
        }

        with warnings.catch_warnings(record=True) as warning_list:
            _ = ConfigurationParser.validate_config(data)
            eq_(len(warning_list), 0)
def test_run_experiment_lr_summary_with_object():

    # basic rsmsummarize experiment comparing several rsmtool experiments
    source = 'lr-self-summary-object'

    config_file = join(rsmtool_test_dir,
                       'data',
                       'experiments',
                       source,
                       'rsmsummarize.json')

    config_dict = {"summary_id": "model_comparison",
                   "experiment_dirs": ["lr-subgroups", "lr-subgroups", "lr-subgroups"],
                   "description": "Comparison of rsmtool experiment with itself."}

    config_parser = ConfigurationParser()
    config_parser.load_config_from_dict(config_dict)
    config_obj = config_parser.normalize_validate_and_process_config(context='rsmsummarize')
    config_obj = config_file

    do_run_summary(source, config_obj)

    html_report = join('test_outputs', source, 'report', 'model_comparison_report.html')

    output_dir = join('test_outputs', source, 'output')
    expected_output_dir = join(rsmtool_test_dir, 'data', 'experiments', source, 'output')

    csv_files = glob(join(output_dir, '*.csv'))
    for csv_file in csv_files:
        csv_filename = basename(csv_file)
        expected_csv_file = join(expected_output_dir, csv_filename)

        if exists(expected_csv_file):
            yield check_file_output, csv_file, expected_csv_file

    yield check_report, html_report
    def test_validate_config_experiment_id_5(self):
        data = {
            'experiment_id':
            'this_is_a_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_long_id',
            'train_file':
            'data/rsmtool_smTrain.csv',
            'test_file':
            'data/rsmtool_smEval.csv',
            'model':
            'LinearRegression'
        }

        _ = ConfigurationParser.validate_config(data)
    def test_process_fields(self):
        data = {
            'experiment_id': 'experiment_1',
            'train_file': 'data/rsmtool_smTrain.csv',
            'test_file': 'data/rsmtool_smEval.csv',
            'description': 'Test',
            'model': 'empWt',
            'use_scaled_predictions': 'True',
            'subgroups': 'native language, GPA_range',
            'exclude_zero_scores': 'false'
        }

        newdata = ConfigurationParser.process_config(data)
        assert_array_equal(newdata['subgroups'],
                           ['native language', 'GPA_range'])
        eq_(type(newdata['use_scaled_predictions']), bool)
        eq_(newdata['use_scaled_predictions'], True)
        eq_(newdata['exclude_zero_scores'], False)
    def test_validate_config_experiment_id_7(self):
        data = {
            'comparison_id':
            'this_is_a_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_really_really_really_really_'
            'really_really_really_long_id',
            'experiment_id_old':
            'old_experiment',
            'experiment_dir_old':
            'data/old',
            'experiment_id_new':
            'new_experiment',
            'experiment_dir_new':
            'data/new'
        }

        _ = ConfigurationParser.validate_config(data, context='rsmcompare')
    def test_validate_config_missing_fields(self):
        data = {'experiment_id': 'test'}

        _ = ConfigurationParser.validate_config(data)
    def test_validate_config_experiment_id_8(self):
        data = {'summary_id': 'model summary', 'experiment_dirs': []}

        _ = ConfigurationParser.validate_config(data, context='rsmsummarize')
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the output directory for saving files.
    feats_file (optional): str
        Path to the output file for saving preprocessed feature values.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                   context='rsmpredict')

        # get the directory where the config file lives
        configpath = dirname(config_file_or_obj)

    else:

        config = config_file_or_obj
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()

    # get the experiment ID
    experiment_id = config['experiment_id']

    # Get output format
    file_format = config.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(experiment_id)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = DataReader.locate_files(config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'
                                ''.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains outher required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    # model_files = glob.glob(join(experiment_output_dir, '*.model'))
    # if not model_files:
    #     raise FileNotFoundError('The directory {} does not contain any rsmtool models. '
    #                             ''.format(experiment_output_dir))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))

    post_processing = join(experiment_output_dir,
                           '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features',
                  'feature_info',
                  'postprocessing_params']

    converters = {'input_features': config.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    config['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
        feats_filename, _ = splitext(feats_filename)

        # Write out files
        writer.write_experiment_output(feats_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['features_processed'],
                                       new_names_dict={'features_processed':
                                                       feats_filename},
                                       file_format=file_format)

    if (output_file.lower().endswith('.csv') or
            output_file.lower().endswith('.xlsx')):

        output_dir = dirname(output_file)
        _, filename = split(output_file)
        filename, _ = splitext(filename)

    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions.')

    # Write out files
    writer.write_experiment_output(output_dir,
                                   processed_container,
                                   include_experiment_id=False,
                                   dataframe_names=['predictions_with_metadata'],
                                   new_names_dict={'predictions_with_metadata':
                                                   filename},
                                   file_format=file_format)

    # save excluded responses to disk
    if not processed_container.excluded.empty:

        # save the predictions to disk
        logger.info('Saving excluded responses to {}'.format(join(output_dir,
                                                                  '{}_excluded_responses.csv'
                                                                  ''.format(filename))))

        # Write out files
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={'excluded':
                                                       '{}_excluded_responses'
                                                       ''.format(filename)},
                                       file_format=file_format)
Example #28
0
def run_summary(config_file_or_obj, output_dir):
    """
    Run rsmsummarize experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmsummarize')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the list of the experiment dirs
    experiment_dirs = configuration['experiment_dirs']

    # Get experiment names if any
    experiment_names = configuration.get('experiment_names')
    experiment_names = experiment_names if experiment_names else [None] * len(experiment_dirs)
    dirs_with_names = zip(experiment_dirs, experiment_names)

    # check the experiment dirs and assemble the list of csvdir and jsons
    all_experiments = []
    for (experiment_dir, experiment_name) in dirs_with_names:
        experiments = check_experiment_dir(experiment_dir,
                                           experiment_name,
                                           configpath)
        all_experiments.extend(experiments)

    # get the subgroups if any
    # Note: at the moment no comparison are reported for subgroups.
    # this option is added to the code to make it easier to add
    # subgroup comparisons in future versions
    subgroups = configuration.get('subgroups')

    general_report_sections = configuration['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = configuration['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = configuration['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = configuration['section_order']

    # Initialize reporter
    reporter = Reporter()

    # check all sections values and order and get the
    # ordered list of notebook files
    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmsummarize')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation')
    reporter.create_summary_report(configuration,
                                   all_experiments,
                                   csvdir)
class TestConfigurationParser:

    def setUp(self):
        self.parser = ConfigurationParser()

    def test_normalize_config(self):
        data = {'expID': 'experiment_1',
                'train': 'data/rsmtool_smTrain.csv',
                'LRmodel': 'empWt',
                'feature': 'feature/feature_list.json',
                'description': 'A sample model with 9 features '
                               'trained using average score and tested using r1.',
                'test': 'data/rsmtool_smEval.csv',
                'train.lab': 'sc1',
                'crossvalidate': 'yes',
                'test.lab': 'r1',
                'scale': 'scale'}

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)

            # Add data to `ConfigurationParser` object
            self.parser.load_config_from_dict(data)

            newdata = self.parser.normalize_config()
            ok_('experiment_id' in newdata.keys())
            assert_equal(newdata['experiment_id'], 'experiment_1')
            assert_equal(newdata['use_scaled_predictions'], True)

        # test for non-standard scaling value
        data = {'expID': 'experiment_1',
                'train': 'data/rsmtool_smTrain.csv',
                'LRmodel': 'LinearRegression',
                'scale': 'Yes'}
        with warnings.catch_warnings():

            # Add data to `ConfigurationParser` object
            self.parser._config = data

            warnings.filterwarnings('ignore', category=DeprecationWarning)
            assert_raises(ValueError, self.parser.normalize_config)

        # test when no scaling is specified
        data = {'expID': 'experiment_1',
                'train': 'data/rsmtool_smTrain.csv',
                'LRmodel': 'LinearRegression',
                'feature': 'feature/feature_list.json',
                'description': 'A sample model with 9 features '
                               'trained using average score and tested using r1.',
                'test': 'data/rsmtool_smEval.csv',
                'train.lab': 'sc1',
                'crossvalidate': 'yes',
                'test.lab': 'r1'}

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)

            # Add data to `ConfigurationParser` object
            self.parser._config = data
            newdata = self.parser.normalize_config()
            ok_('use_scaled_predictions' not in newdata.keys())

    @raises(ValueError)
    def test_validate_config_missing_fields(self):
        data = {'expID': 'test'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_min_responses_but_no_candidate(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression',
                'min_responses_per_candidate': 5}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    def test_validate_config_unspecified_fields(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression'}

        # Add data to `ConfigurationParser` object
        self.parser._config = data
        newdata = self.parser.validate_config()
        assert_equal(newdata['id_column'], 'spkitemid')
        assert_equal(newdata['use_scaled_predictions'], False)
        assert_equal(newdata['select_transformations'], False)
        assert_equal(newdata['general_sections'], 'all')
        assert_equal(newdata['description'], '')

    @raises(ValueError)
    def test_validate_config_unknown_fields(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'LinearRegression',
                'output': 'foobar'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_experiment_id_1(self):
        data = {'experiment_id': 'test experiment',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_experiment_id_2(self):
        data = {'experiment_id': 'test experiment',
                'predictions_file': 'data/foo',
                'system_score_column': 'h1',
                'trim_min': 1,
                'trim_max': 5}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmeval')

    @raises(ValueError)
    def test_validate_config_experiment_id_3(self):
        data = {'comparison_id': 'old vs new',
                'experiment_id_old': 'old_experiment',
                'experiment_dir_old': 'data/old',
                'experiment_id_new': 'new_experiment',
                'experiment_dir_new': 'data/new'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmcompare')

    @raises(ValueError)
    def test_validate_config_experiment_id_4(self):
        data = {'comparison_id': 'old vs new',
                'experiment_id_old': 'old experiment',
                'experiment_dir_old': 'data/old',
                'experiment_id_new': 'new_experiment',
                'experiment_dir_new': 'data/new'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmcompare')

    @raises(ValueError)
    def test_validate_config_experiment_id_5(self):
        data = {'experiment_id': 'this_is_a_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_long_id',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'model': 'LinearRegression'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_validate_config_experiment_id_6(self):
        data = {'experiment_id': 'this_is_a_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_long_id',
                'predictions_file': 'data/foo',
                'system_score_column': 'h1',
                'trim_min': 1,
                'trim_max': 5}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmeval')

    @raises(ValueError)
    def test_validate_config_experiment_id_7(self):
        data = {'comparison_id': 'this_is_a_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_long_id',
                'experiment_id_old': 'old_experiment',
                'experiment_dir_old': 'data/old',
                'experiment_id_new': 'new_experiment',
                'experiment_dir_new': 'data/new'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmcompare')

    @raises(ValueError)
    def test_validate_config_experiment_id_8(self):
        data = {'summary_id': 'model summary',
                'experiment_dirs': []}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmsummarize')

    @raises(ValueError)
    def test_validate_config_experiment_id_9(self):
        data = {'summary_id': 'this_is_a_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_really_really_really_really_'
                'really_really_really_long_id',
                'experiment_dirs': []}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config(context='rsmsummarize')

    def test_process_fields(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'empWt',
                'use_scaled_predictions': 'True',
                'feature_prefix': '1gram, 2gram',
                'subgroups': 'native language, GPA_range',
                'exclude_zero_scores': 'false'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        newdata = self.parser.validate_config(inplace=False)

        # Add data to `ConfigurationParser` object
        self.parser._config = newdata
        newdata = self.parser.process_config(inplace=False)
        assert_array_equal(newdata['feature_prefix'], ['1gram', '2gram'])
        assert_array_equal(newdata['subgroups'], ['native language', 'GPA_range'])
        eq_(type(newdata['use_scaled_predictions']), bool)
        eq_(newdata['use_scaled_predictions'], True)
        eq_(newdata['exclude_zero_scores'], False)

    @raises(ValueError)
    def test_process_fields_with_non_boolean(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'empWt',
                'use_scaled_predictions': 'True',
                'feature_prefix': '1gram, 2gram',
                'subgroups': 'native language, GPA_range',
                'exclude_zero_scores': 'Yes'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        newdata = self.parser.validate_config()
        # Add data to `ConfigurationParser` object
        self.parser._config = newdata
        newdata = self.parser.process_config()

    @raises(ValueError)
    def test_process_fields_with_integer(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'empWt',
                'use_scaled_predictions': 'True',
                'feature_prefix': '1gram, 2gram',
                'subgroups': 'native language, GPA_range',
                'exclude_zero_scores': 1}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        newdata = self.parser.validate_config()
        # Add data to `ConfigurationParser` object
        self.parser._config = newdata
        newdata = self.parser.process_config()

    @raises(ValueError)
    def test_invalid_skll_objective(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'LinearSVR',
                'skll_objective': 'squared_error'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_wrong_skll_model_for_expected_scores(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'LinearSVR',
                'predict_expected_scores': 'true'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    @raises(ValueError)
    def test_builtin_model_for_expected_scores(self):
        data = {'experiment_id': 'experiment_1',
                'train_file': 'data/rsmtool_smTrain.csv',
                'test_file': 'data/rsmtool_smEval.csv',
                'description': 'Test',
                'model': 'NNLR',
                'predict_expected_scores': 'true'}

        # Add data to `ConfigurationParser` object
        self.parser.load_config_from_dict(data)
        self.parser.validate_config()

    def test_get_correct_configparser_cfg(self):
        config_parser = ConfigurationParser.get_configparser('config.cfg')
        assert isinstance(config_parser, CFGConfigurationParser)

    def test_get_correct_configparser_json(self):
        config_parser = ConfigurationParser.get_configparser('config.json')
        assert isinstance(config_parser, JSONConfigurationParser)
Example #30
0
def run_experiment(config_file_or_obj, output_dir):
    """
    Run RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names, file_paths_org) = configuration.get_names_and_paths(
        ['train_file', 'test_file', 'features', 'feature_subset_file'],
        ['train', 'test', 'feature_specs', 'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [
            file_paths_org[idx] for idx, path in enumerate(file_paths)
            if path is None
        ]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {
        'train': configuration.get_default_converter(),
        'test': configuration.get_default_converter()
    }

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {
        'train_excluded': 'train_excluded_responses',
        'test_excluded': 'test_excluded_responses',
        'train_length': 'train_response_lengths',
        'train_flagged': 'train_responses_with_excluded_flags',
        'test_flagged': 'test_responses_with_excluded_flags'
    }

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(
        csvdir,
        processed_container, [
            'train_features', 'test_features', 'train_metadata',
            'test_metadata', 'train_other_columns', 'test_other_columns',
            'train_preprocessed_features', 'test_preprocessed_features',
            'train_excluded', 'test_excluded', 'train_length',
            'test_human_scores', 'train_flagged', 'test_flagged'
        ],
        rename_dict,
        file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config, processed_container, csvdir, figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(
        selected_features)]
    selected_feature_dataset_dict = {
        'name': 'selected_feature_info',
        'frame': df_selected_feature_info
    }

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(
        csvdir,
        features_data_container,
        dataframe_names=['selected_feature_info'],
        new_names_dict={'selected_feature_info': 'feature'},
        file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration[
        'predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config, pred_data_container) = modeler.predict_train_and_test(
        train_for_prediction, test_for_prediction, processed_config)

    # Write out files
    writer.write_experiment_output(
        csvdir,
        pred_data_container,
        new_names_dict={'pred_test': 'pred_processed'},
        file_format=file_format)

    original_coef_file = join(
        csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                            file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
 def setUp(self):
     self.parser = ConfigurationParser()
 def test_init_non_json_file(self):
     with tempfile.NamedTemporaryFile(suffix=".txt") as tempf:
         _ = ConfigurationParser(tempf.name)
 def test_get_correct_configparser_cfg(self):
     config_parser = ConfigurationParser.get_configparser('config.cfg')
     assert isinstance(config_parser, CFGConfigurationParser)
 def test_get_correct_configparser_json(self):
     config_parser = ConfigurationParser.get_configparser('config.json')
     assert isinstance(config_parser, JSONConfigurationParser)
def run_experiment(config_file_or_obj,
                   output_dir):
    """
    Run RSMTool experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved

    # Get absolute paths to output directories
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Make directories, if necessary
    makedirs(csvdir, exist_ok=True)
    makedirs(figdir, exist_ok=True)
    makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read from file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj)

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Get the paths and names for the DataReader

    (file_names,
     file_paths_org) = configuration.get_names_and_paths(['train_file', 'test_file',
                                                          'features',
                                                          'feature_subset_file'],
                                                         ['train', 'test',
                                                          'feature_specs',
                                                          'feature_subset_specs'])

    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # if there are any missing files after trying to locate
    # all expected files, raise an error
    if None in file_paths:
        missing_file_paths = [file_paths_org[idx] for idx, path in enumerate(file_paths)
                              if path is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   ['train_features',
                                    'test_features',
                                    'train_metadata',
                                    'test_metadata',
                                    'train_other_columns',
                                    'test_other_columns',
                                    'train_preprocessed_features',
                                    'test_preprocessed_features',
                                    'train_excluded',
                                    'test_excluded',
                                    'train_length',
                                    'test_human_scores',
                                    'train_flagged',
                                    'test_flagged'],
                                   rename_dict,
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(processed_container,
                                                                              processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    # Initialize modeler
    modeler = Modeler()

    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Identify the features used by the model
    selected_features = modeler.get_feature_names()

    # Add selected features to processed configuration
    processed_config['selected_features'] = selected_features

    # Write out files
    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    features_data_container = processed_container.copy()

    # Get selected feature info, and write out to file
    df_feature_info = features_data_container.feature_info.copy()
    df_selected_feature_info = df_feature_info[df_feature_info['feature'].isin(selected_features)]
    selected_feature_dataset_dict = {'name': 'selected_feature_info',
                                     'frame': df_selected_feature_info}

    features_data_container.add_dataset(selected_feature_dataset_dict,
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')

    (train_analyzed_config,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Use only selected features for predictions
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[columns_for_prediction]

    logged_str = 'Generating training and test set predictions'
    logged_str += ' (expected scores).' if configuration['predict_expected_scores'] else '.'
    logger.info(logged_str)
    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    original_coef_file = join(csvdir, '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                                  file_format))

    # If coefficients file exists, then generate
    # scaled coefficients and save to file
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:

            # Scale coefficients, and return DataContainer w/ scaled coefficients
            scaled_data_container = modeler.scale_coefficients(pred_config)

            # Write out files to disk
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)

        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # Add processed data_container frames to pred_data_container
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(new_pred_data_container,
                                                                      pred_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)
    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir)
 def test_init_directory_instead_of_file(self):
     with tempfile.TemporaryDirectory() as tempd:
         _ = ConfigurationParser(tempd)
 def test_init_nonexistent_file(self):
     non_existent_file = "/x/y.json"
     _ = ConfigurationParser(non_existent_file)
Example #38
0
def run_comparison(config_file_or_obj, output_dir):
    """
    Run an ``rsmcompare`` experiment using the given configuration
    file and generate the report in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """

    logger = logging.getLogger(__name__)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmcompare')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:
        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # get the information about the "old" experiment
    experiment_id_old = configuration['experiment_id_old']
    experiment_dir_old = DataReader.locate_files(
        configuration['experiment_dir_old'], configpath)
    if not experiment_dir_old:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(
                                    configuration['experiment_dir_old']))
    else:
        csvdir_old = normpath(join(experiment_dir_old, 'output'))
        figdir_old = normpath(join(experiment_dir_old, 'figure'))
        if not exists(csvdir_old) or not exists(figdir_old):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_old))

    check_experiment_id(experiment_dir_old, experiment_id_old)

    # get the information about the "new" experiment
    experiment_id_new = configuration['experiment_id_new']
    experiment_dir_new = DataReader.locate_files(
        configuration['experiment_dir_new'], configpath)
    if not experiment_dir_new:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(
                                    configuration['experiment_dir_new']))
    else:
        csvdir_new = normpath(join(experiment_dir_new, 'output'))
        figdir_new = normpath(join(experiment_dir_new, 'figure'))
        if not exists(csvdir_new) or not exists(figdir_new):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir_new))

    check_experiment_id(experiment_dir_new, experiment_id_new)

    # are there specific general report sections we want to include?
    general_report_sections = configuration['general_sections']

    # what about the special or custom sections?
    special_report_sections = configuration['special_sections']

    custom_report_section_paths = configuration['custom_sections']

    # if custom report sections exist, locate sections; otherwise, create empty list
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(
            custom_report_section_paths, configpath)
    else:
        custom_report_sections = []

    # get the section order
    section_order = configuration['section_order']

    # get the subgroups if any
    subgroups = configuration.get('subgroups')

    # Initialize reporter
    reporter = Reporter()

    chosen_notebook_files = reporter.get_ordered_notebook_files(
        general_report_sections,
        special_report_sections,
        custom_report_sections,
        section_order,
        subgroups,
        model_type=None,
        context='rsmcompare')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation.')
    reporter.create_comparison_report(configuration, csvdir_old, figdir_old,
                                      csvdir_new, figdir_new, output_dir)
def run_summary(config_file_or_obj, output_dir):
    """
    Run rsmsummarize experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output' and the 'figure' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                          context='rsmsummarize')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            configpath = os.getcwd()

    # get the list of the experiment dirs
    experiment_dirs = configuration['experiment_dirs']

    # check the experiment dirs and assemble the list of csvdir and jsons
    all_experiments = []
    for experiment_dir in experiment_dirs:
        experiments = check_experiment_dir(experiment_dir, configpath)
        all_experiments.extend(experiments)

    # get the subgroups if any
    # Note: at the moment no comparison are reported for subgroups.
    # this option is added to the code to make it easier to add
    # subgroup comparisons in future versions
    subgroups = configuration.get('subgroups')

    general_report_sections = configuration['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = configuration['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = configuration['custom_sections']
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = configuration['section_order']

    # Initialize reporter
    reporter = Reporter()

    # check all sections values and order and get the
    # ordered list of notebook files
    chosen_notebook_files = reporter.get_ordered_notebook_files(general_report_sections,
                                                                special_report_sections,
                                                                custom_report_sections,
                                                                section_order,
                                                                subgroups,
                                                                model_type=None,
                                                                context='rsmsummarize')

    # add chosen notebook files to configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    # now generate the comparison report
    logger.info('Starting report generation')
    reporter.create_summary_report(configuration,
                                   all_experiments,
                                   csvdir)